This commit is contained in:
overcuriousity
2025-09-12 23:54:06 +02:00
parent 2d62191aa0
commit 03c52abd1b
8 changed files with 819 additions and 223 deletions

View File

@@ -163,11 +163,11 @@ class BaseProvider(ABC):
                       target_indicator: str = "",
                       max_retries: int = 3) -> Optional[requests.Response]:
         """
-        Make a rate-limited HTTP request with forensic logging and retry logic.
-        Now supports cancellation via stop_event from scanner.
+        Make a rate-limited HTTP request with aggressive stop signal handling.
+        Terminates immediately when stop is requested, including during retries.
         """
         # Check for cancellation before starting
-        if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
+        if self._is_stop_requested():
             print(f"Request cancelled before start: {url}")
             return None
@@ -188,21 +188,24 @@ class BaseProvider(ABC):
             response.headers = cached_data['headers']
             return response
 
-        for attempt in range(max_retries + 1):
-            # Check for cancellation before each attempt
-            if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
+        # Determine effective max_retries based on stop signal
+        effective_max_retries = 0 if self._is_stop_requested() else max_retries
+        last_exception = None
+
+        for attempt in range(effective_max_retries + 1):
+            # AGGRESSIVE: Check for cancellation before each attempt
+            if self._is_stop_requested():
                 print(f"Request cancelled during attempt {attempt + 1}: {url}")
                 return None
 
-            # Apply rate limiting (but reduce wait time if cancellation is requested)
-            if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
-                break
-            self.rate_limiter.wait_if_needed()
+            # Apply rate limiting with cancellation awareness
+            if not self._wait_with_cancellation_check():
+                print(f"Request cancelled during rate limiting: {url}")
+                return None
 
-            # Check again after rate limiting
-            if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
-                print(f"Request cancelled after rate limiting: {url}")
+            # AGGRESSIVE: Final check before making HTTP request
+            if self._is_stop_requested():
+                print(f"Request cancelled before HTTP call: {url}")
                 return None
 
             start_time = time.time()
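Review note: the reason `self.rate_limiter.wait_if_needed()` has to go is that a monolithic sleep inside the limiter cannot observe the stop signal until the full interval has elapsed. A minimal sketch of the chunked-wait idea behind `_wait_with_cancellation_check` introduced later in this commit (the `limiter` argument with `min_interval`/`last_request_time` attributes mirrors the `rate_limiter` used above; names here are illustrative):

import threading
import time

def wait_if_needed_cancellable(limiter, stop_event: threading.Event,
                               check_interval: float = 0.05) -> bool:
    """Chunked replacement for a blocking wait_if_needed().

    Sleeps in short slices so a stop event set mid-wait is noticed within
    ~check_interval seconds. Returns False if cancelled, True otherwise.
    """
    remaining = limiter.min_interval - (time.time() - limiter.last_request_time)
    deadline = time.time() + max(0.0, remaining)
    while time.time() < deadline:
        if stop_event.is_set():
            return False  # caller should abort the request
        time.sleep(min(check_interval, max(0.0, deadline - time.time())))
    limiter.last_request_time = time.time()
    return True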
@@ -219,10 +222,11 @@ class BaseProvider(ABC):
                 print(f"Making {method} request to: {url} (attempt {attempt + 1})")
 
-                # Use shorter timeout if termination is requested
+                # AGGRESSIVE: Use much shorter timeout if termination is requested
                 request_timeout = self.timeout
-                if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
-                    request_timeout = min(5, self.timeout)  # Max 5 seconds if termination requested
+                if self._is_stop_requested():
+                    request_timeout = 2  # Max 2 seconds if termination requested
+                    print(f"Stop requested - using short timeout: {request_timeout}s")
 
                 # Make request
                 if method.upper() == "GET":
@@ -271,28 +275,28 @@ class BaseProvider(ABC):
                 error = str(e)
                 self.failed_requests += 1
                 print(f"Request failed (attempt {attempt + 1}): {error}")
+                last_exception = e
 
-                # Check for cancellation before retrying
-                if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
-                    print(f"Request cancelled, not retrying: {url}")
+                # AGGRESSIVE: Immediately abort retries if stop requested
+                if self._is_stop_requested():
+                    print(f"Stop requested - aborting retries for: {url}")
                     break
 
-                # Check if we should retry
-                if attempt < max_retries and self._should_retry(e):
-                    backoff_time = (2 ** attempt) * 1  # Exponential backoff: 1s, 2s, 4s
-                    print(f"Retrying in {backoff_time} seconds...")
+                # Check if we should retry (but only if stop not requested)
+                if attempt < effective_max_retries and self._should_retry(e):
+                    # Use a longer, more respectful backoff for 429 errors
+                    if isinstance(e, requests.exceptions.HTTPError) and e.response is not None and e.response.status_code == 429:
+                        # Start with a 10-second backoff and increase exponentially
+                        backoff_time = 10 * (2 ** attempt)
+                        print(f"Rate limit hit. Retrying in {backoff_time} seconds...")
+                    else:
+                        backoff_time = min(1.0, (2 ** attempt) * 0.5)  # Shorter backoff for other errors
+                        print(f"Retrying in {backoff_time} seconds...")
 
-                    # Shorter backoff if termination is requested
-                    if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
-                        backoff_time = min(0.5, backoff_time)
-
-                    # Sleep with cancellation checking
-                    sleep_start = time.time()
-                    while time.time() - sleep_start < backoff_time:
-                        if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
-                            print(f"Request cancelled during backoff: {url}")
-                            return None
-                        time.sleep(0.1)  # Check every 100ms
+                    # AGGRESSIVE: Much shorter backoff and more frequent checking
+                    if not self._sleep_with_cancellation_check(backoff_time):
+                        print(f"Stop requested during backoff - aborting: {url}")
+                        return None
 
                     continue
                 else:
                     break
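Review note: the backoff schedule above is easy to sanity-check in isolation. A small sketch reproducing the delays (pure arithmetic, no network; constants taken from the diff):

def backoff_schedule(max_retries=3, rate_limited=False):
    """Delays mirroring the retry logic above: 10s, 20s, 40s for HTTP 429,
    otherwise capped at 1.0s for other transient errors."""
    delays = []
    for attempt in range(max_retries):
        if rate_limited:
            delays.append(10 * (2 ** attempt))             # 10, 20, 40
        else:
            delays.append(min(1.0, (2 ** attempt) * 0.5))  # 0.5, 1.0, 1.0
    return delays

assert backoff_schedule(rate_limited=True) == [10, 20, 40]
assert backoff_schedule(rate_limited=False) == [0.5, 1.0, 1.0]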
@@ -301,6 +305,7 @@ class BaseProvider(ABC):
                 error = f"Unexpected error: {str(e)}"
                 self.failed_requests += 1
                 print(f"Unexpected error: {error}")
+                last_exception = e
                 break
 
         # All attempts failed - log and return None
@@ -316,8 +321,57 @@ class BaseProvider(ABC):
             target_indicator=target_indicator
         )
 
+        if error and last_exception:
+            raise last_exception
+
         return None
 
+    def _is_stop_requested(self) -> bool:
+        """
+        Enhanced stop signal checking that handles both local and Redis-based signals.
+        """
+        if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
+            return True
+        return False
+
+    def _wait_with_cancellation_check(self) -> bool:
+        """
+        Wait for rate limiting while aggressively checking for cancellation.
+        Returns False if cancelled during wait.
+        """
+        current_time = time.time()
+        time_since_last = current_time - self.rate_limiter.last_request_time
+
+        if time_since_last < self.rate_limiter.min_interval:
+            sleep_time = self.rate_limiter.min_interval - time_since_last
+            if not self._sleep_with_cancellation_check(sleep_time):
+                return False
+
+        self.rate_limiter.last_request_time = time.time()
+        return True
+
+    def _sleep_with_cancellation_check(self, sleep_time: float) -> bool:
+        """
+        Sleep for the specified time while aggressively checking for cancellation.
+
+        Args:
+            sleep_time: Time to sleep in seconds
+
+        Returns:
+            bool: True if sleep completed, False if cancelled
+        """
+        sleep_start = time.time()
+        check_interval = 0.05  # Check every 50ms for aggressive responsiveness
+
+        while time.time() - sleep_start < sleep_time:
+            if self._is_stop_requested():
+                return False
+            remaining_time = sleep_time - (time.time() - sleep_start)
+            time.sleep(min(check_interval, remaining_time))
+
+        return True
+
     def set_stop_event(self, stop_event: threading.Event) -> None:
         """
         Set the stop event for this provider to enable cancellation.
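Review note: the three helpers add up to a simple contract: the scanner hands each provider a threading.Event via set_stop_event(), and every blocking wait degrades into ~50 ms slices that poll it. A self-contained sketch of that contract (DummyProvider is illustrative, not part of the codebase):

import threading
import time

class DummyProvider:
    """Stands in for BaseProvider to show the stop-event wiring."""
    def __init__(self):
        self._stop_event = None

    def set_stop_event(self, stop_event):
        self._stop_event = stop_event

    def _is_stop_requested(self):
        return bool(self._stop_event and self._stop_event.is_set())

    def _sleep_with_cancellation_check(self, sleep_time):
        start, check_interval = time.time(), 0.05
        while time.time() - start < sleep_time:
            if self._is_stop_requested():
                return False
            remaining = sleep_time - (time.time() - start)
            time.sleep(min(check_interval, max(0.0, remaining)))
        return True

stop = threading.Event()
provider = DummyProvider()
provider.set_stop_event(stop)

worker = threading.Thread(
    target=provider._sleep_with_cancellation_check, args=(30.0,))
worker.start()
time.sleep(0.2)
stop.set()                # observed within ~50 ms, not after 30 s
worker.join(timeout=1.0)
assert not worker.is_alive()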
@@ -337,15 +391,15 @@ class BaseProvider(ABC):
         Returns:
             True if the request should be retried
         """
-        # Retry on connection errors, timeouts, and 5xx server errors
+        # Retry on connection errors and timeouts
         if isinstance(exception, (requests.exceptions.ConnectionError,
                                   requests.exceptions.Timeout)):
             return True
 
         if isinstance(exception, requests.exceptions.HTTPError):
             if hasattr(exception, 'response') and exception.response is not None:
-                # Retry on server errors (5xx) but not client errors (4xx)
-                return exception.response.status_code >= 500
+                # Retry on server errors (5xx) AND on rate-limiting errors (429)
+                return exception.response.status_code >= 500 or exception.response.status_code == 429
         return False
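Review note: one subtlety justifies the `is not None` comparisons above: requests.Response.__bool__ returns response.ok, so any 4xx/5xx response is falsy. A guard written as `and exception.response:` would therefore silently skip the 429 and 5xx branches. Quick illustration:

import requests

resp = requests.models.Response()
resp.status_code = 429

assert resp is not None  # the check we actually want
assert not bool(resp)    # truthiness would wrongly reject a 429 response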

View File

@@ -157,8 +157,7 @@ class CrtShProvider(BaseProvider):
     def query_domain(self, domain: str) -> List[Tuple[str, str, RelationshipType, float, Dict[str, Any]]]:
         """
         Query crt.sh for certificates containing the domain.
-        Creates domain-to-domain relationships and stores certificate data as metadata.
-        Now supports early termination via stop_event.
+        Enhanced with more frequent stop signal checking for reliable termination.
         """
         if not _is_valid_domain(domain):
             return []
@@ -197,10 +196,10 @@ class CrtShProvider(BaseProvider):
         domain_certificates = {}
         all_discovered_domains = set()
 
-        # Process certificates and group by domain (with cancellation checks)
+        # Process certificates with enhanced cancellation checking
         for i, cert_data in enumerate(certificates):
-            # Check for cancellation every 10 certificates
-            if i % 10 == 0 and self._stop_event and self._stop_event.is_set():
+            # Check for cancellation every 5 certificates instead of 10 for faster response
+            if i % 5 == 0 and self._stop_event and self._stop_event.is_set():
                 print(f"CrtSh processing cancelled at certificate {i} for domain: {domain}")
                 break
@@ -209,6 +208,11 @@ class CrtShProvider(BaseProvider):
             # Add all domains from this certificate to our tracking
             for cert_domain in cert_domains:
+                # Additional stop check during domain processing
+                if i % 20 == 0 and self._stop_event and self._stop_event.is_set():
+                    print(f"CrtSh domain processing cancelled for domain: {domain}")
+                    break
+
                 if not _is_valid_domain(cert_domain):
                     continue
@@ -226,13 +230,13 @@ class CrtShProvider(BaseProvider):
             print(f"CrtSh query cancelled before relationship creation for domain: {domain}")
             return []
 
-        # Create relationships from query domain to ALL discovered domains
-        for discovered_domain in all_discovered_domains:
+        # Create relationships from query domain to ALL discovered domains with stop checking
+        for i, discovered_domain in enumerate(all_discovered_domains):
             if discovered_domain == domain:
                 continue  # Skip self-relationships
 
-            # Check for cancellation during relationship creation
-            if self._stop_event and self._stop_event.is_set():
+            # Check for cancellation every 10 relationships
+            if i % 10 == 0 and self._stop_event and self._stop_event.is_set():
                 print(f"CrtSh relationship creation cancelled for domain: {domain}")
                 break
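Review note: the i % 5 / i % 10 / i % 20 cadence is a latency-versus-throughput dial: Event.is_set() is cheap, but skipping most checks keeps tight loops fast. The same pattern extracted into a reusable helper (hypothetical, not in this codebase):

import threading
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")

def cancellable(items: Iterable[T], stop_event: threading.Event,
                every: int = 10) -> Iterator[T]:
    """Yield items until stop_event is set, polling every `every` iterations."""
    for i, item in enumerate(items):
        if i % every == 0 and stop_event.is_set():
            break
        yield item

# Usage mirroring the relationship loop above:
#   for domain in cancellable(all_discovered_domains, stop_event, every=10):
#       ...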
@@ -284,8 +288,6 @@ class CrtShProvider(BaseProvider):
         except json.JSONDecodeError as e:
             self.logger.logger.error(f"Failed to parse JSON response from crt.sh: {e}")
-        except Exception as e:
-            self.logger.logger.error(f"Error querying crt.sh for {domain}: {e}")
 
         return relationships

View File

@@ -134,8 +134,6 @@ class ShodanProvider(BaseProvider):
         except json.JSONDecodeError as e:
             self.logger.logger.error(f"Failed to parse JSON response from Shodan: {e}")
-        except Exception as e:
-            self.logger.logger.error(f"Error querying Shodan for domain {domain}: {e}")
 
         return relationships
@@ -231,9 +229,7 @@ class ShodanProvider(BaseProvider):
         except json.JSONDecodeError as e:
             self.logger.logger.error(f"Failed to parse JSON response from Shodan: {e}")
-        except Exception as e:
-            self.logger.logger.error(f"Error querying Shodan for IP {ip}: {e}")
 
         return relationships
 
     def search_by_organization(self, org_name: str) -> List[Dict[str, Any]]:
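Review note: dropping the catch-all except blocks in both providers pairs with the new `raise last_exception` in BaseProvider._make_request: transport errors now propagate to the scanner instead of being logged and swallowed per provider. A condensed sketch of the resulting flow, assuming that intent (URL and helper names are illustrative):

import json

def query_domain_sketch(provider, domain):
    relationships = []
    try:
        response = provider._make_request(f"https://crt.sh/?q={domain}&output=json")
        if response is not None:
            relationships = provider._process_certificates(response.json(), domain)
    except json.JSONDecodeError as e:
        # Only parsing errors are handled locally now.
        provider.logger.logger.error(f"Failed to parse JSON response from crt.sh: {e}")
    # requests.RequestException propagates to the scanner via `raise last_exception`.
    return relationships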