This commit is contained in:
overcuriousity
2025-09-12 23:54:06 +02:00
parent 2d62191aa0
commit 03c52abd1b
8 changed files with 819 additions and 223 deletions

View File

@@ -163,11 +163,11 @@ class BaseProvider(ABC):
                       target_indicator: str = "",
                       max_retries: int = 3) -> Optional[requests.Response]:
         """
-        Make a rate-limited HTTP request with forensic logging and retry logic.
-        Now supports cancellation via stop_event from scanner.
+        Make a rate-limited HTTP request with aggressive stop signal handling.
+        Terminates immediately when stop is requested, including during retries.
         """
         # Check for cancellation before starting
-        if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
+        if self._is_stop_requested():
             print(f"Request cancelled before start: {url}")
             return None
@@ -188,21 +188,24 @@ class BaseProvider(ABC):
             response.headers = cached_data['headers']
             return response
 
-        for attempt in range(max_retries + 1):
-            # Check for cancellation before each attempt
-            if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
+        # Determine effective max_retries based on stop signal
+        effective_max_retries = 0 if self._is_stop_requested() else max_retries
+        last_exception = None
+
+        for attempt in range(effective_max_retries + 1):
+            # AGGRESSIVE: Check for cancellation before each attempt
+            if self._is_stop_requested():
                 print(f"Request cancelled during attempt {attempt + 1}: {url}")
                 return None
 
-            # Apply rate limiting (but reduce wait time if cancellation is requested)
-            if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
-                break
-            self.rate_limiter.wait_if_needed()
+            # Apply rate limiting with cancellation awareness
+            if not self._wait_with_cancellation_check():
+                print(f"Request cancelled during rate limiting: {url}")
+                return None
 
-            # Check again after rate limiting
-            if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
-                print(f"Request cancelled after rate limiting: {url}")
+            # AGGRESSIVE: Final check before making HTTP request
+            if self._is_stop_requested():
+                print(f"Request cancelled before HTTP call: {url}")
                 return None
 
             start_time = time.time()
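Review note: the reason `self.rate_limiter.wait_if_needed()` has to go is that a monolithic sleep inside the limiter cannot observe the stop signal until the full interval has elapsed. A minimal sketch of the chunked-wait idea behind `_wait_with_cancellation_check` introduced later in this commit (the `limiter` argument with `min_interval`/`last_request_time` attributes mirrors the `rate_limiter` used above; names here are illustrative):

import threading
import time

def wait_if_needed_cancellable(limiter, stop_event: threading.Event,
                               check_interval: float = 0.05) -> bool:
    """Chunked replacement for a blocking wait_if_needed().

    Sleeps in short slices so a stop event set mid-wait is noticed within
    ~check_interval seconds. Returns False if cancelled, True otherwise.
    """
    remaining = limiter.min_interval - (time.time() - limiter.last_request_time)
    deadline = time.time() + max(0.0, remaining)
    while time.time() < deadline:
        if stop_event.is_set():
            return False  # caller should abort the request
        time.sleep(min(check_interval, max(0.0, deadline - time.time())))
    limiter.last_request_time = time.time()
    return True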
@@ -219,10 +222,11 @@ class BaseProvider(ABC):
                 print(f"Making {method} request to: {url} (attempt {attempt + 1})")
 
-                # Use shorter timeout if termination is requested
+                # AGGRESSIVE: Use much shorter timeout if termination is requested
                 request_timeout = self.timeout
-                if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
-                    request_timeout = min(5, self.timeout)  # Max 5 seconds if termination requested
+                if self._is_stop_requested():
+                    request_timeout = 2  # Max 2 seconds if termination requested
+                    print(f"Stop requested - using short timeout: {request_timeout}s")
 
                 # Make request
                 if method.upper() == "GET":
@@ -271,28 +275,28 @@ class BaseProvider(ABC):
                 error = str(e)
                 self.failed_requests += 1
                 print(f"Request failed (attempt {attempt + 1}): {error}")
+                last_exception = e
 
-                # Check for cancellation before retrying
-                if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
-                    print(f"Request cancelled, not retrying: {url}")
+                # AGGRESSIVE: Immediately abort retries if stop requested
+                if self._is_stop_requested():
+                    print(f"Stop requested - aborting retries for: {url}")
                     break
 
-                # Check if we should retry
-                if attempt < max_retries and self._should_retry(e):
-                    backoff_time = (2 ** attempt) * 1  # Exponential backoff: 1s, 2s, 4s
-                    print(f"Retrying in {backoff_time} seconds...")
+                # Check if we should retry (but only if stop not requested)
+                if attempt < effective_max_retries and self._should_retry(e):
+                    # Use a longer, more respectful backoff for 429 errors
+                    if isinstance(e, requests.exceptions.HTTPError) and e.response is not None and e.response.status_code == 429:
+                        # Start with a 10-second backoff and increase exponentially
+                        backoff_time = 10 * (2 ** attempt)
+                        print(f"Rate limit hit. Retrying in {backoff_time} seconds...")
+                    else:
+                        backoff_time = min(1.0, (2 ** attempt) * 0.5)  # Shorter backoff for other errors
+                        print(f"Retrying in {backoff_time} seconds...")
 
-                    # Shorter backoff if termination is requested
-                    if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
-                        backoff_time = min(0.5, backoff_time)
-
-                    # Sleep with cancellation checking
-                    sleep_start = time.time()
-                    while time.time() - sleep_start < backoff_time:
-                        if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
-                            print(f"Request cancelled during backoff: {url}")
-                            return None
-                        time.sleep(0.1)  # Check every 100ms
+                    # AGGRESSIVE: Much shorter backoff and more frequent checking
+                    if not self._sleep_with_cancellation_check(backoff_time):
+                        print(f"Stop requested during backoff - aborting: {url}")
+                        return None
 
                     continue
                 else:
                     break
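Review note: the backoff schedule above is easy to sanity-check in isolation. A small sketch reproducing the delays (pure arithmetic, no network; constants taken from the diff):

def backoff_schedule(max_retries=3, rate_limited=False):
    """Delays mirroring the retry logic above: 10s, 20s, 40s for HTTP 429,
    otherwise capped at 1.0s for other transient errors."""
    delays = []
    for attempt in range(max_retries):
        if rate_limited:
            delays.append(10 * (2 ** attempt))             # 10, 20, 40
        else:
            delays.append(min(1.0, (2 ** attempt) * 0.5))  # 0.5, 1.0, 1.0
    return delays

assert backoff_schedule(rate_limited=True) == [10, 20, 40]
assert backoff_schedule(rate_limited=False) == [0.5, 1.0, 1.0]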
@@ -301,6 +305,7 @@ class BaseProvider(ABC):
                 error = f"Unexpected error: {str(e)}"
                 self.failed_requests += 1
                 print(f"Unexpected error: {error}")
+                last_exception = e
                 break
 
         # All attempts failed - log and return None
@@ -316,8 +321,57 @@ class BaseProvider(ABC):
             target_indicator=target_indicator
         )
 
+        if error and last_exception:
+            raise last_exception
+
         return None
 
+    def _is_stop_requested(self) -> bool:
+        """
+        Enhanced stop signal checking that handles both local and Redis-based signals.
+        """
+        if hasattr(self, '_stop_event') and self._stop_event and self._stop_event.is_set():
+            return True
+        return False
+
+    def _wait_with_cancellation_check(self) -> bool:
+        """
+        Wait for rate limiting while aggressively checking for cancellation.
+        Returns False if cancelled during wait.
+        """
+        current_time = time.time()
+        time_since_last = current_time - self.rate_limiter.last_request_time
+
+        if time_since_last < self.rate_limiter.min_interval:
+            sleep_time = self.rate_limiter.min_interval - time_since_last
+            if not self._sleep_with_cancellation_check(sleep_time):
+                return False
+
+        self.rate_limiter.last_request_time = time.time()
+        return True
+
+    def _sleep_with_cancellation_check(self, sleep_time: float) -> bool:
+        """
+        Sleep for the specified time while aggressively checking for cancellation.
+
+        Args:
+            sleep_time: Time to sleep in seconds
+
+        Returns:
+            bool: True if sleep completed, False if cancelled
+        """
+        sleep_start = time.time()
+        check_interval = 0.05  # Check every 50ms for aggressive responsiveness
+
+        while time.time() - sleep_start < sleep_time:
+            if self._is_stop_requested():
+                return False
+            remaining_time = sleep_time - (time.time() - sleep_start)
+            time.sleep(min(check_interval, remaining_time))
+
+        return True
+
     def set_stop_event(self, stop_event: threading.Event) -> None:
         """
         Set the stop event for this provider to enable cancellation.
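Review note: the three helpers add up to a simple contract: the scanner hands each provider a threading.Event via set_stop_event(), and every blocking wait degrades into ~50 ms slices that poll it. A self-contained sketch of that contract (DummyProvider is illustrative, not part of the codebase):

import threading
import time

class DummyProvider:
    """Stands in for BaseProvider to show the stop-event wiring."""
    def __init__(self):
        self._stop_event = None

    def set_stop_event(self, stop_event):
        self._stop_event = stop_event

    def _is_stop_requested(self):
        return bool(self._stop_event and self._stop_event.is_set())

    def _sleep_with_cancellation_check(self, sleep_time):
        start, check_interval = time.time(), 0.05
        while time.time() - start < sleep_time:
            if self._is_stop_requested():
                return False
            remaining = sleep_time - (time.time() - start)
            time.sleep(min(check_interval, max(0.0, remaining)))
        return True

stop = threading.Event()
provider = DummyProvider()
provider.set_stop_event(stop)

worker = threading.Thread(
    target=provider._sleep_with_cancellation_check, args=(30.0,))
worker.start()
time.sleep(0.2)
stop.set()                # observed within ~50 ms, not after 30 s
worker.join(timeout=1.0)
assert not worker.is_alive()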
@@ -337,15 +391,15 @@ class BaseProvider(ABC):
         Returns:
             True if the request should be retried
         """
-        # Retry on connection errors, timeouts, and 5xx server errors
+        # Retry on connection errors and timeouts
         if isinstance(exception, (requests.exceptions.ConnectionError,
                                   requests.exceptions.Timeout)):
             return True
 
         if isinstance(exception, requests.exceptions.HTTPError):
             if hasattr(exception, 'response') and exception.response is not None:
-                # Retry on server errors (5xx) but not client errors (4xx)
-                return exception.response.status_code >= 500
+                # Retry on server errors (5xx) AND on rate-limiting errors (429)
+                return exception.response.status_code >= 500 or exception.response.status_code == 429
         return False
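Review note: one subtlety justifies the `is not None` comparisons above: requests.Response.__bool__ returns response.ok, so any 4xx/5xx response is falsy. A guard written as `and exception.response:` would therefore silently skip the 429 and 5xx branches. Quick illustration:

import requests

resp = requests.models.Response()
resp.status_code = 429

assert resp is not None  # the check we actually want
assert not bool(resp)    # truthiness would wrongly reject a 429 response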

View File

@@ -157,8 +157,7 @@ class CrtShProvider(BaseProvider):
     def query_domain(self, domain: str) -> List[Tuple[str, str, RelationshipType, float, Dict[str, Any]]]:
         """
         Query crt.sh for certificates containing the domain.
-        Creates domain-to-domain relationships and stores certificate data as metadata.
-        Now supports early termination via stop_event.
+        Enhanced with more frequent stop signal checking for reliable termination.
         """
         if not _is_valid_domain(domain):
             return []
@@ -197,10 +196,10 @@ class CrtShProvider(BaseProvider):
         domain_certificates = {}
         all_discovered_domains = set()
 
-        # Process certificates and group by domain (with cancellation checks)
+        # Process certificates with enhanced cancellation checking
         for i, cert_data in enumerate(certificates):
-            # Check for cancellation every 10 certificates
-            if i % 10 == 0 and self._stop_event and self._stop_event.is_set():
+            # Check for cancellation every 5 certificates instead of 10 for faster response
+            if i % 5 == 0 and self._stop_event and self._stop_event.is_set():
                 print(f"CrtSh processing cancelled at certificate {i} for domain: {domain}")
                 break
@@ -209,6 +208,11 @@ class CrtShProvider(BaseProvider):
             # Add all domains from this certificate to our tracking
             for cert_domain in cert_domains:
+                # Additional stop check during domain processing
+                if i % 20 == 0 and self._stop_event and self._stop_event.is_set():
+                    print(f"CrtSh domain processing cancelled for domain: {domain}")
+                    break
+
                 if not _is_valid_domain(cert_domain):
                     continue
@@ -226,13 +230,13 @@ class CrtShProvider(BaseProvider):
             print(f"CrtSh query cancelled before relationship creation for domain: {domain}")
             return []
 
-        # Create relationships from query domain to ALL discovered domains
-        for discovered_domain in all_discovered_domains:
+        # Create relationships from query domain to ALL discovered domains with stop checking
+        for i, discovered_domain in enumerate(all_discovered_domains):
             if discovered_domain == domain:
                 continue  # Skip self-relationships
 
-            # Check for cancellation during relationship creation
-            if self._stop_event and self._stop_event.is_set():
+            # Check for cancellation every 10 relationships
+            if i % 10 == 0 and self._stop_event and self._stop_event.is_set():
                 print(f"CrtSh relationship creation cancelled for domain: {domain}")
                 break
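Review note: the i % 5 / i % 10 / i % 20 cadence is a latency-versus-throughput dial: Event.is_set() is cheap, but skipping most checks keeps tight loops fast. The same pattern extracted into a reusable helper (hypothetical, not in this codebase):

import threading
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")

def cancellable(items: Iterable[T], stop_event: threading.Event,
                every: int = 10) -> Iterator[T]:
    """Yield items until stop_event is set, polling every `every` iterations."""
    for i, item in enumerate(items):
        if i % every == 0 and stop_event.is_set():
            break
        yield item

# Usage mirroring the relationship loop above:
#   for domain in cancellable(all_discovered_domains, stop_event, every=10):
#       ...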
@@ -284,8 +288,6 @@ class CrtShProvider(BaseProvider):
         except json.JSONDecodeError as e:
             self.logger.logger.error(f"Failed to parse JSON response from crt.sh: {e}")
-        except Exception as e:
-            self.logger.logger.error(f"Error querying crt.sh for {domain}: {e}")
 
         return relationships

View File

@@ -134,8 +134,6 @@ class ShodanProvider(BaseProvider):
         except json.JSONDecodeError as e:
             self.logger.logger.error(f"Failed to parse JSON response from Shodan: {e}")
-        except Exception as e:
-            self.logger.logger.error(f"Error querying Shodan for domain {domain}: {e}")
 
         return relationships
@@ -231,9 +229,7 @@ class ShodanProvider(BaseProvider):
         except json.JSONDecodeError as e:
             self.logger.logger.error(f"Failed to parse JSON response from Shodan: {e}")
-        except Exception as e:
-            self.logger.logger.error(f"Error querying Shodan for IP {ip}: {e}")
 
         return relationships
 
     def search_by_organization(self, org_name: str) -> List[Dict[str, Any]]:
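Review note: dropping the catch-all except blocks in both providers pairs with the new `raise last_exception` in BaseProvider._make_request: transport errors now propagate to the scanner instead of being logged and swallowed per provider. A condensed sketch of the resulting flow, assuming that intent (URL and helper names are illustrative):

import json

def query_domain_sketch(provider, domain):
    relationships = []
    try:
        response = provider._make_request(f"https://crt.sh/?q={domain}&output=json")
        if response is not None:
            relationships = provider._process_certificates(response.json(), domain)
    except json.JSONDecodeError as e:
        # Only parsing errors are handled locally now.
        provider.logger.logger.error(f"Failed to parse JSON response from crt.sh: {e}")
    # requests.RequestException propagates to the scanner via `raise last_exception`.
    return relationships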