From 173c3dcf92ef29d25466420d6c6384e0f1bd2597 Mon Sep 17 00:00:00 2001
From: overcuriousity
Date: Wed, 17 Sep 2025 17:10:11 +0200
Subject: [PATCH] clarify progress reporting, deduplicate the crt.sh cache,
 and namespace relationship types by provider

---
 app.py                       | 15 ++++---
 core/graph_manager.py        |  2 +-
 core/scanner.py              | 16 +++++--
 providers/crtsh_provider.py  | 86 +++++++++++++++++++-----------------
 providers/dns_provider.py    |  6 +--
 providers/shodan_provider.py | 12 ++---
 6 files changed, 77 insertions(+), 60 deletions(-)

diff --git a/app.py b/app.py
index fceb61a..d21e77a 100644
--- a/app.py
+++ b/app.py
@@ -463,12 +463,15 @@ def get_providers():
     # Get user-specific scanner
     user_session_id, scanner = get_user_scanner()
 
-    if scanner:
-        # Updated debug print to be consistent with the new progress bar logic
-        completed_tasks = scanner.indicators_completed
-        total_tasks = scanner.total_tasks_ever_enqueued
-        print(f"DEBUG: Task Progress - Completed: {completed_tasks}, Total Enqueued: {total_tasks}")
-    else:
+    if scanner and scanner.status == 'running':
+        status = scanner.get_scan_status()
+        currently_processing = status.get('currently_processing')
+        if currently_processing:
+            provider_name, target_item = currently_processing[0]
+            print(f"DEBUG: RUNNING Task - Provider: {provider_name}, Target: {target_item}")
+
+        print(f"DEBUG: Task Queue Status - In Queue: {status.get('tasks_in_queue', 0)}, Completed: {status.get('tasks_completed', 0)}, Skipped: {status.get('tasks_skipped', 0)}, Rescheduled: {status.get('tasks_rescheduled', 0)}")
+    elif not scanner:
         print("DEBUG: No active scanner session found.")
 
     provider_info = scanner.get_provider_info()
diff --git a/core/graph_manager.py b/core/graph_manager.py
index 0d21d2d..5a55052 100644
--- a/core/graph_manager.py
+++ b/core/graph_manager.py
@@ -41,7 +41,7 @@ class GraphManager:
         self.correlation_index = {}
         # Compile regex for date filtering for efficiency
         self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
-        self.EXCLUDED_KEYS = ['confidence', 'provider', 'timestamp', 'type','cert_validity_period_days','cert_source']
+        self.EXCLUDED_KEYS = ['crtsh_cert_validity_period_days','crtsh_cert_source','crtsh_cert_common_name']
 
     def __getstate__(self):
         """Prepare GraphManager for pickling, excluding compiled regex."""
diff --git a/core/scanner.py b/core/scanner.py
index 55d6b1c..bc5f23a 100644
--- a/core/scanner.py
+++ b/core/scanner.py
@@ -396,7 +396,7 @@ class Scanner:
             if self._is_stop_requested():
                 print(f"Stop requested before processing {target_item}")
                 break
-            self.currently_processing.add(target_item)
+            self.currently_processing.add(task_tuple)
 
             try:
                 self.current_depth = depth
@@ -444,7 +444,7 @@ class Scanner:
                         self.total_tasks_ever_enqueued += 1
             finally:
                 with self.processing_lock:
-                    self.currently_processing.discard(target_item)
+                    self.currently_processing.discard(task_tuple)
 
         if self._is_stop_requested():
             print("Scan terminated due to stop request")
@@ -857,7 +857,11 @@ class Scanner:
                 'graph_statistics': self.graph.get_statistics(),
                 'task_queue_size': self.task_queue.qsize(),
                 'currently_processing_count': currently_processing_count,
-                'currently_processing': currently_processing_list[:5]
+                'currently_processing': currently_processing_list[:5],
+                'tasks_in_queue': self.task_queue.qsize(),
+                'tasks_completed': self.indicators_completed,
+                'tasks_skipped': self.total_tasks_ever_enqueued - self.task_queue.qsize() - self.indicators_completed - self.tasks_re_enqueued,
+                'tasks_rescheduled': self.tasks_re_enqueued,
             }
         except Exception as e:
             print(f"ERROR: Exception in get_scan_status: {e}")
@@ -876,7 +880,11 @@ class Scanner:
                 'graph_statistics': {},
                 'task_queue_size': 0,
                 'currently_processing_count': 0,
-                'currently_processing': []
+                'currently_processing': [],
+                'tasks_in_queue': 0,
+                'tasks_completed': 0,
+                'tasks_skipped': 0,
+                'tasks_rescheduled': 0,
             }
 
     def _initialize_provider_states(self, target: str) -> None:
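
The four counters added to get_scan_status() are derived from existing
Scanner state rather than tracked separately. A quick sanity check of the
tasks_skipped arithmetic, as a minimal standalone sketch (the variable names
mirror the patched fields; the numbers are invented):

# Stand-in values for Scanner state; names mirror the patched fields.
total_enqueued = 100  # total_tasks_ever_enqueued
in_queue = 10         # task_queue.qsize()
completed = 75        # indicators_completed
rescheduled = 5       # tasks_re_enqueued

# Anything enqueued that is neither waiting, finished, nor requeued
# must have been skipped (e.g. dropped as a duplicate).
skipped = total_enqueued - in_queue - completed - rescheduled
assert skipped == 10

Note that 'tasks_in_queue' duplicates the value of the existing
'task_queue_size' key, presumably so existing consumers keep working.
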
diff --git a/providers/crtsh_provider.py b/providers/crtsh_provider.py
index 6adde15..bb712a6 100644
--- a/providers/crtsh_provider.py
+++ b/providers/crtsh_provider.py
@@ -93,7 +93,7 @@ class CrtShProvider(BaseProvider):
 
     def query_domain(self, domain: str) -> ProviderResult:
         """
-        Query crt.sh for certificates containing the domain with caching support.
+        Query crt.sh for certificates containing the domain with efficient, deduplicated caching.
 
         Args:
             domain: Domain to investigate
@@ -110,35 +110,45 @@ class CrtShProvider(BaseProvider):
         cache_file = self._get_cache_file_path(domain)
         cache_status = self._get_cache_status(cache_file)
 
-        processed_certificates = []
         result = ProviderResult()
 
        try:
             if cache_status == "fresh":
                 result = self._load_from_cache(cache_file)
-                self.logger.logger.info(f"Using cached crt.sh data for {domain}")
+                self.logger.logger.info(f"Using fresh cached crt.sh data for {domain}")
             else: # "stale" or "not_found"
-                raw_certificates = self._query_crtsh_api(domain)
+                # Query the API for the latest certificates
+                new_raw_certs = self._query_crtsh_api(domain)
 
                 if self._stop_event and self._stop_event.is_set():
                     return ProviderResult()
 
-                # Process raw data into the application's expected format
-                current_processed_certs = [self._extract_certificate_metadata(cert) for cert in raw_certificates]
-
+                # Combine with old data if cache is stale
                 if cache_status == "stale":
-                    # Load existing and append new processed certs
-                    existing_result = self._load_from_cache(cache_file)
-                    result = self._merge_results(existing_result, current_processed_certs, domain)
-                    self.logger.logger.info(f"Refreshed and merged cache for {domain}")
+                    old_raw_certs = self._load_raw_data_from_cache(cache_file)
+                    combined_certs = old_raw_certs + new_raw_certs
+
+                    # Deduplicate the combined list
+                    seen_ids = set()
+                    unique_certs = []
+                    for cert in combined_certs:
+                        cert_id = cert.get('id')
+                        if cert_id not in seen_ids:
+                            unique_certs.append(cert)
+                            seen_ids.add(cert_id)
+
+                    raw_certificates_to_process = unique_certs
+                    self.logger.logger.info(f"Refreshed and merged cache for {domain}. Total unique certs: {len(raw_certificates_to_process)}")
                 else: # "not_found"
-                    # Create new result from processed certs
-                    result = self._process_certificates_to_result(domain, raw_certificates)
-                    self.logger.logger.info(f"Created fresh result for {domain} ({result.get_relationship_count()} relationships)")
+                    raw_certificates_to_process = new_raw_certs
+
+                # Process the clean, deduplicated list of certificates
+                result = self._process_certificates_to_result(domain, raw_certificates_to_process)
+                self.logger.logger.info(f"Created fresh result for {domain} ({result.get_relationship_count()} relationships)")
 
-                # Save the result to cache
-                self._save_result_to_cache(cache_file, result, domain)
+                # Save the new result and the raw data to the cache
+                self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain)
 
         except requests.exceptions.RequestException as e:
             self.logger.logger.error(f"API query failed for {domain}: {e}")
 
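
The stale-cache branch above replaces the old _merge_results strategy with a
merge of raw crt.sh records followed by deduplication on the certificate id.
The same idea as a standalone sketch (dedupe_certs is a hypothetical helper,
not part of the patch; because the cached list comes first, the first-seen
record wins on id collisions):

from typing import Any, Dict, List

def dedupe_certs(old: List[Dict[str, Any]],
                 new: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Combine cached and freshly fetched crt.sh records, keeping the
    first record seen for each certificate id."""
    seen_ids = set()
    unique = []
    for cert in old + new:
        cert_id = cert.get('id')
        # Caveat: records lacking 'id' all collapse onto the None key;
        # crt.sh output normally includes an id for every row.
        if cert_id not in seen_ids:
            unique.append(cert)
            seen_ids.add(cert_id)
    return unique

# id 2 appears in both lists and is kept only once.
print(dedupe_certs([{'id': 1}, {'id': 2}], [{'id': 2}, {'id': 3}]))
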
@@ -200,12 +210,22 @@ class CrtShProvider(BaseProvider):
             self.logger.logger.error(f"Failed to load cached certificates from {cache_file_path}: {e}")
             return ProviderResult()
 
-    def _save_result_to_cache(self, cache_file_path: Path, result: ProviderResult, domain: str) -> None:
-        """Save processed crt.sh result to a cache file."""
+    def _load_raw_data_from_cache(self, cache_file_path: Path) -> List[Dict[str, Any]]:
+        """Load only the raw certificate data from a cache file."""
+        try:
+            with open(cache_file_path, 'r') as f:
+                cache_content = json.load(f)
+            return cache_content.get("raw_certificates", [])
+        except (json.JSONDecodeError, FileNotFoundError):
+            return []
+
+    def _save_result_to_cache(self, cache_file_path: Path, result: ProviderResult, raw_certificates: List[Dict[str, Any]], domain: str) -> None:
+        """Save processed crt.sh result and raw data to a cache file."""
         try:
             cache_data = {
                 "domain": domain,
                 "last_upstream_query": datetime.now(timezone.utc).isoformat(),
+                "raw_certificates": raw_certificates, # Store the raw data for deduplication
                 "relationships": [
                     {
                         "source_node": rel.source_node,
@@ -234,25 +254,6 @@ class CrtShProvider(BaseProvider):
         except Exception as e:
             self.logger.logger.warning(f"Failed to save cache file for {domain}: {e}")
 
-    def _merge_results(self, existing_result: ProviderResult, new_certificates: List[Dict[str, Any]], domain: str) -> ProviderResult:
-        """Merge new certificate data with existing cached result."""
-        # Create a fresh result from the new certificates
-        new_result = self._process_certificates_to_result(domain, new_certificates)
-
-        # Simple merge strategy: combine all relationships and attributes
-        # In practice, you might want more sophisticated deduplication
-        merged_result = ProviderResult()
-
-        # Add existing relationships and attributes
-        merged_result.relationships.extend(existing_result.relationships)
-        merged_result.attributes.extend(existing_result.attributes)
-
-        # Add new relationships and attributes
-        merged_result.relationships.extend(new_result.relationships)
-        merged_result.attributes.extend(new_result.attributes)
-
-        return merged_result
-
     def _query_crtsh_api(self, domain: str) -> List[Dict[str, Any]]:
         """Query crt.sh API for raw certificate data."""
         url = f"{self.base_url}?q={quote(domain)}&output=json"
@@ -261,7 +262,12 @@ class CrtShProvider(BaseProvider):
         if not response or response.status_code != 200:
             raise requests.exceptions.RequestException(f"crt.sh API returned status {response.status_code if response else 'None'}")
 
-        certificates = response.json()
+        try:
+            certificates = response.json()
+        except json.JSONDecodeError:
+            self.logger.logger.error(f"crt.sh returned invalid JSON for {domain}")
+            return []
+
         if not certificates:
             return []
 
@@ -324,7 +330,7 @@ class CrtShProvider(BaseProvider):
                 result.add_relationship(
                     source_node=domain,
                     target_node=discovered_domain,
-                    relationship_type='san_certificate',
+                    relationship_type='crtsh_san_certificate',
                     provider=self.name,
                     confidence=confidence,
                     raw_data={'relationship_type': 'certificate_discovery'}
@@ -333,7 +339,7 @@ class CrtShProvider(BaseProvider):
                 self.log_relationship_discovery(
                     source_node=domain,
                     target_node=discovered_domain,
-                    relationship_type='san_certificate',
+                    relationship_type='crtsh_san_certificate',
                     confidence_score=confidence,
                     raw_data={'relationship_type': 'certificate_discovery'},
                     discovery_method="certificate_transparency_analysis"
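
Taken together, _save_result_to_cache and _load_raw_data_from_cache imply a
cache file shaped roughly like this (a sketch assuming only the fields
visible in the hunks above; the full serializer likely writes additional
relationship and attribute fields):

# Hypothetical, abridged cache file contents for example.com.
cache_data = {
    "domain": "example.com",
    "last_upstream_query": "2025-09-17T15:10:11+00:00",
    # Raw crt.sh rows, kept so a stale refresh can deduplicate by id.
    "raw_certificates": [
        {"id": 1234567890, "common_name": "example.com"},
    ],
    "relationships": [
        {
            "source_node": "example.com",
            "target_node": "www.example.com",
            # ...remaining serialized relationship fields...
        },
    ],
}
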
diff --git a/providers/dns_provider.py b/providers/dns_provider.py
index 6a855d9..2d03a9b 100644
--- a/providers/dns_provider.py
+++ b/providers/dns_provider.py
@@ -107,7 +107,7 @@ class DNSProvider(BaseProvider):
                 result.add_relationship(
                     source_node=ip,
                     target_node=hostname,
-                    relationship_type='ptr_record',
+                    relationship_type='dns_ptr_record',
                     provider=self.name,
                     confidence=0.8,
                     raw_data={
@@ -125,7 +125,7 @@ class DNSProvider(BaseProvider):
                 self.log_relationship_discovery(
                     source_node=ip,
                     target_node=hostname,
-                    relationship_type='ptr_record',
+                    relationship_type='dns_ptr_record',
                     confidence_score=0.8,
                     raw_data={
                         'query_type': 'PTR',
@@ -202,7 +202,7 @@ class DNSProvider(BaseProvider):
                     'value': target,
                     'ttl': response.ttl
                 }
-                relationship_type = f"{record_type.lower()}_record"
+                relationship_type = f"dns_{record_type.lower()}_record"
                 confidence = 0.8
 
                 # Add relationship
diff --git a/providers/shodan_provider.py b/providers/shodan_provider.py
index 00d0cea..0e7004c 100644
--- a/providers/shodan_provider.py
+++ b/providers/shodan_provider.py
@@ -222,7 +222,7 @@ class ShodanProvider(BaseProvider):
                     result.add_relationship(
                         source_node=ip,
                         target_node=hostname,
-                        relationship_type='a_record',
+                        relationship_type='shodan_a_record',
                         provider=self.name,
                         confidence=0.8,
                         raw_data=data
@@ -230,7 +230,7 @@ class ShodanProvider(BaseProvider):
                     self.log_relationship_discovery(
                         source_node=ip,
                         target_node=hostname,
-                        relationship_type='a_record',
+                        relationship_type='shodan_a_record',
                         confidence_score=0.8,
                         raw_data=data,
                         discovery_method="shodan_host_lookup"
@@ -240,7 +240,7 @@ class ShodanProvider(BaseProvider):
                 result.add_relationship(
                     source_node=ip,
                     target_node=asn_name,
-                    relationship_type='asn_membership',
+                    relationship_type='shodan_asn_membership',
                     provider=self.name,
                     confidence=0.7,
                     raw_data=data
@@ -248,7 +248,7 @@ class ShodanProvider(BaseProvider):
                 self.log_relationship_discovery(
                     source_node=ip,
                     target_node=asn_name,
-                    relationship_type='asn_membership',
+                    relationship_type='shodan_asn_membership',
                     confidence_score=0.7,
                     raw_data=data,
                     discovery_method="shodan_asn_lookup"
@@ -257,9 +257,9 @@ class ShodanProvider(BaseProvider):
             for port in value:
                 result.add_attribute(
                     target_node=ip,
-                    name='open_port',
+                    name='shodan_open_port',
                     value=port,
-                    attr_type='network_info',
+                    attr_type='shodan_network_info',
                     provider=self.name,
                     confidence=0.9
                 )
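
The renames across all three providers follow one pattern: relationship and
attribute types are prefixed with the provider that produced them (crtsh_,
dns_, shodan_). One payoff is that consumers can filter edges by provenance
from the type string alone; a minimal sketch (the tuple-based edge list is
an illustration, not the project's graph API):

# Hypothetical edge list using the namespaced relationship types.
edges = [
    ("1.2.3.4", "example.com", "dns_ptr_record"),
    ("1.2.3.4", "example.com", "shodan_a_record"),
    ("example.com", "www.example.com", "crtsh_san_certificate"),
]

def edges_from(provider, edges):
    """Select edges whose relationship type carries the provider prefix."""
    return [e for e in edges if e[2].startswith(provider + "_")]

print(edges_from("shodan", edges))
# [('1.2.3.4', 'example.com', 'shodan_a_record')]
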