From f445187025c58aaed33b2267cfebe8de9d63711c Mon Sep 17 00:00:00 2001
From: overcuriousity
Date: Fri, 12 Sep 2025 10:08:03 +0200
Subject: [PATCH] Simplify scanner recursion and add forensic provider state tracking

---
 core/scanner.py             | 605 +++++++++++++++++++++---------------
 providers/crtsh_provider.py |  57 ++--
 2 files changed, 390 insertions(+), 272 deletions(-)

diff --git a/core/scanner.py b/core/scanner.py
index 9178b60..af72dbc 100644
--- a/core/scanner.py
+++ b/core/scanner.py
@@ -1,6 +1,7 @@
 """
 Main scanning orchestrator for DNSRecon.
 Coordinates data gathering from multiple providers and builds the infrastructure graph.
+REFACTORED: Simplified recursion with forensic provider state tracking.
 """
 
 import threading
@@ -8,6 +9,7 @@ import traceback
 from typing import List, Set, Dict, Any, Tuple
 from concurrent.futures import ThreadPoolExecutor, as_completed, CancelledError
 from collections import defaultdict
+from datetime import datetime, timezone
 
 from core.graph_manager import GraphManager, NodeType, RelationshipType
 from core.logger import get_forensic_logger, new_session
@@ -15,7 +17,6 @@ from utils.helpers import _is_valid_ip, _is_valid_domain
 from providers.crtsh_provider import CrtShProvider
 from providers.dns_provider import DNSProvider
 from providers.shodan_provider import ShodanProvider
-from providers.virustotal_provider import VirusTotalProvider
 
 
 class ScanStatus:
@@ -30,7 +31,7 @@ class ScanStatus:
 class Scanner:
     """
     Main scanning orchestrator for DNSRecon passive reconnaissance.
-    Now supports per-session configuration for multi-user isolation.
+    REFACTORED: Simplified recursion with forensic provider state tracking.
     """
 
     def __init__(self, session_config=None):
@@ -62,6 +63,14 @@ class Scanner:
         self.max_workers = self.config.max_concurrent_requests
         self.executor = None
 
+        # Provider eligibility mapping
+        self.provider_eligibility = {
+            'dns': {'domains': True, 'ips': True},
+            'crtsh': {'domains': True, 'ips': False},
+            'shodan': {'domains': True, 'ips': True},
+            'virustotal': {'domains': False, 'ips': False}  # Disabled as requested
+        }
+
         # Initialize providers with session config
         print("Calling _initialize_providers with session config...")
         self._initialize_providers()
@@ -80,22 +89,21 @@ class Scanner:
     def _initialize_providers(self) -> None:
         """Initialize all available providers based on session configuration."""
         self.providers = []
-        print("Initializing providers with session config...")
 
-        # Always add free providers
-        free_providers = [
-            ('crtsh', CrtShProvider),
-            ('dns', DNSProvider)
-        ]
+        # Provider classes mapping
+        provider_classes = {
+            'dns': DNSProvider,
+            'crtsh': CrtShProvider,
+            'shodan': ShodanProvider,
+            # Skip virustotal as requested
+        }
 
-        for provider_name, provider_class in free_providers:
+        for provider_name, provider_class in provider_classes.items():
             if self.config.is_provider_enabled(provider_name):
                 try:
-                    # Pass session config to provider
                     provider = provider_class(session_config=self.config)
                     if provider.is_available():
-                        # Set the stop event for cancellation support
                         provider.set_stop_event(self.stop_event)
                         self.providers.append(provider)
                         print(f"✓ {provider_name.title()} provider initialized successfully for session")
@@ -105,70 +113,38 @@ class Scanner:
                     print(f"✗ Failed to initialize {provider_name.title()} provider: {e}")
                     traceback.print_exc()
 
-        # Add API key-dependent providers
-        api_providers = [
-            ('shodan', ShodanProvider),
-            ('virustotal', VirusTotalProvider)
-        ]
-
-        for provider_name, provider_class in api_providers:
-            if self.config.is_provider_enabled(provider_name):
-                try:
-                    # Pass session config 
to provider - provider = provider_class(session_config=self.config) - if provider.is_available(): - # Set the stop event for cancellation support - provider.set_stop_event(self.stop_event) - self.providers.append(provider) - print(f"✓ {provider_name.title()} provider initialized successfully for session") - else: - print(f"✗ {provider_name.title()} provider is not available (API key required)") - except Exception as e: - print(f"✗ Failed to initialize {provider_name.title()} provider: {e}") - traceback.print_exc() - print(f"Initialized {len(self.providers)} providers for session") def update_session_config(self, new_config) -> None: - """ - Update session configuration and reinitialize providers. - - Args: - new_config: New SessionConfig instance - """ + """Update session configuration and reinitialize providers.""" print("Updating session configuration...") self.config = new_config self.max_workers = self.config.max_concurrent_requests self._initialize_providers() print("Session configuration updated") - def start_scan(self, target_domain: str, max_depth: int = 2, clear_graph: bool = True) -> bool: - """ - Start a new reconnaissance scan. - Forcefully cleans up any previous scan thread before starting. - """ + """Start a new reconnaissance scan with forensic tracking.""" print(f"=== STARTING SCAN IN SCANNER {id(self)} ===") print(f"Initial scanner status: {self.status}") - # If a thread is still alive from a previous scan, we must wait for it to die. + # Clean up previous scan thread if needed if self.scan_thread and self.scan_thread.is_alive(): print("A previous scan thread is still alive. Sending termination signal and waiting...") self.stop_scan() - self.scan_thread.join(10.0) # Wait up to 10 seconds + self.scan_thread.join(10.0) if self.scan_thread.is_alive(): - print("ERROR: The previous scan thread is unresponsive and could not be stopped. Please restart the application.") + print("ERROR: The previous scan thread is unresponsive and could not be stopped.") self.status = ScanStatus.FAILED return False print("Previous scan thread terminated successfully.") - # Reset state for the new scan + # Reset state for new scan self.status = ScanStatus.IDLE - print(f"Scanner state is now clean for a new scan.") + print("Scanner state is now clean for a new scan.") try: - # Check if we have any providers if not hasattr(self, 'providers') or not self.providers: print(f"ERROR: No providers available in scanner {id(self)}, cannot start scan") return False @@ -208,16 +184,13 @@ class Scanner: return False def _execute_scan(self, target_domain: str, max_depth: int) -> None: - """ - Execute the reconnaissance scan with concurrent provider queries. 
- - Args: - target_domain: Target domain to investigate - max_depth: Maximum recursion depth - """ + """Execute the reconnaissance scan with simplified recursion and forensic tracking.""" print(f"_execute_scan started for {target_domain} with depth {max_depth}") self.executor = ThreadPoolExecutor(max_workers=self.max_workers) + # Initialize variables outside try block + processed_targets = set() # Fix: Initialize here + try: print("Setting status to RUNNING") self.status = ScanStatus.RUNNING @@ -227,15 +200,16 @@ class Scanner: self.logger.log_scan_start(target_domain, max_depth, enabled_providers) print(f"Logged scan start with providers: {enabled_providers}") - # Initialize with target domain + # Initialize with target domain and track it print(f"Adding target domain '{target_domain}' as initial node") self.graph.add_node(target_domain, NodeType.DOMAIN) + self._initialize_provider_states(target_domain) - # BFS-style exploration + # BFS-style exploration with simplified recursion current_level_targets = {target_domain} - processed_targets = set() + all_discovered_targets = set() # Track all discovered targets for large entity detection - print("Starting BFS exploration...") + print("Starting BFS exploration with simplified recursion...") for depth in range(max_depth + 1): if self.stop_event.is_set(): @@ -251,14 +225,20 @@ class Scanner: self.total_indicators_found += len(current_level_targets) - target_results = self._process_targets_concurrent(current_level_targets, processed_targets) + # Process targets and collect newly discovered ones + target_results = self._process_targets_concurrent_forensic( + current_level_targets, processed_targets, all_discovered_targets, depth + ) next_level_targets = set() for target, new_targets in target_results: processed_targets.add(target) + all_discovered_targets.update(new_targets) + + # Simple recursion rule: only valid IPs and domains within depth limit if depth < max_depth: for new_target in new_targets: - if new_target not in processed_targets: + if self._should_recurse_on_target(new_target, processed_targets, all_discovered_targets): next_level_targets.add(new_target) current_level_targets = next_level_targets @@ -284,20 +264,60 @@ class Scanner: print("Final scan statistics:") print(f" - Total nodes: {stats['basic_metrics']['total_nodes']}") print(f" - Total edges: {stats['basic_metrics']['total_edges']}") - print(f" - Targets processed: {len(processed_targets)}") + print(f" - Targets processed: {len(processed_targets)}") + def _initialize_provider_states(self, target: str) -> None: + """Initialize provider states for forensic tracking.""" + if not self.graph.graph.has_node(target): # Fix: Use .graph.has_node() + return + + node_data = self.graph.graph.nodes[target] + if 'metadata' not in node_data: + node_data['metadata'] = {} + if 'provider_states' not in node_data['metadata']: + node_data['metadata']['provider_states'] = {} - def _process_targets_concurrent(self, targets: Set[str], processed_targets: Set[str]) -> List[Tuple[str, Set[str]]]: - """Process multiple targets (domains or IPs) concurrently using a thread pool.""" + def _should_recurse_on_target(self, target: str, processed_targets: Set[str], all_discovered: Set[str]) -> bool: + """ + Simplified recursion logic: only recurse on valid IPs and domains that haven't been processed. + FORENSIC PRINCIPLE: Clear, simple rules for what gets recursed. 
+ """ + # Don't recurse on already processed targets + if target in processed_targets: + return False + + # Only recurse on valid IPs and domains + if not (_is_valid_ip(target) or _is_valid_domain(target)): + return False + + # Don't recurse on targets contained in large entities + if self._is_in_large_entity(target): + return False + + return True + + def _is_in_large_entity(self, target: str) -> bool: + """Check if a target is contained within a large entity node.""" + for node_id, node_data in self.graph.graph.nodes(data=True): + if node_data.get('type') == NodeType.LARGE_ENTITY.value: + metadata = node_data.get('metadata', {}) + contained_nodes = metadata.get('nodes', []) + if target in contained_nodes: + return True + return False + + def _process_targets_concurrent_forensic(self, targets: Set[str], processed_targets: Set[str], + all_discovered: Set[str], current_depth: int) -> List[Tuple[str, Set[str]]]: + """Process multiple targets concurrently with forensic provider state tracking.""" results = [] targets_to_process = targets - processed_targets if not targets_to_process: return results - print(f"Processing {len(targets_to_process)} targets concurrently with {self.max_workers} workers") + print(f"Processing {len(targets_to_process)} targets concurrently with forensic tracking") future_to_target = { - self.executor.submit(self._query_providers_for_target, target): target + self.executor.submit(self._query_providers_forensic, target, current_depth): target for target in targets_to_process } @@ -313,29 +333,37 @@ class Scanner: print(f"Completed processing target: {target} (found {len(new_targets)} new targets)") except (Exception, CancelledError) as e: print(f"Error processing target {target}: {e}") + self._log_target_processing_error(target, str(e)) return results - def _query_providers_for_target(self, target: str) -> Set[str]: + def _query_providers_forensic(self, target: str, current_depth: int) -> Set[str]: """ - Query all enabled providers for information about a target (domain or IP) and collect comprehensive metadata. - Creates appropriate node types and relationships based on discovered data. + Query providers for a target with forensic state tracking and simplified recursion. + REFACTORED: Simplified logic with complete forensic audit trail. 
""" is_ip = _is_valid_ip(target) target_type = NodeType.IP if is_ip else NodeType.DOMAIN - print(f"Querying {len(self.providers)} providers for {target_type.value}: {target}") + print(f"Querying providers for {target_type.value}: {target} at depth {current_depth}") + # Initialize node and provider states + self.graph.add_node(target, target_type) + self._initialize_provider_states(target) + new_targets = set() - all_relationships = [] + target_metadata = defaultdict(lambda: defaultdict(list)) - if not self.providers or self.stop_event.is_set(): + # Determine eligible providers for this target + eligible_providers = self._get_eligible_providers(target, is_ip) + + if not eligible_providers: + self._log_no_eligible_providers(target, is_ip) return new_targets - with ThreadPoolExecutor(max_workers=len(self.providers)) as provider_executor: + # Query each eligible provider with forensic tracking + with ThreadPoolExecutor(max_workers=len(eligible_providers)) as provider_executor: future_to_provider = { - provider_executor.submit( - self._safe_provider_query, provider, target, is_ip - ): provider - for provider in self.providers + provider_executor.submit(self._query_single_provider_forensic, provider, target, is_ip, current_depth): provider + for provider in eligible_providers } for future in as_completed(future_to_provider): @@ -345,60 +373,139 @@ class Scanner: provider = future_to_provider[future] try: - relationships = future.result() - print(f"Provider {provider.get_name()} returned {len(relationships)} relationships for {target}") - for rel in relationships: - source, rel_target, rel_type, confidence, raw_data = rel - enhanced_rel = (source, rel_target, rel_type, confidence, raw_data, provider.get_name()) - all_relationships.append(enhanced_rel) + provider_results = future.result() + if provider_results: + discovered_targets = self._process_provider_results_forensic( + target, provider, provider_results, target_metadata, current_depth + ) + new_targets.update(discovered_targets) except (Exception, CancelledError) as e: - print(f"Provider {provider.get_name()} failed for {target}: {e}") + self._log_provider_error(target, provider.get_name(), str(e)) - # NEW Step 2: Group all targets by type and identify large entities - discovered_targets_by_type = defaultdict(set) - for _, rel_target, _, _, _, _ in all_relationships: - if _is_valid_domain(rel_target): - discovered_targets_by_type[NodeType.DOMAIN].add(rel_target) - elif _is_valid_ip(rel_target): - discovered_targets_by_type[NodeType.IP].add(rel_target) + # Update node with collected metadata + if target_metadata[target]: + self.graph.add_node(target, target_type, metadata=dict(target_metadata[target])) - targets_to_skip = set() - for node_type, targets in discovered_targets_by_type.items(): - if len(targets) > self.config.large_entity_threshold: - print(f"Large number of {node_type.value}s ({len(targets)}) found for {target}. 
Creating a large entity node.") - first_rel = next((r for r in all_relationships if r[1] in targets), None) - if first_rel: - self._handle_large_entity(target, list(targets), first_rel[2], first_rel[5]) - targets_to_skip.update(targets) + return new_targets - # Step 3: Process all relationships to create/update nodes and edges - target_metadata = defaultdict(lambda: defaultdict(list)) + def _get_eligible_providers(self, target: str, is_ip: bool) -> List: + """Get providers eligible for querying this target.""" + eligible = [] + target_key = 'ips' if is_ip else 'domains' + + for provider in self.providers: + provider_name = provider.get_name() + if provider_name in self.provider_eligibility: + if self.provider_eligibility[provider_name][target_key]: + # Check if we already queried this provider for this target + if not self._already_queried_provider(target, provider_name): + eligible.append(provider) + else: + print(f"Skipping {provider_name} for {target} - already queried") + + return eligible + + def _already_queried_provider(self, target: str, provider_name: str) -> bool: + """Check if we already queried a provider for a target.""" + if not self.graph.graph.has_node(target): # Fix: Use .graph.has_node() + return False + + node_data = self.graph.graph.nodes[target] + provider_states = node_data.get('metadata', {}).get('provider_states', {}) + return provider_name in provider_states + + def _query_single_provider_forensic(self, provider, target: str, is_ip: bool, current_depth: int) -> List: + """Query a single provider with complete forensic logging.""" + provider_name = provider.get_name() + start_time = datetime.now(timezone.utc) + + print(f"Querying {provider_name} for {target}") + + # Log attempt + self.logger.logger.info(f"Attempting {provider_name} query for {target} at depth {current_depth}") + + try: + # Perform the query + if is_ip: + results = provider.query_ip(target) + else: + results = provider.query_domain(target) + + # Track successful state + self._update_provider_state(target, provider_name, 'success', len(results), None, start_time) + + print(f"✓ {provider_name} returned {len(results)} results for {target}") + return results + + except Exception as e: + # Track failed state + self._update_provider_state(target, provider_name, 'failed', 0, str(e), start_time) + print(f"✗ {provider_name} failed for {target}: {e}") + raise + + def _update_provider_state(self, target: str, provider_name: str, status: str, + results_count: int, error: str, start_time: datetime) -> None: + """Update provider state in node metadata for forensic tracking.""" + if not self.graph.graph.has_node(target): # Fix: Use .graph.has_node() + return + + node_data = self.graph.graph.nodes[target] + if 'metadata' not in node_data: + node_data['metadata'] = {} + if 'provider_states' not in node_data['metadata']: + node_data['metadata']['provider_states'] = {} + + node_data['metadata']['provider_states'][provider_name] = { + 'status': status, + 'timestamp': start_time.isoformat(), + 'results_count': results_count, + 'error': error, + 'duration_ms': (datetime.now(timezone.utc) - start_time).total_seconds() * 1000 + } + + # Log to forensic trail + self.logger.logger.info(f"Provider state updated: {target} -> {provider_name} -> {status} ({results_count} results)") + + def _process_provider_results_forensic(self, target: str, provider, results: List, + target_metadata: Dict, current_depth: int) -> Set[str]: + """Process provider results with large entity detection and forensic logging.""" + provider_name = 
provider.get_name() + discovered_targets = set() + + # Check for large entity threshold per provider + if len(results) > self.config.large_entity_threshold: + print(f"Large entity detected: {provider_name} returned {len(results)} results for {target}") + self._create_large_entity(target, provider_name, results, current_depth) + # Large entities block recursion - return empty set + return discovered_targets + + # Process each relationship dns_records_to_create = {} - for source, rel_target, rel_type, confidence, raw_data, provider_name in all_relationships: + for source, rel_target, rel_type, confidence, raw_data in results: if self.stop_event.is_set(): break - # Special handling for crt.sh to distribute certificate metadata - if provider_name == 'crtsh' and 'domain_certificates' in raw_data: - domain_certs = raw_data.get('domain_certificates', {}) - for cert_domain, cert_summary in domain_certs.items(): - if _is_valid_domain(cert_domain) and cert_domain not in targets_to_skip: - self.graph.add_node(cert_domain, NodeType.DOMAIN, metadata={'certificate_data': cert_summary}) + # Enhanced forensic logging for each relationship + self.logger.log_relationship_discovery( + source_node=source, + target_node=rel_target, + relationship_type=rel_type.relationship_name, + confidence_score=confidence, + provider=provider_name, + raw_data=raw_data, + discovery_method=f"{provider_name}_query_depth_{current_depth}" + ) - # General metadata collection - self._collect_node_metadata(source, provider_name, rel_type, rel_target, raw_data, target_metadata[source]) + # Collect metadata for source node + self._collect_node_metadata_forensic(source, provider_name, rel_type, rel_target, raw_data, target_metadata[source]) - # Add nodes and edges to the graph - if rel_target in targets_to_skip: - continue - + # Add nodes and edges based on target type if _is_valid_ip(rel_target): self.graph.add_node(rel_target, NodeType.IP) if self.graph.add_edge(source, rel_target, rel_type, confidence, provider_name, raw_data): print(f"Added IP relationship: {source} -> {rel_target} ({rel_type.relationship_name})") - if rel_type in [RelationshipType.A_RECORD, RelationshipType.AAAA_RECORD]: - new_targets.add(rel_target) + discovered_targets.add(rel_target) elif rel_target.startswith('AS') and rel_target[2:].isdigit(): self.graph.add_node(rel_target, NodeType.ASN) @@ -409,74 +516,74 @@ class Scanner: self.graph.add_node(rel_target, NodeType.DOMAIN) if self.graph.add_edge(source, rel_target, rel_type, confidence, provider_name, raw_data): print(f"Added domain relationship: {source} -> {rel_target} ({rel_type.relationship_name})") - - recurse_types = [ - RelationshipType.CNAME_RECORD, RelationshipType.MX_RECORD, - RelationshipType.SAN_CERTIFICATE, RelationshipType.NS_RECORD, - RelationshipType.PASSIVE_DNS - ] - if rel_type in recurse_types: - new_targets.add(rel_target) - + discovered_targets.add(rel_target) + + # *** NEW: Enrich the newly discovered domain *** + self._collect_node_metadata_forensic(rel_target, provider_name, rel_type, source, raw_data, target_metadata[rel_target]) + else: - # Handle DNS record content - dns_record_types = [ - RelationshipType.TXT_RECORD, RelationshipType.SPF_RECORD, - RelationshipType.CAA_RECORD, RelationshipType.SRV_RECORD, - RelationshipType.DNSKEY_RECORD, RelationshipType.DS_RECORD, - RelationshipType.RRSIG_RECORD, RelationshipType.SSHFP_RECORD, - RelationshipType.TLSA_RECORD, RelationshipType.NAPTR_RECORD - ] - if rel_type in dns_record_types: - record_type = 
rel_type.relationship_name.upper().replace('_RECORD', '') - record_content = rel_target.strip() - content_hash = hash(record_content) & 0x7FFFFFFF - dns_record_id = f"{record_type}:{content_hash}" - - if dns_record_id not in dns_records_to_create: - dns_records_to_create[dns_record_id] = { - 'content': record_content, 'type': record_type, 'domains': set(), - 'raw_data': raw_data, 'provider_name': provider_name, 'confidence': confidence - } - dns_records_to_create[dns_record_id]['domains'].add(source) - - # Step 4: Update the source node with its collected metadata - if target in target_metadata: - self.graph.add_node(target, target_type, metadata=dict(target_metadata[target])) + # Handle DNS record content + self._handle_dns_record_content(source, rel_target, rel_type, confidence, raw_data, provider_name, dns_records_to_create) - # Step 5: Create DNS record nodes and edges - for dns_record_id, record_info in dns_records_to_create.items(): - record_metadata = { - 'record_type': record_info['type'], 'content': record_info['content'], - 'content_hash': dns_record_id.split(':')[1], - 'associated_domains': list(record_info['domains']), - 'source_data': record_info['raw_data'] - } - self.graph.add_node(dns_record_id, NodeType.DNS_RECORD, metadata=record_metadata) - for domain_name in record_info['domains']: - self.graph.add_edge(domain_name, dns_record_id, RelationshipType.DNS_RECORD, - record_info['confidence'], record_info['provider_name'], - record_info['raw_data']) - - return new_targets + # Create DNS record nodes + self._create_dns_record_nodes(dns_records_to_create) + + return discovered_targets + + def _create_large_entity(self, source: str, provider_name: str, results: List, current_depth: int) -> None: + """Create a large entity node for forensic tracking.""" + entity_id = f"large_entity_{provider_name}_{hash(source) & 0x7FFFFFFF}" + + # Extract targets from results + targets = [rel[1] for rel in results if len(rel) > 1] + + # Determine node type + node_type = 'unknown' + if targets: + if _is_valid_domain(targets[0]): + node_type = 'domain' + elif _is_valid_ip(targets[0]): + node_type = 'ip' + + # Create large entity metadata + metadata = { + 'count': len(targets), + 'nodes': targets, + 'node_type': node_type, + 'source_provider': provider_name, + 'discovery_depth': current_depth, + 'threshold_exceeded': self.config.large_entity_threshold, + 'forensic_note': f'Large entity created due to {len(targets)} results from {provider_name}' + } + + # Create the node and edge + self.graph.add_node(entity_id, NodeType.LARGE_ENTITY, metadata=metadata) + + # Use first result's relationship type for the edge + if results: + rel_type = results[0][2] + self.graph.add_edge(source, entity_id, rel_type, 0.9, provider_name, + {'large_entity_info': f'Contains {len(targets)} {node_type}s'}) + + # Forensic logging + self.logger.logger.warning(f"Large entity created: {entity_id} contains {len(targets)} targets from {provider_name}") + + print(f"Created large entity {entity_id} for {len(targets)} {node_type}s from {provider_name}") + + def _collect_node_metadata_forensic(self, node_id: str, provider_name: str, rel_type: RelationshipType, + target: str, raw_data: Dict[str, Any], metadata: Dict[str, Any]) -> None: + """Collect and organize metadata for forensic tracking with enhanced logging.""" + + # Log metadata collection + self.logger.logger.debug(f"Collecting metadata for {node_id} from {provider_name}: {rel_type.relationship_name}") - def _collect_node_metadata(self, node_id: str, provider_name: str, 
rel_type: RelationshipType, - target: str, raw_data: Dict[str, Any], metadata: Dict[str, Any]) -> None: - """ - Collect and organize metadata for a node based on provider responses. - """ if provider_name == 'dns': record_type = raw_data.get('query_type', 'UNKNOWN') value = raw_data.get('value', target) - - if record_type in ['TXT', 'SPF', 'CAA']: - dns_entry = f"{record_type}: {value}" - else: - dns_entry = f"{record_type}: {value}" - + dns_entry = f"{record_type}: {value}" if dns_entry not in metadata.get('dns_records', []): metadata.setdefault('dns_records', []).append(dns_entry) - + elif provider_name == 'crtsh': if rel_type == RelationshipType.SAN_CERTIFICATE: domain_certs = raw_data.get('domain_certificates', {}) @@ -486,24 +593,13 @@ class Scanner: metadata['has_valid_cert'] = cert_summary.get('has_valid_cert', False) if target not in metadata.get('related_domains_san', []): metadata.setdefault('related_domains_san', []).append(target) - shared_certs = raw_data.get('shared_certificates', []) - if shared_certs and 'shared_certificate_details' not in metadata: - metadata['shared_certificate_details'] = shared_certs - + elif provider_name == 'shodan': for key, value in raw_data.items(): if key not in metadata.get('shodan', {}) or not metadata.get('shodan', {}).get(key): metadata.setdefault('shodan', {})[key] = value - - elif provider_name == 'virustotal': - for key, value in raw_data.items(): - if key not in metadata.get('virustotal', {}) or not metadata.get('virustotal', {}).get(key): - metadata.setdefault('virustotal', {})[key] = value - if rel_type == RelationshipType.PASSIVE_DNS: - passive_entry = f"Passive DNS: {target}" - if passive_entry not in metadata.get('passive_dns', []): - metadata.setdefault('passive_dns', []).append(passive_entry) - + + # Track ASN data if rel_type == RelationshipType.ASN_MEMBERSHIP: metadata['asn_data'] = { 'asn': target, @@ -512,48 +608,82 @@ class Scanner: 'country': raw_data.get('country', '') } - def _handle_large_entity(self, source: str, targets: list, rel_type: RelationshipType, provider_name: str): - """ - Handles the creation of a large entity node when a threshold is exceeded. - """ - print(f"Large number of {rel_type.name} relationships for {source}. 
Creating a large entity node.") - entity_name = f"Large collection of {rel_type.name} for {source}" - node_type = 'unknown' - if targets: - if _is_valid_domain(targets[0]): - node_type = 'domain' - elif _is_valid_ip(targets[0]): - node_type = 'ip' - self.graph.add_node(entity_name, NodeType.LARGE_ENTITY, metadata={"count": len(targets), "nodes": targets, "node_type": node_type}) - self.graph.add_edge(source, entity_name, rel_type, 0.9, provider_name, {"info": "Aggregated node"}) + def _handle_dns_record_content(self, source: str, rel_target: str, rel_type: RelationshipType, + confidence: float, raw_data: Dict[str, Any], provider_name: str, + dns_records: Dict) -> None: + """Handle DNS record content with forensic tracking.""" + dns_record_types = [ + RelationshipType.TXT_RECORD, RelationshipType.SPF_RECORD, + RelationshipType.CAA_RECORD, RelationshipType.SRV_RECORD, + RelationshipType.DNSKEY_RECORD, RelationshipType.DS_RECORD, + RelationshipType.RRSIG_RECORD, RelationshipType.SSHFP_RECORD, + RelationshipType.TLSA_RECORD, RelationshipType.NAPTR_RECORD + ] + + if rel_type in dns_record_types: + record_type = rel_type.relationship_name.upper().replace('_RECORD', '') + record_content = rel_target.strip() + content_hash = hash(record_content) & 0x7FFFFFFF + dns_record_id = f"{record_type}:{content_hash}" + + if dns_record_id not in dns_records: + dns_records[dns_record_id] = { + 'content': record_content, + 'type': record_type, + 'domains': set(), + 'raw_data': raw_data, + 'provider_name': provider_name, + 'confidence': confidence + } + dns_records[dns_record_id]['domains'].add(source) - def _safe_provider_query(self, provider, target: str, is_ip: bool) -> List[Tuple[str, str, RelationshipType, float, Dict[str, Any]]]: - """Safely query a provider for a target with error handling.""" - if self.stop_event.is_set(): - return [] - try: - if is_ip: - return provider.query_ip(target) - else: - return provider.query_domain(target) - except Exception as e: - print(f"Provider {provider.get_name()} query failed for {target}: {e}") - return [] + def _create_dns_record_nodes(self, dns_records: Dict) -> None: + """Create DNS record nodes with forensic metadata.""" + for dns_record_id, record_info in dns_records.items(): + record_metadata = { + 'record_type': record_info['type'], + 'content': record_info['content'], + 'content_hash': dns_record_id.split(':')[1], + 'associated_domains': list(record_info['domains']), + 'source_data': record_info['raw_data'], + 'forensic_note': f"DNS record created from {record_info['provider_name']} query" + } + + self.graph.add_node(dns_record_id, NodeType.DNS_RECORD, metadata=record_metadata) + + for domain_name in record_info['domains']: + self.graph.add_edge(domain_name, dns_record_id, RelationshipType.DNS_RECORD, + record_info['confidence'], record_info['provider_name'], + record_info['raw_data']) + + # Forensic logging for DNS record creation + self.logger.logger.info(f"DNS record node created: {dns_record_id} for {len(record_info['domains'])} domains") + + def _log_target_processing_error(self, target: str, error: str) -> None: + """Log target processing errors for forensic trail.""" + self.logger.logger.error(f"Target processing failed for {target}: {error}") + + def _log_provider_error(self, target: str, provider_name: str, error: str) -> None: + """Log provider query errors for forensic trail.""" + self.logger.logger.error(f"Provider {provider_name} failed for {target}: {error}") + + def _log_no_eligible_providers(self, target: str, is_ip: bool) -> None: + """Log 
when no providers are eligible for a target.""" + target_type = 'IP' if is_ip else 'domain' + self.logger.logger.warning(f"No eligible providers for {target_type}: {target}") def stop_scan(self) -> bool: - """ - Request immediate scan termination. - Acts on the thread's liveness, not just the 'RUNNING' status. - """ + """Request immediate scan termination with forensic logging.""" try: if not self.scan_thread or not self.scan_thread.is_alive(): print("No active scan thread to stop.") - # Cleanup state if inconsistent if self.status == ScanStatus.RUNNING: self.status = ScanStatus.STOPPED return False print("=== INITIATING IMMEDIATE SCAN TERMINATION ===") + self.logger.logger.info("Scan termination requested by user") + self.status = ScanStatus.STOPPED self.stop_event.set() @@ -563,22 +693,15 @@ class Scanner: print("Termination signal sent. The scan thread will stop shortly.") return True + except Exception as e: print(f"ERROR: Exception in stop_scan: {e}") + self.logger.logger.error(f"Error during scan termination: {e}") traceback.print_exc() return False - def _force_stop_completion(self): - """Force completion of stop operation after timeout.""" - if self.status == ScanStatus.RUNNING: - print("Forcing scan termination after timeout") - self.status = ScanStatus.STOPPED - self.logger.log_scan_complete() - def get_scan_status(self) -> Dict[str, Any]: - """ - Get current scan status and progress. - """ + """Get current scan status with forensic information.""" try: return { 'status': self.status, @@ -615,27 +738,25 @@ class Scanner: return min(100.0, (self.indicators_processed / self.total_indicators_found) * 100) def get_graph_data(self) -> Dict[str, Any]: - """ - Get current graph data for visualization. - """ + """Get current graph data for visualization.""" return self.graph.get_graph_data() def export_results(self) -> Dict[str, Any]: - """ - Export complete scan results including graph and audit trail. - """ + """Export complete scan results with forensic audit trail.""" graph_data = self.graph.export_json() audit_trail = self.logger.export_audit_trail() provider_stats = {} for provider in self.providers: provider_stats[provider.get_name()] = provider.get_statistics() + export_data = { 'scan_metadata': { 'target_domain': self.current_target, 'max_depth': self.max_depth, 'final_status': self.status, 'total_indicators_processed': self.indicators_processed, - 'enabled_providers': list(provider_stats.keys()) + 'enabled_providers': list(provider_stats.keys()), + 'forensic_note': 'Refactored scanner with simplified recursion and forensic tracking' }, 'graph_data': graph_data, 'forensic_audit': audit_trail, @@ -645,9 +766,7 @@ class Scanner: return export_data def get_provider_statistics(self) -> Dict[str, Dict[str, Any]]: - """ - Get statistics for all providers. 
- """ + """Get statistics for all providers with forensic information.""" stats = {} for provider in self.providers: stats[provider.get_name()] = provider.get_statistics() diff --git a/providers/crtsh_provider.py b/providers/crtsh_provider.py index fc977f2..30ec5ea 100644 --- a/providers/crtsh_provider.py +++ b/providers/crtsh_provider.py @@ -478,57 +478,56 @@ class CrtShProvider(BaseProvider): common_name = cert_data.get('common_name', '') if common_name: cleaned_cn = self._clean_domain_name(common_name) - if cleaned_cn and _is_valid_domain(cleaned_cn): - domains.add(cleaned_cn) + if cleaned_cn: + domains.update(cleaned_cn) # Extract from name_value field (contains SANs) name_value = cert_data.get('name_value', '') if name_value: # Split by newlines and clean each domain for line in name_value.split('\n'): - cleaned_domain = self._clean_domain_name(line.strip()) - if cleaned_domain and _is_valid_domain(cleaned_domain): - domains.add(cleaned_domain) + cleaned_domains = self._clean_domain_name(line.strip()) + if cleaned_domains: + domains.update(cleaned_domains) return domains - def _clean_domain_name(self, domain_name: str) -> str: + def _clean_domain_name(self, domain_name: str) -> List[str]: """ Clean and normalize domain name from certificate data. - - Args: - domain_name: Raw domain name from certificate - - Returns: - Cleaned domain name or empty string if invalid + Now returns a list to handle wildcards correctly. """ if not domain_name: - return "" - - # Remove common prefixes and clean up + return [] + domain = domain_name.strip().lower() - + # Remove protocol if present if domain.startswith(('http://', 'https://')): domain = domain.split('://', 1)[1] - + # Remove path if present if '/' in domain: domain = domain.split('/', 1)[0] - + # Remove port if present if ':' in domain and not domain.count(':') > 1: # Avoid breaking IPv6 domain = domain.split(':', 1)[0] - + # Handle wildcard domains + cleaned_domains = [] if domain.startswith('*.'): - domain = domain[2:] - - # Remove any remaining invalid characters - domain = re.sub(r'[^\w\-\.]', '', domain) - - # Ensure it's not empty and doesn't start/end with dots or hyphens - if domain and not domain.startswith(('.', '-')) and not domain.endswith(('.', '-')): - return domain - - return "" \ No newline at end of file + # Add both the wildcard and the base domain + cleaned_domains.append(domain) + cleaned_domains.append(domain[2:]) + else: + cleaned_domains.append(domain) + + # Remove any remaining invalid characters and validate + final_domains = [] + for d in cleaned_domains: + d = re.sub(r'[^\w\-\.]', '', d) + if d and not d.startswith(('.', '-')) and not d.endswith(('.', '-')): + final_domains.append(d) + + return [d for d in final_domains if _is_valid_domain(d)] \ No newline at end of file