overcuriousity
2025-09-11 16:14:20 +02:00
parent d3e1fcf35f
commit 2a87403cb6
4 changed files with 151 additions and 133 deletions


@@ -16,7 +16,6 @@ from providers.crtsh_provider import CrtShProvider
from providers.dns_provider import DNSProvider
from providers.shodan_provider import ShodanProvider
from providers.virustotal_provider import VirusTotalProvider
from config import config
class ScanStatus:
@@ -202,7 +201,7 @@ class Scanner:
# Start scan in separate thread
print(f"Starting scan thread for scanner {id(self)}...")
self.scan_thread = threading.Thread(
target=self._execute_scan_async,
target=self._execute_scan,
args=(self.current_target, max_depth),
daemon=True
)
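Why the rename in this hunk matters: threading.Thread invokes its target as a plain callable, so pointing it at an async def only creates a coroutine object that is never awaited, and the method body never runs. A minimal standalone reproduction of the failure mode the old code had (not part of this commit):

import threading

async def scan_async():
    print("never printed")   # coroutine body; never awaited, never runs

def scan():
    print("scanning")        # executes normally in the thread

# Thread calls its target like a regular function: scan_async() merely
# constructs a coroutine, and Python warns "coroutine ... was never awaited".
t1 = threading.Thread(target=scan_async)
t2 = threading.Thread(target=scan)
t1.start(); t2.start(); t1.join(); t2.join()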
@@ -216,15 +215,15 @@ class Scanner:
traceback.print_exc()
return False
async def _execute_scan_async(self, target_domain: str, max_depth: int) -> None:
def _execute_scan(self, target_domain: str, max_depth: int) -> None:
"""
Execute the reconnaissance scan asynchronously with concurrent provider queries.
Execute the reconnaissance scan with concurrent provider queries.
Args:
target_domain: Target domain to investigate
max_depth: Maximum recursion depth
"""
print(f"_execute_scan_async started for {target_domain} with depth {max_depth}")
print(f"_execute_scan started for {target_domain} with depth {max_depth}")
self.executor = ThreadPoolExecutor(max_workers=self.max_workers)
try:
@@ -275,14 +274,21 @@ class Scanner:
for discovered_domain in discovered_domains:
if discovered_domain not in processed_domains:
next_level_domains.add(discovered_domain)
print(f"Adding {discovered_domain} to next level")
print(f"Adding {discovered_domain} to next level from domain query")
if self.stop_event.is_set():
break
if all_discovered_ips:
print(f"Processing {len(all_discovered_ips)} discovered IP addresses")
self._process_ips_concurrent(all_discovered_ips)
# MODIFICATION START: Capture new domains from IP processing
new_domains_from_ips = self._process_ips_concurrent(all_discovered_ips)
if depth < max_depth:
for new_domain in new_domains_from_ips:
if new_domain not in processed_domains:
next_level_domains.add(new_domain)
print(f"Adding {new_domain} to next level from IP query")
# MODIFICATION END
current_level_domains = next_level_domains
print(f"Completed depth {depth}, {len(next_level_domains)} domains for next level")
@@ -341,12 +347,14 @@ class Scanner:
print(f"Error processing domain {domain}: {e}")
return results
def _process_ips_concurrent(self, ips: Set[str]) -> None:
def _process_ips_concurrent(self, ips: Set[str]) -> Set[str]: # MODIFICATION: Changed return type
"""
Process multiple IP addresses concurrently.
"""
all_discovered_domains = set() # NEW: Set to aggregate all results
if not ips or self.stop_event.is_set():
return
return all_discovered_domains # MODIFICATION: Return the new set
print(f"Processing {len(ips)} IP addresses concurrently")
future_to_ip = {
self.executor.submit(self._query_providers_for_ip, ip): ip
@@ -358,10 +366,15 @@ class Scanner:
continue
ip = future_to_ip[future]
try:
future.result() # Just wait for completion
print(f"Completed processing IP: {ip}")
# NEW: Get the set of domains from the future's result and update our aggregate set
discovered_domains_from_ip = future.result()
all_discovered_domains.update(discovered_domains_from_ip)
print(f"Completed processing IP: {ip}, found {len(discovered_domains_from_ip)} new domains.")
except (Exception, CancelledError) as e:
print(f"Error processing IP {ip}: {e}")
return all_discovered_domains # MODIFICATION: Return the final aggregated set
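The reworked method is a standard fan-out/fan-in: submit one future per IP, then union each completed future's result set. Stripped of the scanner's bookkeeping, the pattern looks like this (a sketch; lookup is an assumed callable returning a set of hostnames):

from concurrent.futures import ThreadPoolExecutor, as_completed

def process_ips(ips, lookup, max_workers=10):
    """Fan out lookups across a pool and aggregate every result set."""
    found = set()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(lookup, ip): ip for ip in ips}
        for future in as_completed(futures):
            try:
                found.update(future.result())   # each result is a set
            except Exception as exc:
                print(f"lookup failed for {futures[future]}: {exc}")
    return found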
def _query_providers_for_domain(self, domain: str) -> Tuple[Set[str], Set[str]]:
"""
@@ -373,20 +386,10 @@ class Scanner:
discovered_ips = set()
all_relationships = []
# Comprehensive metadata collection for this domain
domain_metadata = {
'dns_records': [],
'related_domains_san': [],
'shodan': {},
'virustotal': {},
'certificate_data': {},
'passive_dns': [],
}
if not self.providers or self.stop_event.is_set():
return discovered_domains, discovered_ips
# Query all providers concurrently
# Step 1: Query all providers and gather all relationships
with ThreadPoolExecutor(max_workers=len(self.providers)) as provider_executor:
future_to_provider = {
provider_executor.submit(self._safe_provider_query_domain, provider, domain): provider
@@ -397,89 +400,88 @@ class Scanner:
if self.stop_event.is_set():
future.cancel()
continue
provider = future_to_provider[future]
try:
relationships = future.result()
print(f"Provider {provider.get_name()} returned {len(relationships)} relationships")
# Process relationships and collect metadata
print(f"Provider {provider.get_name()} returned {len(relationships)} relationships for {domain}")
for rel in relationships:
source, target, rel_type, confidence, raw_data = rel
# Add provider info to the relationship
enhanced_rel = (source, target, rel_type, confidence, raw_data, provider.get_name())
all_relationships.append(enhanced_rel)
# Collect metadata based on provider and relationship type
self._collect_node_metadata(domain, provider.get_name(), rel_type, target, raw_data, domain_metadata)
except (Exception, CancelledError) as e:
print(f"Provider {provider.get_name()} failed for {domain}: {e}")
# Add the domain node with comprehensive metadata
self.graph.add_node(domain, NodeType.DOMAIN, metadata=domain_metadata)
# NEW Step 2: Group all targets by type and identify large entities
discovered_targets_by_type = defaultdict(set)
for _, target, _, _, _, _ in all_relationships:
if _is_valid_domain(target):
discovered_targets_by_type[NodeType.DOMAIN].add(target)
elif _is_valid_ip(target):
discovered_targets_by_type[NodeType.IP].add(target)
# Group relationships by type for large entity handling
relationships_by_type = defaultdict(list)
for source, target, rel_type, confidence, raw_data, provider_name in all_relationships:
relationships_by_type[rel_type].append((source, target, rel_type, confidence, raw_data, provider_name))
targets_to_skip_recursion = set()
for node_type, targets in discovered_targets_by_type.items():
if len(targets) > self.config.large_entity_threshold:
print(f"Large number of {node_type.value}s ({len(targets)}) found for {domain}. Creating a large entity node.")
# We can use the first relationship's type and provider for the large entity node
first_rel = next((r for r in all_relationships if r[1] in targets), None)
if first_rel:
self._handle_large_entity(domain, list(targets), first_rel[2], first_rel[5])
targets_to_skip_recursion.update(targets)
# Handle large entities (only for SAN certificates currently)
for rel_type, relationships in relationships_by_type.items():
if len(relationships) > config.large_entity_threshold and rel_type == RelationshipType.SAN_CERTIFICATE:
first_provider = relationships[0][5] if relationships else "multiple_providers"
self._handle_large_entity(domain, relationships, rel_type, first_provider)
# Remove these relationships from further processing
all_relationships = [rel for rel in all_relationships if not (rel[2] == rel_type and len(relationships_by_type[rel_type]) > config.large_entity_threshold)]
# Track DNS records to create (avoid duplicates)
# Step 3: Process all relationships to create/update nodes and edges
domain_metadata = defaultdict(lambda: defaultdict(list))
dns_records_to_create = {}
# Process remaining relationships
for source, target, rel_type, confidence, raw_data, provider_name in all_relationships:
if self.stop_event.is_set():
break
# Determine how to handle the target based on relationship type and content
# Special handling for crt.sh to distribute certificate metadata
if provider_name == 'crtsh' and 'domain_certificates' in raw_data:
domain_certs = raw_data.get('domain_certificates', {})
for cert_domain, cert_summary in domain_certs.items():
if _is_valid_domain(cert_domain):
# Create the node with its metadata. If node exists, metadata is updated.
self.graph.add_node(cert_domain, NodeType.DOMAIN, metadata={'certificate_data': cert_summary})
# General metadata collection for the source domain
self._collect_node_metadata(source, provider_name, rel_type, target, raw_data, domain_metadata[source])
# Add nodes and edges to the graph
if target in targets_to_skip_recursion:
continue
if _is_valid_ip(target):
# Create IP node and relationship
self.graph.add_node(target, NodeType.IP)
if self.graph.add_edge(source, target, rel_type, confidence, provider_name, raw_data):
print(f"Added IP relationship: {source} -> {target} ({rel_type.relationship_name})")
# Add to recursion if it's a direct resolution
if rel_type in [RelationshipType.A_RECORD, RelationshipType.AAAA_RECORD]:
discovered_ips.add(target)
elif target.startswith('AS') and target[2:].isdigit():
# Create ASN node and relationship
self.graph.add_node(target, NodeType.ASN)
if self.graph.add_edge(source, target, rel_type, confidence, provider_name, raw_data):
print(f"Added ASN relationship: {source} -> {target} ({rel_type.relationship_name})")
elif _is_valid_domain(target):
# Create domain node and relationship
# Ensure the target node exists before adding an edge
self.graph.add_node(target, NodeType.DOMAIN)
if self.graph.add_edge(source, target, rel_type, confidence, provider_name, raw_data):
print(f"Added domain relationship: {source} -> {target} ({rel_type.relationship_name})")
# Add to recursion for specific relationship types
recurse_types = [
RelationshipType.CNAME_RECORD,
RelationshipType.MX_RECORD,
RelationshipType.SAN_CERTIFICATE,
RelationshipType.NS_RECORD,
RelationshipType.CNAME_RECORD, RelationshipType.MX_RECORD,
RelationshipType.SAN_CERTIFICATE, RelationshipType.NS_RECORD,
RelationshipType.PASSIVE_DNS
]
if rel_type in recurse_types:
discovered_domains.add(target)
else:
# Handle DNS record content (TXT, SPF, CAA, etc.)
# Handle DNS record content
dns_record_types = [
RelationshipType.TXT_RECORD, RelationshipType.SPF_RECORD,
RelationshipType.CAA_RECORD, RelationshipType.SRV_RECORD,
@@ -487,59 +489,38 @@ class Scanner:
RelationshipType.RRSIG_RECORD, RelationshipType.SSHFP_RECORD,
RelationshipType.TLSA_RECORD, RelationshipType.NAPTR_RECORD
]
if rel_type in dns_record_types:
# Create normalized DNS record identifier
record_type = rel_type.relationship_name.upper().replace('_RECORD', '')
record_content = target.strip()
# Create a unique identifier for this DNS record
content_hash = hash(record_content) & 0x7FFFFFFF
dns_record_id = f"{record_type}:{content_hash}"
# Track this DNS record for creation (avoid duplicates)
if dns_record_id not in dns_records_to_create:
dns_records_to_create[dns_record_id] = {
'content': record_content,
'type': record_type,
'domains': set(),
'raw_data': raw_data,
'provider_name': provider_name,
'confidence': confidence
'content': record_content, 'type': record_type, 'domains': set(),
'raw_data': raw_data, 'provider_name': provider_name, 'confidence': confidence
}
# Add this domain to the DNS record's domain list
dns_records_to_create[dns_record_id]['domains'].add(source)
print(f"DNS record tracked: {source} -> {record_type} (content length: {len(record_content)})")
else:
# For other non-infrastructure targets, log but don't create nodes
print(f"Non-infrastructure relationship stored as metadata: {source} - {rel_type.relationship_name}: {target[:100]}")
# Step 4: Update the source domain node with its collected metadata
if domain in domain_metadata:
self.graph.add_node(domain, NodeType.DOMAIN, metadata=dict(domain_metadata[domain]))
# Create DNS record nodes and their relationships
# Step 5: Create DNS record nodes and edges
for dns_record_id, record_info in dns_records_to_create.items():
if self.stop_event.is_set():
break
record_metadata = {
'record_type': record_info['type'],
'content': record_info['content'],
'record_type': record_info['type'], 'content': record_info['content'],
'content_hash': dns_record_id.split(':')[1],
'associated_domains': list(record_info['domains']),
'source_data': record_info['raw_data']
}
# Create the DNS record node
self.graph.add_node(dns_record_id, NodeType.DNS_RECORD, metadata=record_metadata)
# Connect each domain to this DNS record
for domain_name in record_info['domains']:
if self.graph.add_edge(domain_name, dns_record_id, RelationshipType.DNS_RECORD,
record_info['confidence'], record_info['provider_name'],
record_info['raw_data']):
print(f"Added DNS record relationship: {domain_name} -> {dns_record_id}")
self.graph.add_edge(domain_name, dns_record_id, RelationshipType.DNS_RECORD,
record_info['confidence'], record_info['provider_name'],
record_info['raw_data'])
print(f"Domain {domain}: discovered {len(discovered_domains)} domains, {len(discovered_ips)} IPs, {len(dns_records_to_create)} DNS records")
print(f"Domain {domain}: discovered {len(discovered_domains)} domains, {len(discovered_ips)} IPs")
return discovered_domains, discovered_ips
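One caveat on the DNS-record identifiers built in this function: CPython randomizes hash() for strings per process (PYTHONHASHSEED), so hash(record_content) & 0x7FFFFFFF deduplicates correctly within a single scan but yields different node IDs on every run, which matters if graphs are persisted or compared across scans. A stable digest sidesteps that (a sketch; adopting it would be a behavior change, not part of this commit):

import hashlib

def dns_record_id(record_type, content):
    """Run-to-run stable identifier for a DNS record's content."""
    digest = hashlib.sha1(content.encode("utf-8")).hexdigest()[:8]
    return f"{record_type}:{digest}"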
def _collect_node_metadata(self, node_id: str, provider_name: str, rel_type: RelationshipType,
@@ -611,13 +592,15 @@ class Scanner:
self.graph.add_node(entity_name, NodeType.LARGE_ENTITY, metadata={"count": len(relationships)})
self.graph.add_edge(source_domain, entity_name, rel_type, 0.9, provider_name, {"info": "Aggregated node"})
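_handle_large_entity collapses an oversized result set into a single aggregate node so that one wildcard certificate or heavily shared host cannot flood the graph. The grouping decision both call sites now perform reduces to this sketch (the helper name is hypothetical):

def split_by_threshold(targets_by_type, threshold):
    """Return targets to track individually and groups to aggregate."""
    individual, aggregated = set(), {}
    for node_type, targets in targets_by_type.items():
        if len(targets) > threshold:
            aggregated[node_type] = targets   # becomes one LARGE_ENTITY node
        else:
            individual |= targets
    return individual, aggregated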
def _query_providers_for_ip(self, ip: str) -> None:
def _query_providers_for_ip(self, ip: str) -> Set[str]:
"""
Query all enabled providers for information about an IP address and collect comprehensive metadata.
"""
print(f"Querying {len(self.providers)} providers for IP: {ip}")
discovered_hostnames = set()
if not self.providers or self.stop_event.is_set():
return
return discovered_hostnames
# Comprehensive metadata collection for this IP
ip_metadata = {
@@ -646,35 +629,51 @@ class Scanner:
print(f"Provider {provider.get_name()} returned {len(relationships)} relationships for IP {ip}")
for source, target, rel_type, confidence, raw_data in relationships:
# Add provider info to the relationship
enhanced_rel = (source, target, rel_type, confidence, raw_data, provider.get_name())
all_relationships.append(enhanced_rel)
# Collect metadata for the IP
self._collect_ip_metadata(ip, provider.get_name(), rel_type, target, raw_data, ip_metadata)
except (Exception, CancelledError) as e:
print(f"Provider {provider.get_name()} failed for IP {ip}: {e}")
# NEW: Group all targets by type and identify large entities
discovered_targets_by_type = defaultdict(set)
for _, target, _, _, _, _ in all_relationships:
if _is_valid_domain(target):
discovered_targets_by_type[NodeType.DOMAIN].add(target)
# Fix: initialize targets_to_skip before it is used below.
targets_to_skip = set()
for node_type, targets in discovered_targets_by_type.items():
if len(targets) > self.config.large_entity_threshold:
print(f"Large number of {node_type.value}s ({len(targets)}) found for IP {ip}. Creating a large entity node.")
first_rel = next((r for r in all_relationships if r[1] in targets), None)
if first_rel:
self._handle_large_entity(ip, list(targets), first_rel[2], first_rel[5])
targets_to_skip.update(targets)
# Update the IP node with comprehensive metadata
self.graph.add_node(ip, NodeType.IP, metadata=ip_metadata)
# Process relationships with correct provider attribution
for source, target, rel_type, confidence, raw_data, provider_name in all_relationships:
# Determine target node type
if target in targets_to_skip:
continue
if _is_valid_domain(target):
target_node_type = NodeType.DOMAIN
discovered_hostnames.add(target)
elif target.startswith('AS'):
target_node_type = NodeType.ASN
else:
target_node_type = NodeType.IP
# Create/update target node
self.graph.add_node(target, target_node_type)
# Add relationship with correct provider attribution
if self.graph.add_edge(source, target, rel_type, confidence, provider_name, raw_data):
print(f"Added IP relationship: {source} -> {target} ({rel_type.relationship_name}) from {provider_name}")
return discovered_hostnames
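Note the asymmetry in target classification: the domain path requires target[2:].isdigit() after the AS prefix, while this IP path accepts any AS-prefixed string that fails the domain check. A stricter shared predicate would be (a sketch, not part of the commit):

import re

_ASN_RE = re.compile(r"^AS\d+$")

def _is_valid_asn(value):
    """True only for autonomous-system labels such as 'AS15169'."""
    return bool(_ASN_RE.match(value))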
def _collect_ip_metadata(self, ip: str, provider_name: str, rel_type: RelationshipType,
target: str, raw_data: Dict[str, Any], metadata: Dict[str, Any]) -> None: