Merge pull request 'remove-large-entity-temporarily' (#3) from remove-large-entity-temporarily into main

Reviewed-on: mstoeck3/dnsrecon#3
Author: Mario Stöckl
Date:   2025-09-19 12:29:26 +00:00
Commit: 3ee23c9d05
6 changed files with 366 additions and 351 deletions

View File

@@ -114,36 +114,6 @@ class GraphManager:
         self.last_modified = datetime.now(timezone.utc).isoformat()
         return True
 
-    def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool:
-        """
-        Removes a node from a large entity's internal lists and updates its count.
-        This prepares the large entity for the node's promotion to a regular node.
-        """
-        if not self.graph.has_node(large_entity_id):
-            return False
-
-        node_data = self.graph.nodes[large_entity_id]
-        attributes = node_data.get('attributes', [])
-
-        # Find the 'nodes' attribute dictionary in the list
-        nodes_attr = next((attr for attr in attributes if attr.get('name') == 'nodes'), None)
-
-        # Remove from the list of member nodes
-        if nodes_attr and 'value' in nodes_attr and isinstance(nodes_attr['value'], list) and node_id_to_extract in nodes_attr['value']:
-            nodes_attr['value'].remove(node_id_to_extract)
-
-            # Find the 'count' attribute and update it
-            count_attr = next((attr for attr in attributes if attr.get('name') == 'count'), None)
-            if count_attr:
-                count_attr['value'] = len(nodes_attr['value'])
-        else:
-            # This can happen if the node was already extracted, which is not an error.
-            print(f"Warning: Node {node_id_to_extract} not found in the 'nodes' list of {large_entity_id}.")
-            return True  # Proceed as if successful
-
-        self.last_modified = datetime.now(timezone.utc).isoformat()
-        return True
-
     def remove_node(self, node_id: str) -> bool:
         """Remove a node and its connected edges from the graph."""
         if not self.graph.has_node(node_id):
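
The removed helper above leaned on the large entity's list-of-dicts attribute format. For context, a minimal sketch of that shape and the lookup-and-sync pattern it used; the "nodes" and "count" attribute names come from the diff, while the surrounding values are illustrative:

```python
# Hypothetical attribute list as stored on a large entity node.
large_entity_attributes = [
    {"name": "count", "value": 3, "type": "statistic"},
    {"name": "nodes", "value": ["a.example.com", "b.example.com", "c.example.com"], "type": "metadata"},
]

def find_attr(attributes, name):
    """Linear scan, mirroring the next(...) lookups in the removed method."""
    return next((attr for attr in attributes if attr.get("name") == name), None)

# Extracting a member meant removing it from 'nodes' and re-syncing 'count'.
nodes_attr = find_attr(large_entity_attributes, "nodes")
nodes_attr["value"].remove("b.example.com")
count_attr = find_attr(large_entity_attributes, "count")
count_attr["value"] = len(nodes_attr["value"])  # keep count consistent
```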

View File

@@ -761,7 +761,7 @@ class Scanner:
     def _process_provider_task(self, provider: BaseProvider, target: str, depth: int) -> Tuple[Set[str], Set[str], bool]:
         """
         Manages the entire process for a given target and provider.
-        FIXED: Don't enqueue correlation tasks during normal processing.
+        This version is generalized to handle all relationships dynamically.
         """
         if self._is_stop_requested():
             return set(), set(), False
@@ -773,7 +773,6 @@ class Scanner:
         self._initialize_provider_states(target)
 
         new_targets = set()
-        large_entity_members = set()
         provider_successful = True
 
         try:
@@ -782,19 +781,17 @@
             if provider_result is None:
                 provider_successful = False
             elif not self._is_stop_requested():
-                # Pass all relationships to be processed
                 discovered, is_large_entity = self._process_provider_result_unified(
                     target, provider, provider_result, depth
                 )
-                if is_large_entity:
-                    large_entity_members.update(discovered)
-                else:
-                    new_targets.update(discovered)
+                new_targets.update(discovered)
         except Exception as e:
             provider_successful = False
             self._log_provider_error(target, provider.get_name(), str(e))
 
-        return new_targets, large_entity_members, provider_successful
+        return new_targets, set(), provider_successful
 
     def _execute_provider_query(self, provider: BaseProvider, target: str, is_ip: bool) -> Optional[ProviderResult]:
         """
@@ -824,73 +821,158 @@ class Scanner:
             self._update_provider_state(target, provider_name, 'failed', 0, str(e), start_time)
             return None
 
+    def _create_large_entity_from_result(self, source_node: str, provider_name: str,
+                                         provider_result: ProviderResult, depth: int) -> Tuple[str, Set[str]]:
+        """
+        Creates a large entity node, tags all member nodes, and returns its ID and members.
+        """
+        members = {rel.target_node for rel in provider_result.relationships
+                   if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)}
+
+        if not members:
+            return "", set()
+
+        large_entity_id = f"le_{provider_name}_{source_node}"
+
+        self.graph.add_node(
+            node_id=large_entity_id,
+            node_type=NodeType.LARGE_ENTITY,
+            attributes=[
+                {"name": "count", "value": len(members), "type": "statistic"},
+                {"name": "source_provider", "value": provider_name, "type": "metadata"},
+                {"name": "discovery_depth", "value": depth, "type": "metadata"},
+                {"name": "nodes", "value": list(members), "type": "metadata"}
+            ],
+            description=f"A collection of {len(members)} nodes discovered from {source_node} via {provider_name}."
+        )
+
+        for member_id in members:
+            node_type = NodeType.IP if _is_valid_ip(member_id) else NodeType.DOMAIN
+            self.graph.add_node(
+                node_id=member_id,
+                node_type=node_type,
+                metadata={'large_entity_id': large_entity_id}
+            )
+
+        return large_entity_id, members
+
+    def extract_node_from_large_entity(self, large_entity_id: str, node_id: str) -> bool:
+        """
+        Removes a node from a large entity, allowing it to be processed normally.
+        """
+        if not self.graph.graph.has_node(node_id):
+            return False
+
+        node_data = self.graph.graph.nodes[node_id]
+        metadata = node_data.get('metadata', {})
+
+        if metadata.get('large_entity_id') == large_entity_id:
+            # Remove the large entity tag
+            del metadata['large_entity_id']
+            self.graph.add_node(node_id, NodeType(node_data['type']), metadata=metadata)
+
+            # Re-enqueue the node for full processing
+            is_ip = _is_valid_ip(node_id)
+            eligible_providers = self._get_eligible_providers(node_id, is_ip, False)
+            for provider in eligible_providers:
+                provider_name = provider.get_name()
+                priority = self._get_priority(provider_name)
+                # Use current depth of the large entity if available, else 0
+                depth = 0
+                if self.graph.graph.has_node(large_entity_id):
+                    le_attrs = self.graph.graph.nodes[large_entity_id].get('attributes', [])
+                    depth_attr = next((a for a in le_attrs if a['name'] == 'discovery_depth'), None)
+                    if depth_attr:
+                        depth = depth_attr['value']
+                self.task_queue.put((time.time(), priority, (provider_name, node_id, depth)))
+                self.total_tasks_ever_enqueued += 1
+            return True
+
+        return False
+
     def _process_provider_result_unified(self, target: str, provider: BaseProvider,
                                          provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
         """
         Process a unified ProviderResult object to update the graph.
-        Handles large entity creation while ensuring all underlying nodes and edges are
-        added to the graph data model for a complete dataset.
+        This version dynamically re-routes edges to a large entity container.
         """
         provider_name = provider.get_name()
         discovered_targets = set()
+        large_entity_id = ""
         large_entity_members = set()
 
         if self._is_stop_requested():
             return discovered_targets, False
 
-        # Check if a large entity should be created based on the count of domain/IP relationships
-        eligible_relationship_count = sum(
+        eligible_rel_count = sum(
             1 for rel in provider_result.relationships if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)
         )
-        is_large_entity = eligible_relationship_count > self.config.large_entity_threshold
+        is_large_entity = eligible_rel_count > self.config.large_entity_threshold
 
         if is_large_entity:
-            # Create the large entity node and get the set of its members
-            large_entity_members = self._create_large_entity_from_provider_result(
+            large_entity_id, large_entity_members = self._create_large_entity_from_result(
                 target, provider_name, provider_result, current_depth
             )
 
-        # Process ALL relationships to build the complete underlying data model
        for i, relationship in enumerate(provider_result.relationships):
             if i % 5 == 0 and self._is_stop_requested():
                 break
 
-            source_node = relationship.source_node
-            target_node = relationship.target_node
+            source_node_id = relationship.source_node
+            target_node_id = relationship.target_node
 
-            # Determine node types
-            source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
+            # Determine visual source and target, substituting with large entity ID if necessary
+            visual_source = large_entity_id if source_node_id in large_entity_members else source_node_id
+            visual_target = large_entity_id if target_node_id in large_entity_members else target_node_id
+
+            # Prevent self-loops on the large entity node
+            if visual_source == visual_target:
+                continue
+
+            # Determine node types for the actual nodes
+            source_type = NodeType.IP if _is_valid_ip(source_node_id) else NodeType.DOMAIN
             if provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
                 target_type = NodeType.ISP
             elif provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
                 target_type = NodeType.CA
             elif provider_name == 'correlation':
                 target_type = NodeType.CORRELATION_OBJECT
-            elif _is_valid_ip(target_node):
+            elif _is_valid_ip(target_node_id):
                 target_type = NodeType.IP
             else:
                 target_type = NodeType.DOMAIN
 
             max_depth_reached = current_depth >= self.max_depth
 
-            # Add all nodes and edges to the graph's data model.
-            # The frontend will handle the visual re-routing for large entity members.
-            self.graph.add_node(source_node, source_type)
-            self.graph.add_node(target_node, target_type, metadata={'max_depth_reached': max_depth_reached})
+            # Add actual nodes to the graph (they might be hidden by the UI)
+            self.graph.add_node(source_node_id, source_type)
+            self.graph.add_node(target_node_id, target_type, metadata={'max_depth_reached': max_depth_reached})
 
+            # Add the visual edge to the graph
             self.graph.add_edge(
-                source_node, target_node,
+                visual_source, visual_target,
                 relationship.relationship_type,
                 relationship.confidence,
                 provider_name,
                 relationship.raw_data
             )
 
-            # Add all discovered domains/IPs to be considered for further processing
-            if (_is_valid_domain(target_node) or _is_valid_ip(target_node)) and not max_depth_reached:
-                discovered_targets.add(target_node)
+            if (_is_valid_domain(target_node_id) or _is_valid_ip(target_node_id)) and not max_depth_reached:
+                if target_node_id not in large_entity_members:
+                    discovered_targets.add(target_node_id)
+
+        if large_entity_members:
+            self.logger.logger.info(f"Enqueuing DNS and Correlation for {len(large_entity_members)} members of {large_entity_id}")
+            for member in large_entity_members:
+                for provider_name_to_run in ['dns', 'correlation']:
+                    p_instance = next((p for p in self.providers if p.get_name() == provider_name_to_run), None)
+                    if p_instance and p_instance.get_eligibility().get('domains' if _is_valid_domain(member) else 'ips'):
+                        priority = self._get_priority(provider_name_to_run)
+                        self.task_queue.put((time.time(), priority, (provider_name_to_run, member, current_depth)))
+                        self.total_tasks_ever_enqueued += 1
 
-        # Process all attributes and add them to the corresponding nodes
         attributes_by_node = defaultdict(list)
         for attribute in provider_result.attributes:
             attr_dict = {
@@ -909,65 +991,6 @@ class Scanner:
         return discovered_targets, is_large_entity
 
-    def _create_large_entity_from_provider_result(self, source: str, provider_name: str,
-                                                  provider_result: ProviderResult, current_depth: int) -> Set[str]:
-        """
-        Create a large entity node and connect it to the source and any shared
-        non-member nodes like CAs or ISPs.
-        """
-        entity_id = f"large_entity_{provider_name}_{hash(source) & 0x7FFFFFFF}"
-        members = {
-            rel.target_node for rel in provider_result.relationships
-            if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)
-        }
-
-        if not members:
-            return set()
-
-        first_member = next(iter(members))
-        node_type = 'ip' if _is_valid_ip(first_member) else 'domain'
-
-        attributes_dict = {
-            'count': len(members),
-            'nodes': list(members),
-            'node_type': node_type,
-            'source_provider': provider_name,
-            'discovery_depth': current_depth,
-            'threshold_exceeded': self.config.large_entity_threshold,
-        }
-
-        attributes_list = [
-            {
-                "name": key, "value": value, "type": "large_entity_info",
-                "provider": provider_name, "confidence": 0.9, "metadata": {}
-            } for key, value in attributes_dict.items()
-        ]
-
-        description = f'Large entity created due to {len(members)} relationships from {provider_name}'
-        self.graph.add_node(entity_id, NodeType.LARGE_ENTITY, attributes=attributes_list, description=description)
-
-        # Add a representative edge from the source to the large entity
-        if provider_result.relationships:
-            rep_rel = provider_result.relationships[0]
-            self.graph.add_edge(source, entity_id, rep_rel.relationship_type, 0.9, provider_name,
-                                {'large_entity_info': f'Contains {len(members)} {node_type}s'})
-
-        # Create edges from the large entity to shared non-member nodes (e.g., CAs, ISPs)
-        processed_targets = set()
-        for rel in provider_result.relationships:
-            if rel.source_node in members and rel.target_node not in members:
-                if rel.target_node not in processed_targets:
-                    self.graph.add_edge(
-                        entity_id, rel.target_node, rel.relationship_type, rel.confidence,
-                        rel.provider, rel.raw_data
-                    )
-                    processed_targets.add(rel.target_node)
-
-        self.logger.logger.warning(f"Large entity created: {entity_id} contains {len(members)} targets from {provider_name}")
-        return members
-
     def stop_scan(self) -> bool:
         """Request immediate scan termination with proper cleanup."""
         try:
@@ -995,127 +1018,6 @@ class Scanner:
             traceback.print_exc()
             return False
 
-    def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool:
-        """
-        Extracts a node from a large entity, restores ALL of its original connections,
-        and re-queues it for scanning.
-        """
-        if not self.graph.graph.has_node(large_entity_id):
-            return False
-
-        # Extract the node from the large entity's internal list
-        success = self.graph.extract_node_from_large_entity(large_entity_id, node_id_to_extract)
-        if not success:
-            return False
-
-        # Restore all incoming and outgoing edges for the extracted node.
-        # These edges already exist in the graph data model; this ensures they are "activated"
-        # for the frontend.
-        for u, v, data in self.graph.graph.in_edges(node_id_to_extract, data=True):
-            self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'),
-                                data.get('source_provider'), data.get('raw_data'))
-        for u, v, data in self.graph.graph.out_edges(node_id_to_extract, data=True):
-            self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'),
-                                data.get('source_provider'), data.get('raw_data'))
-
-        # Re-queue the extracted node for further scanning if it is a domain or IP
-        is_ip = _is_valid_ip(node_id_to_extract)
-        is_domain = _is_valid_domain(node_id_to_extract)
-        if is_domain or is_ip:
-            large_entity_attributes = self.graph.graph.nodes[large_entity_id].get('attributes', [])
-            discovery_depth_attr = next((attr for attr in large_entity_attributes if attr.get('name') == 'discovery_depth'), None)
-            current_depth = discovery_depth_attr['value'] if discovery_depth_attr else 0
-
-            eligible_providers = self._get_eligible_providers(node_id_to_extract, is_ip, False)
-            for provider in eligible_providers:
-                # Exclude DNS and correlation providers from re-processing
-                if provider.get_name() not in ['dns', 'correlation']:
-                    provider_name = provider.get_name()
-                    priority = self._get_priority(provider_name)
-                    self.task_queue.put((time.time(), priority, (provider_name, node_id_to_extract, current_depth)))
-                    self.total_tasks_ever_enqueued += 1
-
-            if self.status != ScanStatus.RUNNING:
-                self.status = ScanStatus.RUNNING
-                self._update_session_state()
-                if not self.scan_thread or not self.scan_thread.is_alive():
-                    self.scan_thread = threading.Thread(
-                        target=self._execute_scan,
-                        args=(self.current_target, self.max_depth),
-                        daemon=True
-                    )
-                    self.scan_thread.start()
-        else:
-            self.logger.logger.info(f"Extracted non-scannable node {node_id_to_extract} of type {self.graph.graph.nodes[node_id_to_extract].get('type', 'unknown')}")
-
-        return True
-
-    def _determine_extracted_node_type(self, node_id: str, large_entity_id: str) -> NodeType:
-        """
-        FIXED: Determine the correct node type for a node being extracted from a large entity.
-        Uses multiple strategies to ensure accurate type detection.
-        """
-        from utils.helpers import _is_valid_ip, _is_valid_domain
-
-        # Strategy 1: Check if node already exists in graph with a type
-        if self.graph.has_node(node_id):
-            existing_type = self.graph.nodes[node_id].get('type')
-            if existing_type:
-                try:
-                    return NodeType(existing_type)
-                except ValueError:
-                    pass
-
-        # Strategy 2: Look for existing relationships to this node to infer type
-        for source, target, edge_data in self.graph.edges(data=True):
-            if target == node_id:
-                rel_type = edge_data.get('relationship_type', '')
-                provider = edge_data.get('source_provider', '')
-
-                # CA nodes from certificate issuer relationships
-                if provider == 'crtsh' and rel_type == 'crtsh_cert_issuer':
-                    return NodeType.CA
-
-                # ISP nodes from Shodan
-                if provider == 'shodan' and rel_type == 'shodan_isp':
-                    return NodeType.ISP
-
-                # Correlation objects
-                if rel_type.startswith('corr_'):
-                    return NodeType.CORRELATION_OBJECT
-
-            if source == node_id:
-                rel_type = edge_data.get('relationship_type', '')
-                provider = edge_data.get('source_provider', '')
-
-                # Source nodes in cert issuer relationships are CAs
-                if provider == 'crtsh' and rel_type == 'crtsh_cert_issuer':
-                    return NodeType.CA
-
-        # Strategy 3: Format-based detection (fallback)
-        if _is_valid_ip(node_id):
-            return NodeType.IP
-        elif _is_valid_domain(node_id):
-            return NodeType.DOMAIN
-
-        # Strategy 4: Check large entity context
-        if self.graph.has_node(large_entity_id):
-            large_entity_data = self.graph.nodes[large_entity_id]
-            attributes = large_entity_data.get('attributes', [])
-            node_type_attr = next((attr for attr in attributes if attr.get('name') == 'node_type'), None)
-            if node_type_attr:
-                entity_node_type = node_type_attr.get('value', 'domain')
-                if entity_node_type == 'ip':
-                    return NodeType.IP
-                else:
-                    return NodeType.DOMAIN
-
-        # Final fallback
-        return NodeType.DOMAIN
-
     def _update_session_state(self) -> None:
         """
         Update the scanner state in Redis for GUI updates.
@@ -1186,8 +1088,19 @@ class Scanner:
         eligible = []
         target_key = 'ips' if is_ip else 'domains'
 
+        # Check if the target is part of a large entity
+        is_in_large_entity = False
+        if self.graph.graph.has_node(target):
+            metadata = self.graph.graph.nodes[target].get('metadata', {})
+            if 'large_entity_id' in metadata:
+                is_in_large_entity = True
+
         for provider in self.providers:
             try:
+                # If in large entity, only allow dns and correlation providers
+                if is_in_large_entity and provider.get_name() not in ['dns', 'correlation']:
+                    continue
+
                 # Check if provider supports this target type
                 if not provider.get_eligibility().get(target_key, False):
                     continue
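
Taken together, the Scanner changes replace explicit member extraction with metadata tagging. A minimal sketch of the lifecycle, assuming a networkx-backed graph like the one GraphManager wraps (identifiers here are illustrative, not from the codebase):

```python
import networkx as nx

G = nx.DiGraph()
LE_ID = "le_crtsh_example.com"

# 1. Member nodes are real graph nodes, tagged so the frontend hides them
#    and _get_eligible_providers restricts them to dns/correlation.
G.add_node("sub1.example.com", metadata={"large_entity_id": LE_ID})

# 2. Edges touching a member are drawn against the container instead,
#    keeping the rendered graph small while the data model stays complete.
def visual_endpoint(node_id: str, members: set, le_id: str) -> str:
    return le_id if node_id in members else node_id

assert visual_endpoint("sub1.example.com", {"sub1.example.com"}, LE_ID) == LE_ID

# 3. Extraction is now just deleting the tag (plus re-enqueueing the node).
meta = G.nodes["sub1.example.com"]["metadata"]
if meta.get("large_entity_id") == LE_ID:
    del meta["large_entity_id"]  # node renders and scans normally again
```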

View File

@@ -2,15 +2,37 @@
 import json
 import re
+import psycopg2
 from pathlib import Path
-from typing import List, Dict, Any, Set
+from typing import List, Dict, Any, Set, Optional
 from urllib.parse import quote
 from datetime import datetime, timezone
 import requests
+from psycopg2 import pool
 
 from .base_provider import BaseProvider
 from core.provider_result import ProviderResult
 from utils.helpers import _is_valid_domain
+from core.logger import get_forensic_logger
+
+# --- Global Instance for PostgreSQL Connection Pool ---
+# This pool will be created once per worker process and is not part of the
+# CrtShProvider instance, thus avoiding pickling errors.
+db_pool = None
+try:
+    db_pool = psycopg2.pool.SimpleConnectionPool(
+        1, 5,
+        host='crt.sh',
+        port=5432,
+        user='guest',
+        dbname='certwatch',
+        sslmode='prefer',
+        connect_timeout=60
+    )
+    # Use a generic logger here as this is at the module level
+    get_forensic_logger().logger.info("crt.sh: Global PostgreSQL connection pool created successfully.")
+except Exception as e:
+    get_forensic_logger().logger.warning(f"crt.sh: Failed to create global DB connection pool: {e}. Will fall back to HTTP API.")
 
 class CrtShProvider(BaseProvider):
@@ -121,7 +143,7 @@ class CrtShProvider(BaseProvider):
         else:  # "stale" or "not_found"
             # Query the API for the latest certificates
-            new_raw_certs = self._query_crtsh_api(domain)
+            new_raw_certs = self._query_crtsh(domain)
 
             if self._stop_event and self._stop_event.is_set():
                 return ProviderResult()
@@ -152,8 +174,8 @@ class CrtShProvider(BaseProvider):
                 # Save the new result and the raw data to the cache
                 self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain)
 
-        except requests.exceptions.RequestException as e:
-            self.logger.logger.error(f"API query failed for {domain}: {e}")
+        except (requests.exceptions.RequestException, psycopg2.Error) as e:
+            self.logger.logger.error(f"Upstream query failed for {domain}: {e}")
             if cache_status != "not_found":
                 result = self._load_from_cache(cache_file)
                 self.logger.logger.warning(f"Using stale cache for {domain} due to API failure.")
@@ -256,6 +278,58 @@ class CrtShProvider(BaseProvider):
         except Exception as e:
             self.logger.logger.warning(f"Failed to save cache file for {domain}: {e}")
 
+    def _query_crtsh(self, domain: str) -> List[Dict[str, Any]]:
+        """Query crt.sh, trying the database first and falling back to the API."""
+        global db_pool
+        if db_pool:
+            try:
+                self.logger.logger.info(f"crt.sh: Attempting DB query for {domain}")
+                return self._query_crtsh_db(domain)
+            except psycopg2.Error as e:
+                self.logger.logger.warning(f"crt.sh: DB query failed for {domain}: {e}. Falling back to HTTP API.")
+                return self._query_crtsh_api(domain)
+        else:
+            self.logger.logger.info(f"crt.sh: No DB connection pool. Using HTTP API for {domain}")
+            return self._query_crtsh_api(domain)
+
+    def _query_crtsh_db(self, domain: str) -> List[Dict[str, Any]]:
+        """Query crt.sh database for raw certificate data."""
+        global db_pool
+        conn = db_pool.getconn()
+        try:
+            with conn.cursor() as cursor:
+                query = """
+                    SELECT
+                        c.id,
+                        x509_serialnumber(c.certificate) as serial_number,
+                        x509_notbefore(c.certificate) as not_before,
+                        x509_notafter(c.certificate) as not_after,
+                        c.issuer_ca_id,
+                        ca.name as issuer_name,
+                        x509_commonname(c.certificate) as common_name,
+                        identities(c.certificate)::text as name_value
+                    FROM certificate c
+                    LEFT JOIN ca ON c.issuer_ca_id = ca.id
+                    WHERE identities(c.certificate) @@ plainto_tsquery(%s)
+                    ORDER BY c.id DESC
+                    LIMIT 5000;
+                """
+                cursor.execute(query, (domain,))
+                results = []
+                columns = [desc[0] for desc in cursor.description]
+                for row in cursor.fetchall():
+                    row_dict = dict(zip(columns, row))
+                    if row_dict.get('not_before'):
+                        row_dict['not_before'] = row_dict['not_before'].isoformat()
+                    if row_dict.get('not_after'):
+                        row_dict['not_after'] = row_dict['not_after'].isoformat()
+                    results.append(row_dict)
+                self.logger.logger.info(f"crt.sh: DB query for {domain} returned {len(results)} records.")
+                return results
+        finally:
+            db_pool.putconn(conn)
+
     def _query_crtsh_api(self, domain: str) -> List[Dict[str, Any]]:
         """Query crt.sh API for raw certificate data."""
         url = f"{self.base_url}?q={quote(domain)}&output=json"
@@ -286,6 +360,17 @@ class CrtShProvider(BaseProvider):
             self.logger.logger.info(f"CrtSh processing cancelled before processing for domain: {query_domain}")
             return result
 
+        incompleteness_warning = self._check_for_incomplete_data(query_domain, certificates)
+        if incompleteness_warning:
+            result.add_attribute(
+                target_node=query_domain,
+                name="crtsh_data_warning",
+                value=incompleteness_warning,
+                attr_type='metadata',
+                provider=self.name,
+                confidence=1.0
+            )
+
         all_discovered_domains = set()
         processed_issuers = set()
@@ -457,6 +542,8 @@ class CrtShProvider(BaseProvider):
             raise ValueError("Empty date string")
 
         try:
+            if isinstance(date_string, datetime):
+                return date_string.replace(tzinfo=timezone.utc)
             if date_string.endswith('Z'):
                 return datetime.fromisoformat(date_string[:-1]).replace(tzinfo=timezone.utc)
             elif '+' in date_string or date_string.endswith('UTC'):
@@ -578,3 +665,29 @@ class CrtShProvider(BaseProvider):
             return 'parent_domain'
         else:
             return 'related_domain'
+
+    def _check_for_incomplete_data(self, domain: str, certificates: List[Dict[str, Any]]) -> Optional[str]:
+        """
+        Analyzes the certificate list to heuristically detect if the data from crt.sh is incomplete.
+        """
+        cert_count = len(certificates)
+
+        # Heuristic 1: Check if the number of certs hits a known hard limit.
+        if cert_count >= 10000:
+            return f"Result likely truncated; received {cert_count} certificates, which may be the maximum limit."
+
+        # Heuristic 2: Check if all returned certificates are old.
+        if cert_count > 1000:  # Only apply this for a reasonable number of certs
+            latest_expiry = None
+            for cert in certificates:
+                try:
+                    not_after = self._parse_certificate_date(cert.get('not_after'))
+                    if latest_expiry is None or not_after > latest_expiry:
+                        latest_expiry = not_after
+                except (ValueError, TypeError):
+                    continue
+            if latest_expiry and (datetime.now(timezone.utc) - latest_expiry).days > 365:
+                return f"Incomplete data suspected: The latest certificate expired more than a year ago ({latest_expiry.strftime('%Y-%m-%d')})."
+
+        return None
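
The essential pattern behind `_query_crtsh_db` is borrow-query-return on a shared pool, with the HTTP API as fallback when no pool exists. A condensed sketch under the same assumptions as the code above (public guest access to crt.sh's PostgreSQL mirror; query trimmed to one column for brevity):

```python
import psycopg2
from psycopg2 import pool

try:
    db_pool = pool.SimpleConnectionPool(1, 5, host="crt.sh", port=5432,
                                        user="guest", dbname="certwatch")
except Exception:
    db_pool = None  # callers fall back to the HTTP API

def fetch_cert_ids(domain: str):
    if db_pool is None:
        raise RuntimeError("no DB pool; use the HTTP API fallback instead")
    conn = db_pool.getconn()           # borrow a connection
    try:
        with conn.cursor() as cur:
            # identities()/plainto_tsquery are crt.sh-side SQL helpers,
            # as used in the full provider query above.
            cur.execute(
                "SELECT c.id FROM certificate c "
                "WHERE identities(c.certificate) @@ plainto_tsquery(%s) "
                "LIMIT 10;",
                (domain,),
            )
            return [row[0] for row in cur.fetchall()]
    finally:
        db_pool.putconn(conn)          # always return it to the pool
```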

View File

@@ -8,3 +8,4 @@ dnspython
 gunicorn
 redis
 python-dotenv
+psycopg2-binary

View File

@@ -1,3 +1,4 @@
+// dnsrecon-reduced/static/js/graph.js
 /**
  * Graph visualization module for DNSRecon
  * Handles network graph rendering using vis.js with proper large entity node hiding
@@ -362,100 +363,60 @@ class GraphManager {
         }
 
         try {
-            // Initialize if not already done
             if (!this.isInitialized) {
                 this.initialize();
             }
 
             this.initialTargetIds = new Set(graphData.initial_targets || []);
 
-            // Check if we have actual data to display
             const hasData = graphData.nodes.length > 0 || graphData.edges.length > 0;
 
-            // Handle placeholder visibility
             const placeholder = this.container.querySelector('.graph-placeholder');
             if (placeholder) {
-                if (hasData) {
-                    placeholder.style.display = 'none';
-                } else {
-                    placeholder.style.display = 'flex';
-                    // Early return if no data to process
-                    return;
-                }
-            }
+                placeholder.style.display = hasData ? 'none' : 'flex';
+            }
+
+            if (!hasData) {
+                this.nodes.clear();
+                this.edges.clear();
+                return;
+            }
 
-            this.largeEntityMembers.clear();
-            const largeEntityMap = new Map();
-
-            graphData.nodes.forEach(node => {
-                if (node.type === 'large_entity' && node.attributes) {
-                    const nodesAttribute = this.findAttributeByName(node.attributes, 'nodes');
-                    if (nodesAttribute && Array.isArray(nodesAttribute.value)) {
-                        nodesAttribute.value.forEach(nodeId => {
-                            largeEntityMap.set(nodeId, node.id);
-                            this.largeEntityMembers.add(nodeId);
-                        });
-                    }
-                }
-            });
-
-            const filteredNodes = graphData.nodes.filter(node => {
-                return !this.largeEntityMembers.has(node.id) || node.type === 'large_entity';
-            });
-
-            console.log(`Filtered ${graphData.nodes.length - filteredNodes.length} large entity member nodes from visualization`);
-
-            // Process nodes with proper certificate coloring
-            const processedNodes = filteredNodes.map(node => {
+            const nodeMap = new Map(graphData.nodes.map(node => [node.id, node]));
+
+            // Filter out hidden nodes before processing for rendering
+            const filteredNodes = graphData.nodes.filter(node =>
+                !(node.metadata && node.metadata.large_entity_id)
+            );
+
+            const processedNodes = graphData.nodes.map(node => {
                 const processed = this.processNode(node);
-                // Apply certificate-based coloring here in frontend
-                if (node.type === 'domain' && Array.isArray(node.attributes)) {
-                    const certInfo = this.analyzeCertificateInfo(node.attributes);
-                    if (certInfo.hasExpiredOnly) {
-                        // Red for domains with only expired/invalid certificates
-                        processed.color = { background: '#ff6b6b', border: '#cc5555' };
-                    } else if (!certInfo.hasCertificates) {
-                        // Grey for domains with no certificates
-                        processed.color = { background: '#c7c7c7', border: '#999999' };
-                    }
-                    // Valid certificates use default green (handled by processNode)
-                }
+                if (node.metadata && node.metadata.large_entity_id) {
+                    processed.hidden = true;
+                }
                 return processed;
             });
 
-            const mergedEdges = {};
-            graphData.edges.forEach(edge => {
-                const fromNode = largeEntityMap.has(edge.from) ? largeEntityMap.get(edge.from) : edge.from;
-                const toNode = largeEntityMap.has(edge.to) ? largeEntityMap.get(edge.to) : edge.to;
-                const mergeKey = `${fromNode}-${toNode}-${edge.label}`;
-                if (!mergedEdges[mergeKey]) {
-                    mergedEdges[mergeKey] = {
-                        ...edge,
-                        from: fromNode,
-                        to: toNode,
-                        count: 0,
-                        confidence_score: 0
-                    };
-                }
-                mergedEdges[mergeKey].count++;
-                if (edge.confidence_score > mergedEdges[mergeKey].confidence_score) {
-                    mergedEdges[mergeKey].confidence_score = edge.confidence_score;
-                }
-            });
-
-            const processedEdges = Object.values(mergedEdges).map(edge => {
-                const processed = this.processEdge(edge);
-                if (edge.count > 1) {
-                    processed.label = `${edge.label} (${edge.count})`;
-                }
-                return processed;
-            });
-
-            // Update datasets with animation
+            const processedEdges = graphData.edges.map(edge => {
+                let fromNode = nodeMap.get(edge.from);
+                let toNode = nodeMap.get(edge.to);
+                let fromId = edge.from;
+                let toId = edge.to;
+
+                if (fromNode && fromNode.metadata && fromNode.metadata.large_entity_id) {
+                    fromId = fromNode.metadata.large_entity_id;
+                }
+                if (toNode && toNode.metadata && toNode.metadata.large_entity_id) {
+                    toId = toNode.metadata.large_entity_id;
+                }
+
+                // Avoid self-referencing edges from re-routing
+                if (fromId === toId) {
+                    return null;
+                }
+
+                const reRoutedEdge = { ...edge, from: fromId, to: toId };
+                return this.processEdge(reRoutedEdge);
+            }).filter(Boolean); // Remove nulls from self-referencing edges
 
             const existingNodeIds = this.nodes.getIds();
             const existingEdgeIds = this.edges.getIds();
@@ -472,13 +433,10 @@ class GraphManager {
                 setTimeout(() => this.highlightNewElements(newNodes, newEdges), 100);
             }
 
-            if (processedNodes.length <= 10 || existingNodeIds.length === 0) {
+            if (this.nodes.length <= 10 || existingNodeIds.length === 0) {
                 setTimeout(() => this.fitView(), 800);
             }
 
-            console.log(`Graph updated: ${processedNodes.length} nodes, ${processedEdges.length} edges (${newNodes.length} new nodes, ${newEdges.length} new edges)`);
-            console.log(`Large entity members hidden: ${this.largeEntityMembers.size}`);
-
         } catch (error) {
             console.error('Failed to update graph:', error);
             this.showError('Failed to update visualization');
@@ -606,7 +564,7 @@ class GraphManager {
     processEdge(edge) {
         const confidence = edge.confidence_score || 0;
         const processedEdge = {
-            id: `${edge.from}-${edge.to}`,
+            id: `${edge.from}-${edge.to}-${edge.label}`,
             from: edge.from,
             to: edge.to,
             label: this.formatEdgeLabel(edge.label, confidence),
@@ -1053,7 +1011,7 @@ class GraphManager {
         this.nodes.clear();
         this.edges.clear();
         this.history = [];
-        this.largeEntityMembers.clear(); // Clear large entity tracking
+        this.largeEntityMembers.clear();
         this.initialTargetIds.clear();
 
         // Show placeholder
@@ -1211,7 +1169,6 @@ class GraphManager {
         const basicStats = {
             nodeCount: this.nodes.length,
             edgeCount: this.edges.length,
-            largeEntityMembersHidden: this.largeEntityMembers.size
         };
 
         // Add forensic statistics
@@ -1608,14 +1565,43 @@ class GraphManager {
     }
 
     /**
-     * Unhide all hidden nodes
+     * FIXED: Unhide all hidden nodes, excluding large entity members and disconnected nodes.
+     * This prevents orphaned large entity members from appearing as free-floating nodes.
      */
     unhideAll() {
-        const allNodes = this.nodes.get({
-            filter: (node) => node.hidden === true
+        const allHiddenNodes = this.nodes.get({
+            filter: (node) => {
+                // Skip nodes that are part of a large entity
+                if (node.metadata && node.metadata.large_entity_id) {
+                    return false;
+                }
+                // Skip nodes that are not hidden
+                if (node.hidden !== true) {
+                    return false;
+                }
+                // Skip nodes that have no edges (would appear disconnected)
+                const nodeId = node.id;
+                const hasIncomingEdges = this.edges.get().some(edge => edge.to === nodeId && !edge.hidden);
+                const hasOutgoingEdges = this.edges.get().some(edge => edge.from === nodeId && !edge.hidden);
+                if (!hasIncomingEdges && !hasOutgoingEdges) {
+                    console.log(`Skipping disconnected node ${nodeId} from unhide`);
+                    return false;
+                }
+                return true;
+            }
         });
-        const updates = allNodes.map(node => ({ id: node.id, hidden: false }));
-        this.nodes.update(updates);
+
+        if (allHiddenNodes.length > 0) {
+            console.log(`Unhiding ${allHiddenNodes.length} nodes with valid connections`);
+            const updates = allHiddenNodes.map(node => ({ id: node.id, hidden: false }));
+            this.nodes.update(updates);
+        } else {
+            console.log('No eligible nodes to unhide');
+        }
     }
 }

View File

@@ -1397,11 +1397,32 @@ class DNSReconApp {
     }
 
     /**
-     * UPDATED: Generate details for standard nodes with organized attribute grouping
+     * UPDATED: Generate details for standard nodes with organized attribute grouping and data warnings
     */
     generateStandardNodeDetails(node) {
         let html = '';
 
+        // Check for and display a crt.sh data warning if it exists
+        const crtshWarningAttr = this.findAttributeByName(node.attributes, 'crtsh_data_warning');
+        if (crtshWarningAttr) {
+            html += `
+                <div class="modal-section" style="border-left: 3px solid #ff9900; background: rgba(255, 153, 0, 0.05);">
+                    <details open>
+                        <summary style="color: #ff9900;">
+                            <span>Data Integrity Warning</span>
+                        </summary>
+                        <div class="modal-section-content">
+                            <p class="placeholder-subtext" style="color: #e0e0e0; font-size: 0.8rem; line-height: 1.5;">
+                                ${this.escapeHtml(crtshWarningAttr.value)}
+                                <br><br>
+                                This can occur for very large domains (e.g., google.com) where crt.sh may return a limited subset of all available certificates. As a result, the certificate status may not be fully representative.
+                            </p>
+                        </div>
+                    </details>
+                </div>
+            `;
+        }
+
         // Relationships sections
         html += this.generateRelationshipsSection(node);
@@ -1419,6 +1440,19 @@ class DNSReconApp {
         return html;
     }
 
+    /**
+     * Helper method to find an attribute by name in the standardized attributes list
+     * @param {Array} attributes - List of StandardAttribute objects
+     * @param {string} name - Attribute name to find
+     * @returns {Object|null} The attribute object if found, null otherwise
+     */
+    findAttributeByName(attributes, name) {
+        if (!Array.isArray(attributes)) {
+            return null;
+        }
+        return attributes.find(attr => attr.name === name) || null;
+    }
+
     generateOrganizedAttributesSection(attributes, nodeType) {
         if (!Array.isArray(attributes) || attributes.length === 0) {
             return '';
@@ -1997,8 +2031,6 @@ class DNSReconApp {
                 if (response.success) {
                     this.showSuccess(response.message);
 
-                    this.hideModal();
-
                     // If the scanner was idle, it's now running. Start polling to see the new node appear.
                     if (this.scanStatus === 'idle') {
                         this.startPolling(1000);