attempt fix large entity

This commit is contained in:
overcuriousity 2025-09-18 23:22:49 +02:00
parent 95cebbf935
commit 1558731c1c

View File

@ -825,81 +825,33 @@ class Scanner:
return None return None
def _process_provider_result_unified(self, target: str, provider: BaseProvider, def _process_provider_result_unified(self, target: str, provider: BaseProvider,
provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]: provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
""" """
Process a unified ProviderResult object to update the graph. Process a unified ProviderResult object to update the graph.
FIXED: Ensure CA and ISP relationships are created even when large entities are formed. Handles large entity creation while ensuring all underlying nodes and edges are
added to the graph data model for a complete dataset.
""" """
provider_name = provider.get_name() provider_name = provider.get_name()
discovered_targets = set() discovered_targets = set()
large_entity_members = set()
if self._is_stop_requested(): if self._is_stop_requested():
return discovered_targets, False return discovered_targets, False
# Check if this should be a large entity (only counting domain/IP relationships) # Check if a large entity should be created based on the count of domain/IP relationships
eligible_relationship_count = 0 eligible_relationship_count = sum(
for rel in provider_result.relationships: 1 for rel in provider_result.relationships if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)
# Only count relationships that would go into large entities )
if provider_name == 'crtsh' and rel.relationship_type == 'crtsh_cert_issuer':
continue # Don't count CA relationships is_large_entity = eligible_relationship_count > self.config.large_entity_threshold
if provider_name == 'shodan' and rel.relationship_type == 'shodan_isp':
continue # Don't count ISP relationships if is_large_entity:
if rel.relationship_type.startswith('corr_'): # Create the large entity node and get the set of its members
continue # Don't count correlation relationships large_entity_members = self._create_large_entity_from_provider_result(
target, provider_name, provider_result, current_depth
# Only count domain/IP targets )
if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node):
eligible_relationship_count += 1 # Process ALL relationships to build the complete underlying data model
if eligible_relationship_count > self.config.large_entity_threshold:
# Create large entity but ALSO process special relationships
members = self._create_large_entity_from_provider_result(target, provider_name, provider_result, current_depth)
# FIXED: Still process CA, ISP, and correlation relationships directly on the graph
for relationship in provider_result.relationships:
if self._is_stop_requested():
break
source_node = relationship.source_node
target_node = relationship.target_node
# Process special relationship types that should appear directly on graph
should_create_direct_relationship = False
target_type = None
if provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
target_type = NodeType.CA
should_create_direct_relationship = True
elif provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
target_type = NodeType.ISP
should_create_direct_relationship = True
elif relationship.relationship_type.startswith('corr_'):
target_type = NodeType.CORRELATION_OBJECT
should_create_direct_relationship = True
if should_create_direct_relationship:
# Create source and target nodes
source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
self.graph.add_node(source_node, source_type)
self.graph.add_node(target_node, target_type)
# Add the relationship edge
self.graph.add_edge(
source_node, target_node,
relationship.relationship_type,
relationship.confidence,
provider_name,
relationship.raw_data
)
# Add to discovered targets if it's a valid target for further processing
max_depth_reached = current_depth >= self.max_depth
if not max_depth_reached and (_is_valid_domain(target_node) or _is_valid_ip(target_node)):
discovered_targets.add(target_node)
return members, True
# Normal processing (existing logic) when not creating large entity
for i, relationship in enumerate(provider_result.relationships): for i, relationship in enumerate(provider_result.relationships):
if i % 5 == 0 and self._is_stop_requested(): if i % 5 == 0 and self._is_stop_requested():
break break
@ -907,10 +859,8 @@ class Scanner:
source_node = relationship.source_node source_node = relationship.source_node
target_node = relationship.target_node target_node = relationship.target_node
# Determine source node type # Determine node types
source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
# Determine target node type based on provider and relationship
if provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp': if provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
target_type = NodeType.ISP target_type = NodeType.ISP
elif provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer': elif provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
@ -921,132 +871,102 @@ class Scanner:
target_type = NodeType.IP target_type = NodeType.IP
else: else:
target_type = NodeType.DOMAIN target_type = NodeType.DOMAIN
# Add max_depth_reached flag
max_depth_reached = current_depth >= self.max_depth max_depth_reached = current_depth >= self.max_depth
# Create or update nodes with proper types # Add all nodes and edges to the graph's data model.
# The frontend will handle the visual re-routing for large entity members.
self.graph.add_node(source_node, source_type) self.graph.add_node(source_node, source_type)
self.graph.add_node(target_node, target_type, metadata={'max_depth_reached': max_depth_reached}) self.graph.add_node(target_node, target_type, metadata={'max_depth_reached': max_depth_reached})
self.graph.add_edge(
# Add the relationship edge
if self.graph.add_edge(
source_node, target_node, source_node, target_node,
relationship.relationship_type, relationship.relationship_type,
relationship.confidence, relationship.confidence,
provider_name, provider_name,
relationship.raw_data relationship.raw_data
): )
pass # Edge was successfully added
# Add target to discovered nodes for further processing # Add all discovered domains/IPs to be considered for further processing
if (_is_valid_domain(target_node) or _is_valid_ip(target_node)) and not max_depth_reached: if (_is_valid_domain(target_node) or _is_valid_ip(target_node)) and not max_depth_reached:
discovered_targets.add(target_node) discovered_targets.add(target_node)
# Process all attributes (existing logic unchanged) # Process all attributes and add them to the corresponding nodes
attributes_by_node = defaultdict(list) attributes_by_node = defaultdict(list)
for attribute in provider_result.attributes: for attribute in provider_result.attributes:
attr_dict = { attr_dict = {
"name": attribute.name, "name": attribute.name, "value": attribute.value, "type": attribute.type,
"value": attribute.value, "provider": attribute.provider, "confidence": attribute.confidence, "metadata": attribute.metadata
"type": attribute.type,
"provider": attribute.provider,
"confidence": attribute.confidence,
"metadata": attribute.metadata
} }
attributes_by_node[attribute.target_node].append(attr_dict) attributes_by_node[attribute.target_node].append(attr_dict)
# Add attributes to existing nodes OR create new nodes if they don't exist
for node_id, node_attributes_list in attributes_by_node.items(): for node_id, node_attributes_list in attributes_by_node.items():
if not self.graph.graph.has_node(node_id): if not self.graph.graph.has_node(node_id):
node_type = NodeType.IP if _is_valid_ip(node_id) else NodeType.DOMAIN node_type = NodeType.IP if _is_valid_ip(node_id) else NodeType.DOMAIN
self.graph.add_node(node_id, node_type, attributes=node_attributes_list) self.graph.add_node(node_id, node_type, attributes=node_attributes_list)
else: else:
node_type_val = self.graph.graph.nodes[node_id].get('type', 'domain') existing_attrs = self.graph.graph.nodes[node_id].get('attributes', [])
self.graph.add_node(node_id, NodeType(node_type_val), attributes=node_attributes_list) self.graph.graph.nodes[node_id]['attributes'] = existing_attrs + node_attributes_list
return discovered_targets, False return discovered_targets, is_large_entity
def _create_large_entity_from_provider_result(self, source: str, provider_name: str, def _create_large_entity_from_provider_result(self, source: str, provider_name: str,
provider_result: ProviderResult, current_depth: int) -> Set[str]: provider_result: ProviderResult, current_depth: int) -> Set[str]:
""" """
Create a large entity node from a ProviderResult. Create a large entity node and connect it to the source and any shared
FIXED: Only include domain/IP nodes in large entities, exclude CA and other special node types. non-member nodes like CAs or ISPs.
""" """
entity_id = f"large_entity_{provider_name}_{hash(source) & 0x7FFFFFFF}" entity_id = f"large_entity_{provider_name}_{hash(source) & 0x7FFFFFFF}"
# FIXED: Filter out CA, ISP, and correlation nodes from large entity inclusion members = {
eligible_targets = [] rel.target_node for rel in provider_result.relationships
for rel in provider_result.relationships: if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)
target_node = rel.target_node }
# Skip CA nodes (certificate issuers) - they should appear directly on graph if not members:
if provider_name == 'crtsh' and rel.relationship_type == 'crtsh_cert_issuer':
continue
# Skip ISP nodes - they should appear directly on graph
if provider_name == 'shodan' and rel.relationship_type == 'shodan_isp':
continue
# Skip correlation objects - they should appear directly on graph
if rel.relationship_type.startswith('corr_'):
continue
# Only include valid domains and IPs in large entities
if _is_valid_domain(target_node) or _is_valid_ip(target_node):
eligible_targets.append(target_node)
# If no eligible targets after filtering, don't create large entity
if not eligible_targets:
return set() return set()
node_type = 'unknown' first_member = next(iter(members))
if eligible_targets: node_type = 'ip' if _is_valid_ip(first_member) else 'domain'
if _is_valid_domain(eligible_targets[0]):
node_type = 'domain'
elif _is_valid_ip(eligible_targets[0]):
node_type = 'ip'
# Create individual nodes for eligible targets
for target in eligible_targets:
target_node_type = NodeType.DOMAIN if node_type == 'domain' else NodeType.IP
self.graph.add_node(target, target_node_type)
attributes_dict = { attributes_dict = {
'count': len(eligible_targets), 'count': len(members),
'nodes': eligible_targets, # Only eligible domain/IP nodes 'nodes': list(members),
'node_type': node_type, 'node_type': node_type,
'source_provider': provider_name, 'source_provider': provider_name,
'discovery_depth': current_depth, 'discovery_depth': current_depth,
'threshold_exceeded': self.config.large_entity_threshold, 'threshold_exceeded': self.config.large_entity_threshold,
} }
attributes_list = [
attributes_list = [] {
for key, value in attributes_dict.items(): "name": key, "value": value, "type": "large_entity_info",
attributes_list.append({ "provider": provider_name, "confidence": 0.9, "metadata": {}
"name": key, } for key, value in attributes_dict.items()
"value": value, ]
"type": "large_entity_info",
"provider": provider_name, description = f'Large entity created due to {len(members)} relationships from {provider_name}'
"confidence": 0.9,
"metadata": {}
})
description = f'Large entity created due to {len(eligible_targets)} relationships from {provider_name}'
self.graph.add_node(entity_id, NodeType.LARGE_ENTITY, attributes=attributes_list, description=description) self.graph.add_node(entity_id, NodeType.LARGE_ENTITY, attributes=attributes_list, description=description)
# Add a representative edge from the source to the large entity
if provider_result.relationships: if provider_result.relationships:
# Use the first eligible relationship for the large entity connection rep_rel = provider_result.relationships[0]
eligible_rels = [rel for rel in provider_result.relationships if rel.target_node in eligible_targets] self.graph.add_edge(source, entity_id, rep_rel.relationship_type, 0.9, provider_name,
if eligible_rels: {'large_entity_info': f'Contains {len(members)} {node_type}s'})
rel_type = eligible_rels[0].relationship_type
self.graph.add_edge(source, entity_id, rel_type, 0.9, provider_name, # Create edges from the large entity to shared non-member nodes (e.g., CAs, ISPs)
{'large_entity_info': f'Contains {len(eligible_targets)} {node_type}s'}) processed_targets = set()
for rel in provider_result.relationships:
self.logger.logger.warning(f"Large entity created: {entity_id} contains {len(eligible_targets)} targets from {provider_name}") if rel.source_node in members and rel.target_node not in members:
if rel.target_node not in processed_targets:
return set(eligible_targets) self.graph.add_edge(
entity_id, rel.target_node, rel.relationship_type, rel.confidence,
rel.provider, rel.raw_data
)
processed_targets.add(rel.target_node)
self.logger.logger.warning(f"Large entity created: {entity_id} contains {len(members)} targets from {provider_name}")
return members
def stop_scan(self) -> bool: def stop_scan(self) -> bool:
"""Request immediate scan termination with proper cleanup.""" """Request immediate scan termination with proper cleanup."""
@ -1077,57 +997,50 @@ class Scanner:
def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool: def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool:
""" """
Extracts a node from a large entity and re-queues it for scanning. Extracts a node from a large entity, restores ALL of its original connections,
FIXED: Properly handle different node types during extraction. and re-queues it for scanning.
""" """
if not self.graph.graph.has_node(large_entity_id): if not self.graph.graph.has_node(large_entity_id):
return False return False
predecessors = list(self.graph.graph.predecessors(large_entity_id)) # Extract the node from the large entity's internal list
if not predecessors:
return False
source_node_id = predecessors[0]
original_edge_data = self.graph.graph.get_edge_data(source_node_id, large_entity_id)
if not original_edge_data:
return False
success = self.graph.extract_node_from_large_entity(large_entity_id, node_id_to_extract) success = self.graph.extract_node_from_large_entity(large_entity_id, node_id_to_extract)
if not success: if not success:
return False return False
# Create relationship from source to extracted node # Restore all incoming and outgoing edges for the extracted node
self.graph.add_edge( # These edges already exist in the graph data model; this ensures they are "activated"
source_id=source_node_id, # for the frontend.
target_id=node_id_to_extract, for u, v, data in self.graph.graph.in_edges(node_id_to_extract, data=True):
relationship_type=original_edge_data.get('relationship_type', 'extracted_from_large_entity'), self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'),
confidence_score=original_edge_data.get('confidence_score', 0.85), data.get('source_provider'), data.get('raw_data'))
source_provider=original_edge_data.get('source_provider', 'unknown'),
raw_data={'context': f'Extracted from large entity {large_entity_id}'}
)
# FIXED: Only queue for further scanning if it's a domain/IP that can be scanned for u, v, data in self.graph.graph.out_edges(node_id_to_extract, data=True):
self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'),
data.get('source_provider'), data.get('raw_data'))
# Re-queue the extracted node for further scanning if it is a domain or IP
is_ip = _is_valid_ip(node_id_to_extract) is_ip = _is_valid_ip(node_id_to_extract)
is_domain = _is_valid_domain(node_id_to_extract) is_domain = _is_valid_domain(node_id_to_extract)
# Only queue valid domains and IPs for further processing
# Don't queue CA nodes, ISP nodes, etc. as they can't be scanned
if is_domain or is_ip: if is_domain or is_ip:
large_entity_attributes = self.graph.graph.nodes[large_entity_id].get('attributes', []) large_entity_attributes = self.graph.graph.nodes[large_entity_id].get('attributes', [])
discovery_depth_attr = next((attr for attr in large_entity_attributes if attr.get('name') == 'discovery_depth'), None) discovery_depth_attr = next((attr for attr in large_entity_attributes if attr.get('name') == 'discovery_depth'), None)
current_depth = discovery_depth_attr['value'] if discovery_depth_attr else 0 current_depth = discovery_depth_attr['value'] if discovery_depth_attr else 0
eligible_providers = self._get_eligible_providers(node_id_to_extract, is_ip, False) eligible_providers = self._get_eligible_providers(node_id_to_extract, is_ip, False)
for provider in eligible_providers: for provider in eligible_providers:
provider_name = provider.get_name() # Exclude DNS and correlation providers from re-processing
priority = self._get_priority(provider_name) if provider.get_name() not in ['dns', 'correlation']:
self.task_queue.put((time.time(), priority, (provider_name, node_id_to_extract, current_depth))) provider_name = provider.get_name()
self.total_tasks_ever_enqueued += 1 priority = self._get_priority(provider_name)
self.task_queue.put((time.time(), priority, (provider_name, node_id_to_extract, current_depth)))
self.total_tasks_ever_enqueued += 1
if self.status != ScanStatus.RUNNING: if self.status != ScanStatus.RUNNING:
self.status = ScanStatus.RUNNING self.status = ScanStatus.RUNNING
self._update_session_state() self._update_session_state()
if not self.scan_thread or not self.scan_thread.is_alive(): if not self.scan_thread or not self.scan_thread.is_alive():
self.scan_thread = threading.Thread( self.scan_thread = threading.Thread(
target=self._execute_scan, target=self._execute_scan,
@ -1136,7 +1049,6 @@ class Scanner:
) )
self.scan_thread.start() self.scan_thread.start()
else: else:
# For non-scannable nodes (CA, ISP, etc.), just log that they were extracted
self.logger.logger.info(f"Extracted non-scannable node {node_id_to_extract} of type {self.graph.graph.nodes[node_id_to_extract].get('type', 'unknown')}") self.logger.logger.info(f"Extracted non-scannable node {node_id_to_extract} of type {self.graph.graph.nodes[node_id_to_extract].get('type', 'unknown')}")
return True return True