work on large entity extraction
This commit is contained in:
157
core/scanner.py
157
core/scanner.py
@@ -860,7 +860,7 @@ class Scanner:
|
||||
|
||||
def extract_node_from_large_entity(self, large_entity_id: str, node_id: str) -> bool:
    """
    Extract a node from a large entity so it can be processed as a normal node.

    Updates the large entity's backend attributes ('nodes' membership list and
    'count'), strips the 'large_entity_id' tag from the extracted node, records
    an extraction audit entry, re-routes edges that belong to the extracted
    node, and re-enqueues the node for full provider processing.

    Args:
        large_entity_id: Graph id of the large entity the node currently belongs to.
        node_id: Graph id of the node to extract.

    Returns:
        True if the node was extracted; False if the node does not exist or is
        not a member of the given large entity.
    """
    if not self.graph.graph.has_node(node_id):
        return False

    node_data = self.graph.graph.nodes[node_id]
    metadata = node_data.get('metadata', {})

    # Only members of this specific large entity may be extracted from it.
    if metadata.get('large_entity_id') != large_entity_id:
        return False

    # Update the large entity's attributes to remove the extracted node.
    if self.graph.graph.has_node(large_entity_id):
        le_node_data = self.graph.graph.nodes[large_entity_id]
        le_attributes = le_node_data.get('attributes', [])

        # Remove the extracted node from the 'nodes' membership attribute.
        nodes_attr = next((attr for attr in le_attributes if attr['name'] == 'nodes'), None)
        if nodes_attr and isinstance(nodes_attr['value'], list):
            if node_id in nodes_attr['value']:
                nodes_attr['value'].remove(node_id)

        # Keep the 'count' attribute consistent with the membership list.
        count_attr = next((attr for attr in le_attributes if attr['name'] == 'count'), None)
        if count_attr and isinstance(count_attr['value'], (int, float)):
            count_attr['value'] = max(0, count_attr['value'] - 1)

        # Persist the updated large entity node.
        self.graph.add_node(
            large_entity_id,
            NodeType.LARGE_ENTITY,
            attributes=le_attributes,
            description=le_node_data.get('description', ''),
            metadata=le_node_data.get('metadata', {})
        )

    # Remove the large entity tag from the extracted node (copy: don't mutate
    # the live graph metadata before add_node persists it).
    updated_metadata = metadata.copy()
    del updated_metadata['large_entity_id']

    # Add extraction history for forensic integrity.
    extraction_record = {
        'extracted_at': datetime.now(timezone.utc).isoformat(),
        'extracted_from': large_entity_id,
        'extraction_method': 'manual'
    }
    if 'extraction_history' not in updated_metadata:
        updated_metadata['extraction_history'] = []
    updated_metadata['extraction_history'].append(extraction_record)

    # Persist the extracted node with its cleaned metadata.
    self.graph.add_node(node_id, NodeType(node_data['type']), metadata=updated_metadata)

    # Re-route edges that were pointing at the large entity but belong to
    # the extracted node.
    self._reroute_large_entity_edges(large_entity_id, node_id)

    # Use the large entity's discovery depth for re-enqueued tasks if
    # available, else 0. Loop-invariant, so computed once.
    depth = 0
    if self.graph.graph.has_node(large_entity_id):
        le_attrs = self.graph.graph.nodes[large_entity_id].get('attributes', [])
        depth_attr = next((a for a in le_attrs if a['name'] == 'discovery_depth'), None)
        if depth_attr:
            depth = depth_attr['value']

    # Re-enqueue the node for full processing (exactly once per provider).
    is_ip = _is_valid_ip(node_id)
    eligible_providers = self._get_eligible_providers(node_id, is_ip, False)
    for provider in eligible_providers:
        provider_name = provider.get_name()
        priority = self._get_priority(provider_name)
        self.task_queue.put((time.time(), priority, (provider_name, node_id, depth)))
        self.total_tasks_ever_enqueued += 1

    # Force session state update for immediate frontend sync.
    self._update_session_state()

    return True
def _reroute_large_entity_edges(self, large_entity_id: str, extracted_node_id: str) -> None:
    """
    Re-route edges from a large entity to an extracted node where appropriate.

    Scans both in-edges and out-edges of the large entity. An edge is
    re-routed when its raw_data explicitly names the extracted node
    ('original_target' / 'original_source') or when the relationship type
    heuristic (_should_reroute_edge) matches. Re-routed edges keep their
    relationship data and are tagged with 'rerouted_from_large_entity'.

    Args:
        large_entity_id: Graph id of the large entity whose edges are scanned.
        extracted_node_id: Graph id of the node the edges should move to.
    """
    if not self.graph.graph.has_node(large_entity_id) or not self.graph.graph.has_node(extracted_node_id):
        return

    edges_to_reroute = []

    # Find edges pointing TO the large entity that should point to the
    # extracted node instead.
    for source, target, edge_data in self.graph.graph.in_edges(large_entity_id, data=True):
        raw_data = edge_data.get('raw_data', {})
        # If the raw data names the extracted node, or the heuristic matches,
        # re-route this edge.
        if (raw_data.get('original_target') == extracted_node_id or
                self._should_reroute_edge(edge_data, extracted_node_id)):
            edges_to_reroute.append(('in', source, target, edge_data))

    # Find edges pointing FROM the large entity that should originate from
    # the extracted node instead.
    for source, target, edge_data in self.graph.graph.out_edges(large_entity_id, data=True):
        raw_data = edge_data.get('raw_data', {})
        if (raw_data.get('original_source') == extracted_node_id or
                self._should_reroute_edge(edge_data, extracted_node_id)):
            edges_to_reroute.append(('out', source, target, edge_data))

    # Re-route the collected edges (done after the scans so we never mutate
    # the edge views while iterating them).
    for direction, source, target, edge_data in edges_to_reroute:
        # Remove the old edge.
        self.graph.graph.remove_edge(source, target)

        # Compute the new endpoints: swap in the extracted node on the side
        # that previously touched the large entity.
        if direction == 'in':
            new_source, new_target = source, extracted_node_id
        else:  # direction == 'out'
            new_source, new_target = extracted_node_id, target

        # Add the re-routed edge, preserving relationship data and tagging
        # its origin for forensic traceability.
        self.graph.add_edge(
            source_id=new_source,
            target_id=new_target,
            relationship_type=edge_data.get('relationship_type', 'unknown'),
            confidence_score=edge_data.get('confidence_score', 0.5),
            source_provider=edge_data.get('source_provider', 'rerouted'),
            raw_data=dict(edge_data.get('raw_data', {}), **{'rerouted_from_large_entity': large_entity_id})
        )
def _should_reroute_edge(self, edge_data: dict, extracted_node_id: str) -> bool:
|
||||
"""
|
||||
Determine if an edge should be re-routed to an extracted node.
|
||||
This is a heuristic-based approach since we don't store original targets.
|
||||
"""
|
||||
relationship_type = edge_data.get('relationship_type', '')
|
||||
|
||||
# For now, re-route DNS and certificate-based relationships
|
||||
# These are likely to be node-specific rather than entity-wide
|
||||
reroutable_types = [
|
||||
'dns_a_record', 'dns_aaaa_record', 'dns_cname_record',
|
||||
'dns_mx_record', 'dns_ptr_record',
|
||||
'crtsh_san_certificate', 'crtsh_cert_issuer'
|
||||
]
|
||||
|
||||
return any(rtype in relationship_type for rtype in reroutable_types)
|
||||
|
||||
def _process_provider_result_unified(self, target: str, provider: BaseProvider,
|
||||
provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
|
||||
|
||||
Reference in New Issue
Block a user