diff --git a/core/scanner.py b/core/scanner.py
index 236b7ca..9b11729 100644
--- a/core/scanner.py
+++ b/core/scanner.py
@@ -824,18 +824,34 @@ class Scanner:
         return None

     def _create_large_entity_from_result(self, source_node: str, provider_name: str,
-                                           provider_result: ProviderResult, depth: int) -> Tuple[str, Set[str]]:
+                                          provider_result: ProviderResult, depth: int) -> Tuple[str, Set[str]]:
         """
-        Creates a large entity node, tags all member nodes, and returns its ID and members.
+        Creates a large entity node, tags all member nodes, and stores original relationships.
+        FIXED: Now stores original relationships for later restoration during extraction.
         """
         members = {rel.target_node for rel in provider_result.relationships
-                    if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)}
+                   if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)}

         if not members:
             return "", set()

         large_entity_id = f"le_{provider_name}_{source_node}"

+        # FIXED: Store original relationships for each member
+        member_relationships = {}
+        for rel in provider_result.relationships:
+            if rel.target_node in members:
+                if rel.target_node not in member_relationships:
+                    member_relationships[rel.target_node] = []
+                member_relationships[rel.target_node].append({
+                    'source_node': rel.source_node,
+                    'target_node': rel.target_node,
+                    'relationship_type': rel.relationship_type,
+                    'confidence': rel.confidence,
+                    'provider': rel.provider,
+                    'raw_data': rel.raw_data
+                })
+
         self.graph.add_node(
             node_id=large_entity_id,
             node_type=NodeType.LARGE_ENTITY,
@@ -843,7 +859,8 @@ class Scanner:
                 {"name": "count", "value": len(members), "type": "statistic"},
                 {"name": "source_provider", "value": provider_name, "type": "metadata"},
                 {"name": "discovery_depth", "value": depth, "type": "metadata"},
-                {"name": "nodes", "value": list(members), "type": "metadata"}
+                {"name": "nodes", "value": list(members), "type": "metadata"},
+                {"name": "original_relationships", "value": member_relationships, "type": "metadata"}  # FIXED: Store original relationships
             ],
             description=f"A collection of {len(members)} nodes discovered from {source_node} via {provider_name}."
         )
@@ -860,7 +877,8 @@ class Scanner:

     def extract_node_from_large_entity(self, large_entity_id: str, node_id: str) -> bool:
         """
-        FIXED: Extract a node from a large entity with proper backend updates and edge re-routing.
+        Removes a node from a large entity and restores its original relationships.
+        FIXED: Now restores original relationships to make the node reachable.
         """
         if not self.graph.graph.has_node(node_id):
             return False
@@ -871,51 +889,45 @@ class Scanner:
         if metadata.get('large_entity_id') != large_entity_id:
             return False

-        # FIXED: Update the large entity's attributes to remove the extracted node
+        # Remove the large entity tag
+        del metadata['large_entity_id']
+        self.graph.add_node(node_id, NodeType(node_data['type']), metadata=metadata)
+
+        # FIXED: Restore original relationships if they exist
         if self.graph.graph.has_node(large_entity_id):
-            le_node_data = self.graph.graph.nodes[large_entity_id]
-            le_attributes = le_node_data.get('attributes', [])
+            le_attrs = self.graph.graph.nodes[large_entity_id].get('attributes', [])
+            original_relationships_attr = next((a for a in le_attrs if a['name'] == 'original_relationships'), None)

-            # Update the 'nodes' attribute to remove extracted node
-            nodes_attr = next((attr for attr in le_attributes if attr['name'] == 'nodes'), None)
-            if nodes_attr and isinstance(nodes_attr['value'], list):
-                if node_id in nodes_attr['value']:
+            if original_relationships_attr and node_id in original_relationships_attr['value']:
+                # Restore all original relationships for this node
+                for rel_data in original_relationships_attr['value'][node_id]:
+                    self.graph.add_edge(
+                        source_id=rel_data['source_node'],
+                        target_id=rel_data['target_node'],
+                        relationship_type=rel_data['relationship_type'],
+                        confidence_score=rel_data['confidence'],
+                        source_provider=rel_data['provider'],
+                        raw_data=rel_data['raw_data']
+                    )
+
+                    # Ensure both nodes exist in the graph
+                    source_type = NodeType.IP if _is_valid_ip(rel_data['source_node']) else NodeType.DOMAIN
+                    target_type = NodeType.IP if _is_valid_ip(rel_data['target_node']) else NodeType.DOMAIN
+                    self.graph.add_node(rel_data['source_node'], source_type)
+                    self.graph.add_node(rel_data['target_node'], target_type)
+
+                # Update the large entity to remove this node from its list
+                nodes_attr = next((a for a in le_attrs if a['name'] == 'nodes'), None)
+                if nodes_attr and node_id in nodes_attr['value']:
                     nodes_attr['value'].remove(node_id)
-
-            # Update the 'count' attribute
-            count_attr = next((attr for attr in le_attributes if attr['name'] == 'count'), None)
-            if count_attr and isinstance(count_attr['value'], (int, float)):
-                count_attr['value'] = max(0, count_attr['value'] - 1)
-
-            # Update the large entity node
-            self.graph.add_node(
-                large_entity_id,
-                NodeType.LARGE_ENTITY,
-                attributes=le_attributes,
-                description=le_node_data.get('description', ''),
-                metadata=le_node_data.get('metadata', {})
-            )
-
-        # Remove the large entity tag from extracted node
-        updated_metadata = metadata.copy()
-        del updated_metadata['large_entity_id']
-
-        # Add extraction history for forensic integrity
-        extraction_record = {
-            'extracted_at': datetime.now(timezone.utc).isoformat(),
-            'extracted_from': large_entity_id,
-            'extraction_method': 'manual'
-        }
-
-        if 'extraction_history' not in updated_metadata:
-            updated_metadata['extraction_history'] = []
-        updated_metadata['extraction_history'].append(extraction_record)
-
-        # Update the extracted node
-        self.graph.add_node(node_id, NodeType(node_data['type']), metadata=updated_metadata)
-
-        # FIXED: Re-route edges that were pointing to the large entity
-        self._reroute_large_entity_edges(large_entity_id, node_id)
+
+                count_attr = next((a for a in le_attrs if a['name'] == 'count'), None)
+                if count_attr:
+                    count_attr['value'] = max(0, count_attr['value'] - 1)
+
+                # Remove from original relationships tracking
+                if node_id in original_relationships_attr['value']:
+                    del original_relationships_attr['value'][node_id]

         # Re-enqueue the node for full processing
         is_ip = _is_valid_ip(node_id)
@@ -923,7 +935,6 @@ class Scanner:
         for provider in eligible_providers:
             provider_name = provider.get_name()
             priority = self._get_priority(provider_name)
-            # Use current depth of the large entity if available, else 0
             depth = 0
             if self.graph.graph.has_node(large_entity_id):
@@ -935,77 +946,7 @@ class Scanner:
             self.task_queue.put((time.time(), priority, (provider_name, node_id, depth)))
             self.total_tasks_ever_enqueued += 1

-        # Force session state update for immediate frontend sync
-        self._update_session_state()
-
         return True
-
-    def _reroute_large_entity_edges(self, large_entity_id: str, extracted_node_id: str) -> None:
-        """
-        FIXED: Re-route edges from large entity to extracted node where appropriate.
-        """
-        if not self.graph.graph.has_node(large_entity_id) or not self.graph.graph.has_node(extracted_node_id):
-            return
-
-        edges_to_reroute = []
-
-        # Find edges pointing TO the large entity that should point to the extracted node
-        for source, target, edge_data in self.graph.graph.in_edges(large_entity_id, data=True):
-            # Check if this edge was originally meant for the extracted node
-            raw_data = edge_data.get('raw_data', {})
-
-            # If the raw data suggests this edge was for the extracted node, re-route it
-            if (raw_data.get('original_target') == extracted_node_id or
-                self._should_reroute_edge(edge_data, extracted_node_id)):
-                edges_to_reroute.append(('in', source, target, edge_data))
-
-        # Find edges pointing FROM the large entity that should point from the extracted node
-        for source, target, edge_data in self.graph.graph.out_edges(large_entity_id, data=True):
-            raw_data = edge_data.get('raw_data', {})
-
-            if (raw_data.get('original_source') == extracted_node_id or
-                self._should_reroute_edge(edge_data, extracted_node_id)):
-                edges_to_reroute.append(('out', source, target, edge_data))
-
-        # Re-route the edges
-        for direction, source, target, edge_data in edges_to_reroute:
-            # Remove old edge
-            self.graph.graph.remove_edge(source, target)
-
-            # Add new edge with extracted node
-            if direction == 'in':
-                new_target = extracted_node_id
-                new_source = source
-            else:  # direction == 'out'
-                new_source = extracted_node_id
-                new_target = target
-
-            # Add the re-routed edge
-            self.graph.add_edge(
-                source_id=new_source,
-                target_id=new_target,
-                relationship_type=edge_data.get('relationship_type', 'unknown'),
-                confidence_score=edge_data.get('confidence_score', 0.5),
-                source_provider=edge_data.get('source_provider', 'rerouted'),
-                raw_data=dict(edge_data.get('raw_data', {}), **{'rerouted_from_large_entity': large_entity_id})
-            )
-
-    def _should_reroute_edge(self, edge_data: dict, extracted_node_id: str) -> bool:
-        """
-        Determine if an edge should be re-routed to an extracted node.
-        This is a heuristic-based approach since we don't store original targets.
-        """
-        relationship_type = edge_data.get('relationship_type', '')
-
-        # For now, re-route DNS and certificate-based relationships
-        # These are likely to be node-specific rather than entity-wide
-        reroutable_types = [
-            'dns_a_record', 'dns_aaaa_record', 'dns_cname_record',
-            'dns_mx_record', 'dns_ptr_record',
-            'crtsh_san_certificate', 'crtsh_cert_issuer'
-        ]
-
-        return any(rtype in relationship_type for rtype in reroutable_types)

     def _process_provider_result_unified(self, target: str, provider: BaseProvider,
                                          provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]: