diff --git a/core/scanner.py b/core/scanner.py
index 2d87af8..bd6c8e9 100644
--- a/core/scanner.py
+++ b/core/scanner.py
@@ -825,81 +825,33 @@ class Scanner:
         return None
 
     def _process_provider_result_unified(self, target: str, provider: BaseProvider,
-                                        provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
+                                         provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
         """
         Process a unified ProviderResult object to update the graph.
-        FIXED: Ensure CA and ISP relationships are created even when large entities are formed.
+        Handles large entity creation while ensuring all underlying nodes and edges are
+        added to the graph data model for a complete dataset.
         """
         provider_name = provider.get_name()
         discovered_targets = set()
+        large_entity_members = set()
 
         if self._is_stop_requested():
             return discovered_targets, False
-
-        # Check if this should be a large entity (only counting domain/IP relationships)
-        eligible_relationship_count = 0
-        for rel in provider_result.relationships:
-            # Only count relationships that would go into large entities
-            if provider_name == 'crtsh' and rel.relationship_type == 'crtsh_cert_issuer':
-                continue  # Don't count CA relationships
-            if provider_name == 'shodan' and rel.relationship_type == 'shodan_isp':
-                continue  # Don't count ISP relationships
-            if rel.relationship_type.startswith('corr_'):
-                continue  # Don't count correlation relationships
-
-            # Only count domain/IP targets
-            if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node):
-                eligible_relationship_count += 1
-
-        if eligible_relationship_count > self.config.large_entity_threshold:
-            # Create large entity but ALSO process special relationships
-            members = self._create_large_entity_from_provider_result(target, provider_name, provider_result, current_depth)
-
-            # FIXED: Still process CA, ISP, and correlation relationships directly on the graph
-            for relationship in provider_result.relationships:
-                if self._is_stop_requested():
-                    break
-
-                source_node = relationship.source_node
-                target_node = relationship.target_node
-
-                # Process special relationship types that should appear directly on graph
-                should_create_direct_relationship = False
-                target_type = None
-
-                if provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
-                    target_type = NodeType.CA
-                    should_create_direct_relationship = True
-                elif provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
-                    target_type = NodeType.ISP
-                    should_create_direct_relationship = True
-                elif relationship.relationship_type.startswith('corr_'):
-                    target_type = NodeType.CORRELATION_OBJECT
-                    should_create_direct_relationship = True
-
-                if should_create_direct_relationship:
-                    # Create source and target nodes
-                    source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
-                    self.graph.add_node(source_node, source_type)
-                    self.graph.add_node(target_node, target_type)
-
-                    # Add the relationship edge
-                    self.graph.add_edge(
-                        source_node, target_node,
-                        relationship.relationship_type,
-                        relationship.confidence,
-                        provider_name,
-                        relationship.raw_data
-                    )
-
-                    # Add to discovered targets if it's a valid target for further processing
-                    max_depth_reached = current_depth >= self.max_depth
-                    if not max_depth_reached and (_is_valid_domain(target_node) or _is_valid_ip(target_node)):
-                        discovered_targets.add(target_node)
-
-            return members, True
-
-        # Normal processing (existing logic) when not creating large entity
+
+        # Check if a large entity should be created based on the count of domain/IP relationships
+        eligible_relationship_count = sum(
+            1 for rel in provider_result.relationships if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)
+        )
+
+        is_large_entity = eligible_relationship_count > self.config.large_entity_threshold
+
+        if is_large_entity:
+            # Create the large entity node and get the set of its members
+            large_entity_members = self._create_large_entity_from_provider_result(
+                target, provider_name, provider_result, current_depth
+            )
+
+        # Process ALL relationships to build the complete underlying data model
         for i, relationship in enumerate(provider_result.relationships):
             if i % 5 == 0 and self._is_stop_requested():
                 break
@@ -907,10 +859,8 @@ class Scanner:
             source_node = relationship.source_node
             target_node = relationship.target_node
 
-            # Determine source node type
+            # Determine node types
             source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
-
-            # Determine target node type based on provider and relationship
             if provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
                 target_type = NodeType.ISP
             elif provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
@@ -921,132 +871,102 @@ class Scanner:
                 target_type = NodeType.IP
             else:
                 target_type = NodeType.DOMAIN
-
-            # Add max_depth_reached flag
+
            max_depth_reached = current_depth >= self.max_depth
 
-            # Create or update nodes with proper types
+            # Add all nodes and edges to the graph's data model.
+            # The frontend will handle the visual re-routing for large entity members.
             self.graph.add_node(source_node, source_type)
             self.graph.add_node(target_node, target_type, metadata={'max_depth_reached': max_depth_reached})
-
-            # Add the relationship edge
-            if self.graph.add_edge(
+            self.graph.add_edge(
                 source_node, target_node,
                 relationship.relationship_type,
                 relationship.confidence,
                 provider_name,
                 relationship.raw_data
-            ):
-                pass  # Edge was successfully added
+            )
 
-            # Add target to discovered nodes for further processing
+            # Add all discovered domains/IPs to be considered for further processing
             if (_is_valid_domain(target_node) or _is_valid_ip(target_node)) and not max_depth_reached:
                 discovered_targets.add(target_node)
 
-        # Process all attributes (existing logic unchanged)
+        # Process all attributes and add them to the corresponding nodes
        attributes_by_node = defaultdict(list)
        for attribute in provider_result.attributes:
            attr_dict = {
-                "name": attribute.name,
-                "value": attribute.value,
-                "type": attribute.type,
-                "provider": attribute.provider,
-                "confidence": attribute.confidence,
-                "metadata": attribute.metadata
+                "name": attribute.name, "value": attribute.value, "type": attribute.type,
+                "provider": attribute.provider, "confidence": attribute.confidence, "metadata": attribute.metadata
            }
            attributes_by_node[attribute.target_node].append(attr_dict)
 
-        # Add attributes to existing nodes OR create new nodes if they don't exist
        for node_id, node_attributes_list in attributes_by_node.items():
            if not self.graph.graph.has_node(node_id):
                node_type = NodeType.IP if _is_valid_ip(node_id) else NodeType.DOMAIN
                self.graph.add_node(node_id, node_type, attributes=node_attributes_list)
            else:
-                node_type_val = self.graph.graph.nodes[node_id].get('type', 'domain')
-                self.graph.add_node(node_id, NodeType(node_type_val), attributes=node_attributes_list)
+                existing_attrs = self.graph.graph.nodes[node_id].get('attributes', [])
+                self.graph.graph.nodes[node_id]['attributes'] = existing_attrs + node_attributes_list
 
-        return discovered_targets, False
+        return discovered_targets, is_large_entity
 
-    def _create_large_entity_from_provider_result(self, source: str, provider_name: str,
+    def _create_large_entity_from_provider_result(self, source: str, provider_name: str,
                                                   provider_result: ProviderResult, current_depth: int) -> Set[str]:
        """
-        Create a large entity node from a ProviderResult.
-        FIXED: Only include domain/IP nodes in large entities, exclude CA and other special node types.
+        Create a large entity node and connect it to the source and any shared
+        non-member nodes like CAs or ISPs.
        """
        entity_id = f"large_entity_{provider_name}_{hash(source) & 0x7FFFFFFF}"
-
-        # FIXED: Filter out CA, ISP, and correlation nodes from large entity inclusion
-        eligible_targets = []
-        for rel in provider_result.relationships:
-            target_node = rel.target_node
-
-            # Skip CA nodes (certificate issuers) - they should appear directly on graph
-            if provider_name == 'crtsh' and rel.relationship_type == 'crtsh_cert_issuer':
-                continue
-
-            # Skip ISP nodes - they should appear directly on graph
-            if provider_name == 'shodan' and rel.relationship_type == 'shodan_isp':
-                continue
-
-            # Skip correlation objects - they should appear directly on graph
-            if rel.relationship_type.startswith('corr_'):
-                continue
-
-            # Only include valid domains and IPs in large entities
-            if _is_valid_domain(target_node) or _is_valid_ip(target_node):
-                eligible_targets.append(target_node)
-
-        # If no eligible targets after filtering, don't create large entity
-        if not eligible_targets:
+
+        members = {
+            rel.target_node for rel in provider_result.relationships
+            if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)
+        }
+
+        if not members:
            return set()
-
-        node_type = 'unknown'
-        if eligible_targets:
-            if _is_valid_domain(eligible_targets[0]):
-                node_type = 'domain'
-            elif _is_valid_ip(eligible_targets[0]):
-                node_type = 'ip'
-
-        # Create individual nodes for eligible targets
-        for target in eligible_targets:
-            target_node_type = NodeType.DOMAIN if node_type == 'domain' else NodeType.IP
-            self.graph.add_node(target, target_node_type)
+
+        first_member = next(iter(members))
+        node_type = 'ip' if _is_valid_ip(first_member) else 'domain'
 
        attributes_dict = {
-            'count': len(eligible_targets),
-            'nodes': eligible_targets,  # Only eligible domain/IP nodes
+            'count': len(members),
+            'nodes': list(members),
            'node_type': node_type,
            'source_provider': provider_name,
            'discovery_depth': current_depth,
            'threshold_exceeded': self.config.large_entity_threshold,
        }
-
-        attributes_list = []
-        for key, value in attributes_dict.items():
-            attributes_list.append({
-                "name": key,
-                "value": value,
-                "type": "large_entity_info",
-                "provider": provider_name,
-                "confidence": 0.9,
-                "metadata": {}
-            })
+        attributes_list = [
+            {
+                "name": key, "value": value, "type": "large_entity_info",
+                "provider": provider_name, "confidence": 0.9, "metadata": {}
+            } for key, value in attributes_dict.items()
+        ]
+
+        description = f'Large entity created due to {len(members)} relationships from {provider_name}'
 
-        description = f'Large entity created due to {len(eligible_targets)} relationships from {provider_name}'
-
        self.graph.add_node(entity_id, NodeType.LARGE_ENTITY, attributes=attributes_list, description=description)
-
+
+        # Add a representative edge from the source to the large entity
        if provider_result.relationships:
-            # Use the first eligible relationship for the large entity connection
-            eligible_rels = [rel for rel in provider_result.relationships if rel.target_node in eligible_targets]
-            if eligible_rels:
-                rel_type = eligible_rels[0].relationship_type
-                self.graph.add_edge(source, entity_id, rel_type, 0.9, provider_name,
-                                    {'large_entity_info': f'Contains {len(eligible_targets)} {node_type}s'})
-
-        self.logger.logger.warning(f"Large entity created: {entity_id} contains {len(eligible_targets)} targets from {provider_name}")
-
-        return set(eligible_targets)
+            rep_rel = provider_result.relationships[0]
+            self.graph.add_edge(source, entity_id, rep_rel.relationship_type, 0.9, provider_name,
+                                {'large_entity_info': f'Contains {len(members)} {node_type}s'})
+
+        # Create edges from the large entity to shared non-member nodes (e.g., CAs, ISPs)
+        processed_targets = set()
+        for rel in provider_result.relationships:
+            if rel.source_node in members and rel.target_node not in members:
+                if rel.target_node not in processed_targets:
+                    self.graph.add_edge(
+                        entity_id, rel.target_node, rel.relationship_type, rel.confidence,
+                        rel.provider, rel.raw_data
+                    )
+                    processed_targets.add(rel.target_node)
+
+        self.logger.logger.warning(f"Large entity created: {entity_id} contains {len(members)} targets from {provider_name}")
+
+        return members
 
    def stop_scan(self) -> bool:
        """Request immediate scan termination with proper cleanup."""
@@ -1077,57 +997,50 @@ class Scanner:
 
    def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool:
        """
-        Extracts a node from a large entity and re-queues it for scanning.
-        FIXED: Properly handle different node types during extraction.
+        Extracts a node from a large entity, restores ALL of its original connections,
+        and re-queues it for scanning.
        """
        if not self.graph.graph.has_node(large_entity_id):
            return False
 
-        predecessors = list(self.graph.graph.predecessors(large_entity_id))
-        if not predecessors:
-            return False
-        source_node_id = predecessors[0]
-
-        original_edge_data = self.graph.graph.get_edge_data(source_node_id, large_entity_id)
-        if not original_edge_data:
-            return False
-
+        # Extract the node from the large entity's internal list
        success = self.graph.extract_node_from_large_entity(large_entity_id, node_id_to_extract)
        if not success:
            return False
 
-        # Create relationship from source to extracted node
-        self.graph.add_edge(
-            source_id=source_node_id,
-            target_id=node_id_to_extract,
-            relationship_type=original_edge_data.get('relationship_type', 'extracted_from_large_entity'),
-            confidence_score=original_edge_data.get('confidence_score', 0.85),
-            source_provider=original_edge_data.get('source_provider', 'unknown'),
-            raw_data={'context': f'Extracted from large entity {large_entity_id}'}
-        )
+        # Restore all incoming and outgoing edges for the extracted node
+        # These edges already exist in the graph data model; this ensures they are "activated"
+        # for the frontend.
+        for u, v, data in self.graph.graph.in_edges(node_id_to_extract, data=True):
+            self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'),
+                                data.get('source_provider'), data.get('raw_data'))
 
-        # FIXED: Only queue for further scanning if it's a domain/IP that can be scanned
+        for u, v, data in self.graph.graph.out_edges(node_id_to_extract, data=True):
+            self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'),
+                                data.get('source_provider'), data.get('raw_data'))
+
+        # Re-queue the extracted node for further scanning if it is a domain or IP
        is_ip = _is_valid_ip(node_id_to_extract)
        is_domain = _is_valid_domain(node_id_to_extract)
-
-        # Only queue valid domains and IPs for further processing
-        # Don't queue CA nodes, ISP nodes, etc. as they can't be scanned
+
        if is_domain or is_ip:
            large_entity_attributes = self.graph.graph.nodes[large_entity_id].get('attributes', [])
            discovery_depth_attr = next((attr for attr in large_entity_attributes if attr.get('name') == 'discovery_depth'), None)
            current_depth = discovery_depth_attr['value'] if discovery_depth_attr else 0
-
+
            eligible_providers = self._get_eligible_providers(node_id_to_extract, is_ip, False)
            for provider in eligible_providers:
-                provider_name = provider.get_name()
-                priority = self._get_priority(provider_name)
-                self.task_queue.put((time.time(), priority, (provider_name, node_id_to_extract, current_depth)))
-                self.total_tasks_ever_enqueued += 1
+                # Exclude DNS and correlation providers from re-processing
+                if provider.get_name() not in ['dns', 'correlation']:
+                    provider_name = provider.get_name()
+                    priority = self._get_priority(provider_name)
+                    self.task_queue.put((time.time(), priority, (provider_name, node_id_to_extract, current_depth)))
+                    self.total_tasks_ever_enqueued += 1
 
            if self.status != ScanStatus.RUNNING:
                self.status = ScanStatus.RUNNING
                self._update_session_state()
-
+
            if not self.scan_thread or not self.scan_thread.is_alive():
                self.scan_thread = threading.Thread(
                    target=self._execute_scan,
@@ -1136,7 +1049,6 @@ class Scanner:
                )
                self.scan_thread.start()
        else:
-            # For non-scannable nodes (CA, ISP, etc.), just log that they were extracted
            self.logger.logger.info(f"Extracted non-scannable node {node_id_to_extract} of type {self.graph.graph.nodes[node_id_to_extract].get('type', 'unknown')}")
 
        return True