attempt fix large entity
This commit is contained in:
		
							parent
							
								
									95cebbf935
								
							
						
					
					
						commit
						1558731c1c
					
				
							
								
								
									
										252
									
								
								core/scanner.py
									
									
									
									
									
								
							
							
						
						
									
										252
									
								
								core/scanner.py
									
									
									
									
									
								
							@ -825,81 +825,33 @@ class Scanner:
 | 
			
		||||
            return None
 | 
			
		||||
 | 
			
		||||
    def _process_provider_result_unified(self, target: str, provider: BaseProvider,
 | 
			
		||||
                                    provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
 | 
			
		||||
                                        provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
 | 
			
		||||
        """
 | 
			
		||||
        Process a unified ProviderResult object to update the graph.
 | 
			
		||||
        FIXED: Ensure CA and ISP relationships are created even when large entities are formed.
 | 
			
		||||
        Handles large entity creation while ensuring all underlying nodes and edges are
 | 
			
		||||
        added to the graph data model for a complete dataset.
 | 
			
		||||
        """
 | 
			
		||||
        provider_name = provider.get_name()
 | 
			
		||||
        discovered_targets = set()
 | 
			
		||||
        large_entity_members = set()
 | 
			
		||||
 | 
			
		||||
        if self._is_stop_requested():
 | 
			
		||||
            return discovered_targets, False
 | 
			
		||||
 | 
			
		||||
        # Check if this should be a large entity (only counting domain/IP relationships)
 | 
			
		||||
        eligible_relationship_count = 0
 | 
			
		||||
        for rel in provider_result.relationships:
 | 
			
		||||
            # Only count relationships that would go into large entities
 | 
			
		||||
            if provider_name == 'crtsh' and rel.relationship_type == 'crtsh_cert_issuer':
 | 
			
		||||
                continue  # Don't count CA relationships
 | 
			
		||||
            if provider_name == 'shodan' and rel.relationship_type == 'shodan_isp':
 | 
			
		||||
                continue  # Don't count ISP relationships
 | 
			
		||||
            if rel.relationship_type.startswith('corr_'):
 | 
			
		||||
                continue  # Don't count correlation relationships
 | 
			
		||||
        # Check if a large entity should be created based on the count of domain/IP relationships
 | 
			
		||||
        eligible_relationship_count = sum(
 | 
			
		||||
            1 for rel in provider_result.relationships if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
            # Only count domain/IP targets
 | 
			
		||||
            if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node):
 | 
			
		||||
                eligible_relationship_count += 1
 | 
			
		||||
        is_large_entity = eligible_relationship_count > self.config.large_entity_threshold
 | 
			
		||||
 | 
			
		||||
        if eligible_relationship_count > self.config.large_entity_threshold:
 | 
			
		||||
            # Create large entity but ALSO process special relationships
 | 
			
		||||
            members = self._create_large_entity_from_provider_result(target, provider_name, provider_result, current_depth)
 | 
			
		||||
        if is_large_entity:
 | 
			
		||||
            # Create the large entity node and get the set of its members
 | 
			
		||||
            large_entity_members = self._create_large_entity_from_provider_result(
 | 
			
		||||
                target, provider_name, provider_result, current_depth
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            # FIXED: Still process CA, ISP, and correlation relationships directly on the graph
 | 
			
		||||
            for relationship in provider_result.relationships:
 | 
			
		||||
                if self._is_stop_requested():
 | 
			
		||||
                    break
 | 
			
		||||
                    
 | 
			
		||||
                source_node = relationship.source_node
 | 
			
		||||
                target_node = relationship.target_node
 | 
			
		||||
                
 | 
			
		||||
                # Process special relationship types that should appear directly on graph
 | 
			
		||||
                should_create_direct_relationship = False
 | 
			
		||||
                target_type = None
 | 
			
		||||
                
 | 
			
		||||
                if provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
 | 
			
		||||
                    target_type = NodeType.CA
 | 
			
		||||
                    should_create_direct_relationship = True
 | 
			
		||||
                elif provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
 | 
			
		||||
                    target_type = NodeType.ISP
 | 
			
		||||
                    should_create_direct_relationship = True
 | 
			
		||||
                elif relationship.relationship_type.startswith('corr_'):
 | 
			
		||||
                    target_type = NodeType.CORRELATION_OBJECT
 | 
			
		||||
                    should_create_direct_relationship = True
 | 
			
		||||
                
 | 
			
		||||
                if should_create_direct_relationship:
 | 
			
		||||
                    # Create source and target nodes
 | 
			
		||||
                    source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
 | 
			
		||||
                    self.graph.add_node(source_node, source_type)
 | 
			
		||||
                    self.graph.add_node(target_node, target_type)
 | 
			
		||||
                    
 | 
			
		||||
                    # Add the relationship edge
 | 
			
		||||
                    self.graph.add_edge(
 | 
			
		||||
                        source_node, target_node,
 | 
			
		||||
                        relationship.relationship_type,
 | 
			
		||||
                        relationship.confidence,
 | 
			
		||||
                        provider_name,
 | 
			
		||||
                        relationship.raw_data
 | 
			
		||||
                    )
 | 
			
		||||
                    
 | 
			
		||||
                    # Add to discovered targets if it's a valid target for further processing
 | 
			
		||||
                    max_depth_reached = current_depth >= self.max_depth
 | 
			
		||||
                    if not max_depth_reached and (_is_valid_domain(target_node) or _is_valid_ip(target_node)):
 | 
			
		||||
                        discovered_targets.add(target_node)
 | 
			
		||||
            
 | 
			
		||||
            return members, True
 | 
			
		||||
        
 | 
			
		||||
        # Normal processing (existing logic) when not creating large entity
 | 
			
		||||
        # Process ALL relationships to build the complete underlying data model
 | 
			
		||||
        for i, relationship in enumerate(provider_result.relationships):
 | 
			
		||||
            if i % 5 == 0 and self._is_stop_requested():
 | 
			
		||||
                break
 | 
			
		||||
@ -907,10 +859,8 @@ class Scanner:
 | 
			
		||||
            source_node = relationship.source_node
 | 
			
		||||
            target_node = relationship.target_node
 | 
			
		||||
 | 
			
		||||
            # Determine source node type
 | 
			
		||||
            # Determine node types
 | 
			
		||||
            source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
 | 
			
		||||
            
 | 
			
		||||
            # Determine target node type based on provider and relationship
 | 
			
		||||
            if provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
 | 
			
		||||
                target_type = NodeType.ISP
 | 
			
		||||
            elif provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
 | 
			
		||||
@ -922,131 +872,101 @@ class Scanner:
 | 
			
		||||
            else:
 | 
			
		||||
                target_type = NodeType.DOMAIN
 | 
			
		||||
 | 
			
		||||
            # Add max_depth_reached flag
 | 
			
		||||
            max_depth_reached = current_depth >= self.max_depth
 | 
			
		||||
 | 
			
		||||
            # Create or update nodes with proper types
 | 
			
		||||
            # Add all nodes and edges to the graph's data model.
 | 
			
		||||
            # The frontend will handle the visual re-routing for large entity members.
 | 
			
		||||
            self.graph.add_node(source_node, source_type)
 | 
			
		||||
            self.graph.add_node(target_node, target_type, metadata={'max_depth_reached': max_depth_reached})
 | 
			
		||||
 | 
			
		||||
            # Add the relationship edge
 | 
			
		||||
            if self.graph.add_edge(
 | 
			
		||||
            self.graph.add_edge(
 | 
			
		||||
                source_node, target_node,
 | 
			
		||||
                relationship.relationship_type,
 | 
			
		||||
                relationship.confidence,
 | 
			
		||||
                provider_name,
 | 
			
		||||
                relationship.raw_data
 | 
			
		||||
            ):
 | 
			
		||||
                pass  # Edge was successfully added
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            # Add target to discovered nodes for further processing
 | 
			
		||||
            # Add all discovered domains/IPs to be considered for further processing
 | 
			
		||||
            if (_is_valid_domain(target_node) or _is_valid_ip(target_node)) and not max_depth_reached:
 | 
			
		||||
                discovered_targets.add(target_node)
 | 
			
		||||
 | 
			
		||||
        # Process all attributes (existing logic unchanged)
 | 
			
		||||
        # Process all attributes and add them to the corresponding nodes
 | 
			
		||||
        attributes_by_node = defaultdict(list)
 | 
			
		||||
        for attribute in provider_result.attributes:
 | 
			
		||||
            attr_dict = {
 | 
			
		||||
                "name": attribute.name,
 | 
			
		||||
                "value": attribute.value,
 | 
			
		||||
                "type": attribute.type,
 | 
			
		||||
                "provider": attribute.provider,
 | 
			
		||||
                "confidence": attribute.confidence,
 | 
			
		||||
                "metadata": attribute.metadata
 | 
			
		||||
                "name": attribute.name, "value": attribute.value, "type": attribute.type,
 | 
			
		||||
                "provider": attribute.provider, "confidence": attribute.confidence, "metadata": attribute.metadata
 | 
			
		||||
            }
 | 
			
		||||
            attributes_by_node[attribute.target_node].append(attr_dict)
 | 
			
		||||
 | 
			
		||||
        # Add attributes to existing nodes OR create new nodes if they don't exist
 | 
			
		||||
        for node_id, node_attributes_list in attributes_by_node.items():
 | 
			
		||||
            if not self.graph.graph.has_node(node_id):
 | 
			
		||||
                node_type = NodeType.IP if _is_valid_ip(node_id) else NodeType.DOMAIN
 | 
			
		||||
                self.graph.add_node(node_id, node_type, attributes=node_attributes_list)
 | 
			
		||||
            else:
 | 
			
		||||
                node_type_val = self.graph.graph.nodes[node_id].get('type', 'domain')
 | 
			
		||||
                self.graph.add_node(node_id, NodeType(node_type_val), attributes=node_attributes_list)
 | 
			
		||||
                existing_attrs = self.graph.graph.nodes[node_id].get('attributes', [])
 | 
			
		||||
                self.graph.graph.nodes[node_id]['attributes'] = existing_attrs + node_attributes_list
 | 
			
		||||
 | 
			
		||||
        return discovered_targets, False
 | 
			
		||||
        return discovered_targets, is_large_entity
 | 
			
		||||
 | 
			
		||||
    def _create_large_entity_from_provider_result(self, source: str, provider_name: str,
 | 
			
		||||
                                                provider_result: ProviderResult, current_depth: int) -> Set[str]:
 | 
			
		||||
        """
 | 
			
		||||
        Create a large entity node from a ProviderResult.
 | 
			
		||||
        FIXED: Only include domain/IP nodes in large entities, exclude CA and other special node types.
 | 
			
		||||
        Create a large entity node and connect it to the source and any shared
 | 
			
		||||
        non-member nodes like CAs or ISPs.
 | 
			
		||||
        """
 | 
			
		||||
        entity_id = f"large_entity_{provider_name}_{hash(source) & 0x7FFFFFFF}"
 | 
			
		||||
 | 
			
		||||
        # FIXED: Filter out CA, ISP, and correlation nodes from large entity inclusion
 | 
			
		||||
        eligible_targets = []
 | 
			
		||||
        for rel in provider_result.relationships:
 | 
			
		||||
            target_node = rel.target_node
 | 
			
		||||
        members = {
 | 
			
		||||
            rel.target_node for rel in provider_result.relationships
 | 
			
		||||
            if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
            # Skip CA nodes (certificate issuers) - they should appear directly on graph
 | 
			
		||||
            if provider_name == 'crtsh' and rel.relationship_type == 'crtsh_cert_issuer':
 | 
			
		||||
                continue
 | 
			
		||||
                
 | 
			
		||||
            # Skip ISP nodes - they should appear directly on graph  
 | 
			
		||||
            if provider_name == 'shodan' and rel.relationship_type == 'shodan_isp':
 | 
			
		||||
                continue
 | 
			
		||||
                
 | 
			
		||||
            # Skip correlation objects - they should appear directly on graph
 | 
			
		||||
            if rel.relationship_type.startswith('corr_'):
 | 
			
		||||
                continue
 | 
			
		||||
                
 | 
			
		||||
            # Only include valid domains and IPs in large entities
 | 
			
		||||
            if _is_valid_domain(target_node) or _is_valid_ip(target_node):
 | 
			
		||||
                eligible_targets.append(target_node)
 | 
			
		||||
        
 | 
			
		||||
        # If no eligible targets after filtering, don't create large entity
 | 
			
		||||
        if not eligible_targets:
 | 
			
		||||
        if not members:
 | 
			
		||||
            return set()
 | 
			
		||||
 | 
			
		||||
        node_type = 'unknown'
 | 
			
		||||
        if eligible_targets:
 | 
			
		||||
            if _is_valid_domain(eligible_targets[0]):
 | 
			
		||||
                node_type = 'domain'
 | 
			
		||||
            elif _is_valid_ip(eligible_targets[0]):
 | 
			
		||||
                node_type = 'ip'
 | 
			
		||||
        
 | 
			
		||||
        # Create individual nodes for eligible targets
 | 
			
		||||
        for target in eligible_targets:
 | 
			
		||||
            target_node_type = NodeType.DOMAIN if node_type == 'domain' else NodeType.IP
 | 
			
		||||
            self.graph.add_node(target, target_node_type)
 | 
			
		||||
        first_member = next(iter(members))
 | 
			
		||||
        node_type = 'ip' if _is_valid_ip(first_member) else 'domain'
 | 
			
		||||
 | 
			
		||||
        attributes_dict = {
 | 
			
		||||
            'count': len(eligible_targets),
 | 
			
		||||
            'nodes': eligible_targets,  # Only eligible domain/IP nodes
 | 
			
		||||
            'count': len(members),
 | 
			
		||||
            'nodes': list(members),
 | 
			
		||||
            'node_type': node_type,
 | 
			
		||||
            'source_provider': provider_name,
 | 
			
		||||
            'discovery_depth': current_depth,
 | 
			
		||||
            'threshold_exceeded': self.config.large_entity_threshold,
 | 
			
		||||
        }
 | 
			
		||||
        attributes_list = [
 | 
			
		||||
            {
 | 
			
		||||
                "name": key, "value": value, "type": "large_entity_info",
 | 
			
		||||
                "provider": provider_name, "confidence": 0.9, "metadata": {}
 | 
			
		||||
            } for key, value in attributes_dict.items()
 | 
			
		||||
        ]
 | 
			
		||||
 | 
			
		||||
        attributes_list = []
 | 
			
		||||
        for key, value in attributes_dict.items():
 | 
			
		||||
            attributes_list.append({
 | 
			
		||||
                "name": key,
 | 
			
		||||
                "value": value,
 | 
			
		||||
                "type": "large_entity_info",
 | 
			
		||||
                "provider": provider_name,
 | 
			
		||||
                "confidence": 0.9,
 | 
			
		||||
                "metadata": {}
 | 
			
		||||
            })
 | 
			
		||||
 | 
			
		||||
        description = f'Large entity created due to {len(eligible_targets)} relationships from {provider_name}'
 | 
			
		||||
        description = f'Large entity created due to {len(members)} relationships from {provider_name}'
 | 
			
		||||
 | 
			
		||||
        self.graph.add_node(entity_id, NodeType.LARGE_ENTITY, attributes=attributes_list, description=description)
 | 
			
		||||
 | 
			
		||||
        # Add a representative edge from the source to the large entity
 | 
			
		||||
        if provider_result.relationships:
 | 
			
		||||
            # Use the first eligible relationship for the large entity connection
 | 
			
		||||
            eligible_rels = [rel for rel in provider_result.relationships if rel.target_node in eligible_targets]
 | 
			
		||||
            if eligible_rels:
 | 
			
		||||
                rel_type = eligible_rels[0].relationship_type
 | 
			
		||||
                self.graph.add_edge(source, entity_id, rel_type, 0.9, provider_name, 
 | 
			
		||||
                                {'large_entity_info': f'Contains {len(eligible_targets)} {node_type}s'})
 | 
			
		||||
            rep_rel = provider_result.relationships[0]
 | 
			
		||||
            self.graph.add_edge(source, entity_id, rep_rel.relationship_type, 0.9, provider_name,
 | 
			
		||||
                                {'large_entity_info': f'Contains {len(members)} {node_type}s'})
 | 
			
		||||
 | 
			
		||||
        self.logger.logger.warning(f"Large entity created: {entity_id} contains {len(eligible_targets)} targets from {provider_name}")
 | 
			
		||||
        # Create edges from the large entity to shared non-member nodes (e.g., CAs, ISPs)
 | 
			
		||||
        processed_targets = set()
 | 
			
		||||
        for rel in provider_result.relationships:
 | 
			
		||||
            if rel.source_node in members and rel.target_node not in members:
 | 
			
		||||
                if rel.target_node not in processed_targets:
 | 
			
		||||
                    self.graph.add_edge(
 | 
			
		||||
                        entity_id, rel.target_node, rel.relationship_type, rel.confidence,
 | 
			
		||||
                        rel.provider, rel.raw_data
 | 
			
		||||
                    )
 | 
			
		||||
                    processed_targets.add(rel.target_node)
 | 
			
		||||
 | 
			
		||||
        return set(eligible_targets)
 | 
			
		||||
        self.logger.logger.warning(f"Large entity created: {entity_id} contains {len(members)} targets from {provider_name}")
 | 
			
		||||
 | 
			
		||||
        return members
 | 
			
		||||
 | 
			
		||||
    def stop_scan(self) -> bool:
 | 
			
		||||
        """Request immediate scan termination with proper cleanup."""
 | 
			
		||||
@ -1077,41 +997,32 @@ class Scanner:
 | 
			
		||||
 | 
			
		||||
    def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool:
 | 
			
		||||
        """
 | 
			
		||||
        Extracts a node from a large entity and re-queues it for scanning.
 | 
			
		||||
        FIXED: Properly handle different node types during extraction.
 | 
			
		||||
        Extracts a node from a large entity, restores ALL of its original connections,
 | 
			
		||||
        and re-queues it for scanning.
 | 
			
		||||
        """
 | 
			
		||||
        if not self.graph.graph.has_node(large_entity_id):
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
        predecessors = list(self.graph.graph.predecessors(large_entity_id))
 | 
			
		||||
        if not predecessors:
 | 
			
		||||
            return False
 | 
			
		||||
        source_node_id = predecessors[0]
 | 
			
		||||
        
 | 
			
		||||
        original_edge_data = self.graph.graph.get_edge_data(source_node_id, large_entity_id)
 | 
			
		||||
        if not original_edge_data:
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
        # Extract the node from the large entity's internal list
 | 
			
		||||
        success = self.graph.extract_node_from_large_entity(large_entity_id, node_id_to_extract)
 | 
			
		||||
        if not success:
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
        # Create relationship from source to extracted node
 | 
			
		||||
        self.graph.add_edge(
 | 
			
		||||
            source_id=source_node_id,
 | 
			
		||||
            target_id=node_id_to_extract,
 | 
			
		||||
            relationship_type=original_edge_data.get('relationship_type', 'extracted_from_large_entity'),
 | 
			
		||||
            confidence_score=original_edge_data.get('confidence_score', 0.85),
 | 
			
		||||
            source_provider=original_edge_data.get('source_provider', 'unknown'),
 | 
			
		||||
            raw_data={'context': f'Extracted from large entity {large_entity_id}'}
 | 
			
		||||
        )
 | 
			
		||||
        # Restore all incoming and outgoing edges for the extracted node
 | 
			
		||||
        # These edges already exist in the graph data model; this ensures they are "activated"
 | 
			
		||||
        # for the frontend.
 | 
			
		||||
        for u, v, data in self.graph.graph.in_edges(node_id_to_extract, data=True):
 | 
			
		||||
            self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'),
 | 
			
		||||
                                data.get('source_provider'), data.get('raw_data'))
 | 
			
		||||
 | 
			
		||||
        # FIXED: Only queue for further scanning if it's a domain/IP that can be scanned
 | 
			
		||||
        for u, v, data in self.graph.graph.out_edges(node_id_to_extract, data=True):
 | 
			
		||||
            self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'),
 | 
			
		||||
                                data.get('source_provider'), data.get('raw_data'))
 | 
			
		||||
 | 
			
		||||
        # Re-queue the extracted node for further scanning if it is a domain or IP
 | 
			
		||||
        is_ip = _is_valid_ip(node_id_to_extract)
 | 
			
		||||
        is_domain = _is_valid_domain(node_id_to_extract)
 | 
			
		||||
 | 
			
		||||
        # Only queue valid domains and IPs for further processing
 | 
			
		||||
        # Don't queue CA nodes, ISP nodes, etc. as they can't be scanned
 | 
			
		||||
        if is_domain or is_ip:
 | 
			
		||||
            large_entity_attributes = self.graph.graph.nodes[large_entity_id].get('attributes', [])
 | 
			
		||||
            discovery_depth_attr = next((attr for attr in large_entity_attributes if attr.get('name') == 'discovery_depth'), None)
 | 
			
		||||
@ -1119,10 +1030,12 @@ class Scanner:
 | 
			
		||||
 | 
			
		||||
            eligible_providers = self._get_eligible_providers(node_id_to_extract, is_ip, False)
 | 
			
		||||
            for provider in eligible_providers:
 | 
			
		||||
                provider_name = provider.get_name()
 | 
			
		||||
                priority = self._get_priority(provider_name)
 | 
			
		||||
                self.task_queue.put((time.time(), priority, (provider_name, node_id_to_extract, current_depth)))
 | 
			
		||||
                self.total_tasks_ever_enqueued += 1
 | 
			
		||||
                # Exclude DNS and correlation providers from re-processing
 | 
			
		||||
                if provider.get_name() not in ['dns', 'correlation']:
 | 
			
		||||
                    provider_name = provider.get_name()
 | 
			
		||||
                    priority = self._get_priority(provider_name)
 | 
			
		||||
                    self.task_queue.put((time.time(), priority, (provider_name, node_id_to_extract, current_depth)))
 | 
			
		||||
                    self.total_tasks_ever_enqueued += 1
 | 
			
		||||
 | 
			
		||||
            if self.status != ScanStatus.RUNNING:
 | 
			
		||||
                self.status = ScanStatus.RUNNING
 | 
			
		||||
@ -1136,7 +1049,6 @@ class Scanner:
 | 
			
		||||
                    )
 | 
			
		||||
                    self.scan_thread.start()
 | 
			
		||||
        else:
 | 
			
		||||
            # For non-scannable nodes (CA, ISP, etc.), just log that they were extracted
 | 
			
		||||
            self.logger.logger.info(f"Extracted non-scannable node {node_id_to_extract} of type {self.graph.graph.nodes[node_id_to_extract].get('type', 'unknown')}")
 | 
			
		||||
 | 
			
		||||
        return True
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user