This commit is contained in:
overcuriousity
2025-09-11 00:00:00 +02:00
parent db2101d814
commit 2d485c5703
7 changed files with 373 additions and 365 deletions

View File

@@ -9,6 +9,7 @@ from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple, Set
from enum import Enum
from datetime import timezone
from collections import defaultdict
import networkx as nx
@@ -24,14 +25,28 @@ class NodeType(Enum):
class RelationshipType(Enum):
"""Enumeration of supported relationship types with confidence scores."""
SAN_CERTIFICATE = ("san", 0.9) # Certificate SAN relationships
A_RECORD = ("a_record", 0.8) # A/AAAA record relationships
CNAME_RECORD = ("cname", 0.8) # CNAME relationships
PASSIVE_DNS = ("passive_dns", 0.6) # Passive DNS relationships
ASN_MEMBERSHIP = ("asn", 0.7) # ASN relationships
MX_RECORD = ("mx_record", 0.7) # MX record relationships
NS_RECORD = ("ns_record", 0.7) # NS record relationships
SAN_CERTIFICATE = ("san", 0.9)
A_RECORD = ("a_record", 0.8)
AAAA_RECORD = ("aaaa_record", 0.8)
CNAME_RECORD = ("cname", 0.8)
MX_RECORD = ("mx_record", 0.7)
NS_RECORD = ("ns_record", 0.7)
PTR_RECORD = ("ptr_record", 0.8)
SOA_RECORD = ("soa_record", 0.7)
TXT_RECORD = ("txt_record", 0.7)
SRV_RECORD = ("srv_record", 0.7)
CAA_RECORD = ("caa_record", 0.7)
DNSKEY_RECORD = ("dnskey_record", 0.7)
DS_RECORD = ("ds_record", 0.7)
RRSIG_RECORD = ("rrsig_record", 0.7)
SSHFP_RECORD = ("sshfp_record", 0.7)
TLSA_RECORD = ("tlsa_record", 0.7)
NAPTR_RECORD = ("naptr_record", 0.7)
SPF_RECORD = ("spf_record", 0.7)
PASSIVE_DNS = ("passive_dns", 0.6)
ASN_MEMBERSHIP = ("asn", 0.7)
def __init__(self, relationship_name: str, default_confidence: float):
self.relationship_name = relationship_name
self.default_confidence = default_confidence
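# Illustrative sketch (not committed code): each member's value is a
# (name, confidence) tuple, which Enum unpacks into __init__, so callers can
# read both fields directly. Import path as used by scanner.py below.
from core.graph_manager import RelationshipType

rel = RelationshipType.MX_RECORD
print(rel.relationship_name)    # "mx_record"
print(rel.default_confidence)   # 0.7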
@@ -42,24 +57,24 @@ class GraphManager:
Thread-safe graph manager for DNSRecon infrastructure mapping.
Uses NetworkX for in-memory graph storage with confidence scoring.
"""
def __init__(self):
"""Initialize empty directed graph."""
self.graph = nx.DiGraph()
# self.lock = threading.Lock()
self.creation_time = datetime.now(timezone.utc).isoformat()
self.last_modified = self.creation_time
def add_node(self, node_id: str, node_type: NodeType,
metadata: Optional[Dict[str, Any]] = None) -> bool:
"""
Add a node to the graph.
Args:
node_id: Unique identifier for the node
node_type: Type of the node (Domain, IP, Certificate, ASN)
metadata: Additional metadata for the node
Returns:
bool: True if node was added, False if it already exists
"""
@@ -70,33 +85,33 @@ class GraphManager:
existing_metadata.update(metadata)
self.graph.nodes[node_id]['metadata'] = existing_metadata
return False
node_attributes = {
'type': node_type.value,
'added_timestamp': datetime.now(timezone.utc).isoformat(),
'metadata': metadata or {}
}
self.graph.add_node(node_id, **node_attributes)
self.last_modified = datetime.now(timezone.utc).isoformat()
return True
def add_edge(self, source_id: str, target_id: str,
relationship_type: RelationshipType,
confidence_score: Optional[float] = None,
source_provider: str = "unknown",
raw_data: Optional[Dict[str, Any]] = None) -> bool:
"""
Add an edge between two nodes.
Args:
source_id: Source node identifier
target_id: Target node identifier
relationship_type: Type of relationship
confidence_score: Custom confidence score (overrides default)
source_provider: Provider that discovered this relationship
raw_data: Raw data from provider response
Returns:
bool: True if edge was added, False if it already exists
"""
@@ -112,14 +127,14 @@ class GraphManager:
# Update confidence score if new score is higher
existing_confidence = self.graph.edges[source_id, target_id]['confidence_score']
new_confidence = confidence_score or relationship_type.default_confidence
if new_confidence > existing_confidence:
self.graph.edges[source_id, target_id]['confidence_score'] = new_confidence
self.graph.edges[source_id, target_id]['updated_timestamp'] = datetime.now(timezone.utc).isoformat()
self.graph.edges[source_id, target_id]['updated_by'] = source_provider
return False
edge_attributes = {
'relationship_type': relationship_type.relationship_name,
'confidence_score': confidence_score or relationship_type.default_confidence,
@@ -127,7 +142,7 @@ class GraphManager:
'discovery_timestamp': datetime.now(timezone.utc).isoformat(),
'raw_data': raw_data or {}
}
self.graph.add_edge(source_id, target_id, **edge_attributes)
self.last_modified = datetime.now(timezone.utc).isoformat()
return True
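# Illustrative sketch of the add/update semantics above (hostname and IP are
# placeholders): a repeat call for the same pair returns False, but still
# raises the stored confidence when the new score is higher.
from core.graph_manager import GraphManager, NodeType, RelationshipType

g = GraphManager()
g.add_node("example.com", NodeType.DOMAIN)
g.add_node("198.51.100.7", NodeType.IP)

g.add_edge("example.com", "198.51.100.7", RelationshipType.A_RECORD,
           confidence_score=0.6, source_provider="passive_dns")
added_again = g.add_edge("example.com", "198.51.100.7", RelationshipType.A_RECORD,
                         source_provider="dns")  # falls back to default 0.8
print(added_again)  # False, but the stored confidence_score is now 0.8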
@@ -136,19 +151,19 @@ class GraphManager:
"""Get total number of nodes in the graph."""
#with self.lock:
return self.graph.number_of_nodes()
def get_edge_count(self) -> int:
"""Get total number of edges in the graph."""
#with self.lock:
return self.graph.number_of_edges()
def get_nodes_by_type(self, node_type: NodeType) -> List[str]:
"""
Get all nodes of a specific type.
Args:
node_type: Type of nodes to retrieve
Returns:
List of node identifiers
"""
@@ -157,32 +172,32 @@ class GraphManager:
node_id for node_id, attributes in self.graph.nodes(data=True)
if attributes.get('type') == node_type.value
]
def get_neighbors(self, node_id: str) -> List[str]:
"""
Get all neighboring nodes (both incoming and outgoing).
Args:
node_id: Node identifier
Returns:
List of neighboring node identifiers
"""
#with self.lock:
if not self.graph.has_node(node_id):
return []
predecessors = list(self.graph.predecessors(node_id))
successors = list(self.graph.successors(node_id))
return list(set(predecessors + successors))
def get_high_confidence_edges(self, min_confidence: float = 0.8) -> List[Tuple[str, str, Dict]]:
"""
Get edges with confidence score above threshold.
Args:
min_confidence: Minimum confidence threshold
Returns:
List of tuples (source, target, attributes)
"""
@@ -192,18 +207,49 @@ class GraphManager:
for source, target, attributes in self.graph.edges(data=True)
if attributes.get('confidence_score', 0) >= min_confidence
]
def get_graph_data(self) -> Dict[str, Any]:
"""
Export graph data for visualization.
Returns:
Dictionary containing nodes and edges for frontend visualization
"""
#with self.lock:
nodes = []
edges = []
# Create a dictionary to hold aggregated data for each node
node_details = defaultdict(lambda: defaultdict(list))
for source, target, attributes in self.graph.edges(data=True):
provider = attributes.get('source_provider', 'unknown')
raw_data = attributes.get('raw_data', {})
if provider == 'dns':
record_type = raw_data.get('query_type', 'UNKNOWN')
value = raw_data.get('value', target)
# DNS data is always about the source node of the query
node_details[source]['dns_records'].append(f"{record_type}: {value}")
elif provider == 'crtsh':
# crt.sh data consists of domain names found in certificate SANs
node_details[source]['related_domains_san'].append(target)
elif provider == 'shodan':
# Shodan data is about the IP, which can be either the source or target
source_node_type = self.graph.nodes[source].get('type')
target_node_type = self.graph.nodes[target].get('type')
if source_node_type == 'ip':
node_details[source]['shodan'] = raw_data
elif target_node_type == 'ip':
node_details[target]['shodan'] = raw_data
elif provider == 'virustotal':
# VirusTotal data is about the source node of the query
node_details[source]['virustotal'] = raw_data
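# Minimal standalone sketch (not committed code) of the structure built by the
# loop above; hostnames and the Shodan blob are made up. DNS answers and SAN
# domains accumulate in lists, while Shodan/VirusTotal blobs replace the list
# default with a plain dict.
from collections import defaultdict

details = defaultdict(lambda: defaultdict(list))
details["example.com"]["dns_records"].append("MX: mail.example.com")
details["example.com"]["related_domains_san"].append("www.example.com")
details["198.51.100.7"]["shodan"] = {"ports": [443]}
# details["example.com"] -> {'dns_records': ['MX: mail.example.com'],
#                            'related_domains_san': ['www.example.com']}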
# Format nodes for visualization
for node_id, attributes in self.graph.nodes(data=True):
node_data = {
@@ -213,7 +259,18 @@ class GraphManager:
'metadata': attributes.get('metadata', {}),
'added_timestamp': attributes.get('added_timestamp')
}
# Add the aggregated details to the metadata
if node_id in node_details:
for key, value in node_details[node_id].items():
# Use a set to avoid adding duplicate entries to lists
if key in node_data['metadata'] and isinstance(node_data['metadata'][key], list):
existing_values = set(node_data['metadata'][key])
new_values = [v for v in value if v not in existing_values]
node_data['metadata'][key].extend(new_values)
else:
node_data['metadata'][key] = value
# Color coding by type - now returns color objects for enhanced visualization
type_colors = {
'domain': {
@@ -239,18 +296,24 @@ class GraphManager:
'border': '#0088cc',
'highlight': {'background': '#44ccff', 'border': '#00aaff'},
'hover': {'background': '#22bbff', 'border': '#0099dd'}
},
'large_entity': {
'background': '#ff6b6b',
'border': '#cc3a3a',
'highlight': {'background': '#ff8c8c', 'border': '#ff6b6b'},
'hover': {'background': '#ff7a7a', 'border': '#dd4a4a'}
}
}
node_color_config = type_colors.get(attributes.get('type', 'unknown'), type_colors['domain'])
node_data['color'] = node_color_config
# Pass the has_valid_cert metadata for styling
if 'metadata' in attributes and 'has_valid_cert' in attributes['metadata']:
node_data['has_valid_cert'] = attributes['metadata']['has_valid_cert']
nodes.append(node_data)
# Format edges for visualization
for source, target, attributes in self.graph.edges(data=True):
edge_data = {
@@ -261,7 +324,7 @@ class GraphManager:
'source_provider': attributes.get('source_provider', ''),
'discovery_timestamp': attributes.get('discovery_timestamp')
}
# Enhanced edge styling based on confidence
confidence = attributes.get('confidence_score', 0)
if confidence >= 0.8:
@@ -275,7 +338,7 @@ class GraphManager:
elif confidence >= 0.6:
edge_data['color'] = {
'color': '#ff9900',
'highlight': '#ffbb44',
'hover': '#ffaa22',
'inherit': False
}
@@ -288,13 +351,13 @@ class GraphManager:
'inherit': False
}
edge_data['width'] = 2
# Add dashed line for low confidence
if confidence < 0.6:
edge_data['dashes'] = [5, 5]
edges.append(edge_data)
return {
'nodes': nodes,
'edges': edges,
@@ -305,18 +368,18 @@ class GraphManager:
'last_modified': self.last_modified
}
}
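# Illustrative consumer of the payload above (placeholder names; only keys
# visible in this diff are read). A confidence below 0.6 picks up the dashed
# style, everything else stays solid.
from core.graph_manager import GraphManager, NodeType, RelationshipType

g = GraphManager()
g.add_node("example.com", NodeType.DOMAIN)
g.add_node("198.51.100.7", NodeType.IP)
g.add_edge("example.com", "198.51.100.7", RelationshipType.PASSIVE_DNS,
           confidence_score=0.5, source_provider="passive_dns")

payload = g.get_graph_data()
for edge in payload["edges"]:
    print(edge["source_provider"],
          "dashed" if edge.get("dashes") else "solid")  # -> passive_dns dashed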
def export_json(self) -> Dict[str, Any]:
"""
Export complete graph data as JSON for download.
Returns:
Dictionary containing complete graph data with metadata
"""
#with self.lock:
# Get basic graph data
graph_data = self.get_graph_data()
# Add comprehensive metadata
export_data = {
'export_metadata': {
@@ -339,13 +402,13 @@ class GraphManager:
],
'confidence_distribution': self._get_confidence_distribution()
}
return export_data
def _get_confidence_distribution(self) -> Dict[str, int]:
"""Get distribution of confidence scores."""
distribution = {'high': 0, 'medium': 0, 'low': 0}
for _, _, attributes in self.graph.edges(data=True):
confidence = attributes.get('confidence_score', 0)
if confidence >= 0.8:
@@ -354,13 +417,13 @@ class GraphManager:
distribution['medium'] += 1
else:
distribution['low'] += 1
return distribution
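# Sketch of the bucketing above; the 0.6 medium cut-off is assumed to mirror
# the edge-styling threshold in get_graph_data(), since the exact elif falls
# outside this hunk.
def _bucket(confidence: float) -> str:
    if confidence >= 0.8:
        return 'high'
    elif confidence >= 0.6:  # assumption, see note above
        return 'medium'
    return 'low'
# _bucket(0.9) -> 'high', _bucket(0.7) -> 'medium', _bucket(0.5) -> 'low'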
def get_statistics(self) -> Dict[str, Any]:
"""
Get comprehensive graph statistics.
Returns:
Dictionary containing various graph metrics
"""
@@ -377,26 +440,26 @@ class GraphManager:
'confidence_distribution': self._get_confidence_distribution(),
'provider_distribution': {}
}
# Node type distribution
for node_type in NodeType:
count = len(self.get_nodes_by_type(node_type))
stats['node_type_distribution'][node_type.value] = count
# Relationship type distribution
for _, _, attributes in self.graph.edges(data=True):
rel_type = attributes.get('relationship_type', 'unknown')
stats['relationship_type_distribution'][rel_type] = \
stats['relationship_type_distribution'].get(rel_type, 0) + 1
# Provider distribution
for _, _, attributes in self.graph.edges(data=True):
provider = attributes.get('source_provider', 'unknown')
stats['provider_distribution'][provider] = \
stats['provider_distribution'].get(provider, 0) + 1
return stats
def clear(self) -> None:
"""Clear all nodes and edges from the graph."""
#with self.lock:

View File

@@ -8,6 +8,7 @@ import time
import traceback
from typing import List, Set, Dict, Any, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed, CancelledError
from collections import defaultdict
from core.graph_manager import GraphManager, NodeType, RelationshipType
from core.logger import get_forensic_logger, new_session
@@ -334,9 +335,7 @@ class Scanner:
print(f"Querying {len(self.providers)} providers for domain: {domain}")
discovered_domains = set()
discovered_ips = set()
# Define a threshold for creating a "large entity" node
LARGE_ENTITY_THRESHOLD = 50
relationships_by_type = defaultdict(list)
if not self.providers or self.stop_event.is_set():
return discovered_domains, discovered_ips
@@ -355,35 +354,72 @@ class Scanner:
relationships = future.result()
print(f"Provider {provider.get_name()} returned {len(relationships)} relationships")
# Check if the number of relationships exceeds the threshold
if len(relationships) > LARGE_ENTITY_THRESHOLD:
# Create a single "large entity" node
large_entity_id = f"large_entity_{provider.get_name()}_{domain}"
self.graph.add_node(large_entity_id, NodeType.LARGE_ENTITY, metadata={'count': len(relationships), 'provider': provider.get_name()})
self.graph.add_edge(domain, large_entity_id, RelationshipType.PASSIVE_DNS, 1.0, provider.get_name(), {})
print(f"Created large entity node for {domain} from {provider.get_name()} with {len(relationships)} relationships")
continue # Skip adding individual nodes
for rel in relationships:
relationships_by_type[rel[2]].append(rel)
for source, target, rel_type, confidence, raw_data in relationships:
if self._is_valid_ip(target):
target_node_type = NodeType.IP
discovered_ips.add(target)
elif self._is_valid_domain(target):
target_node_type = NodeType.DOMAIN
discovered_domains.add(target)
else:
target_node_type = NodeType.ASN if target.startswith('AS') else NodeType.CERTIFICATE
self.graph.add_node(source, NodeType.DOMAIN)
self.graph.add_node(target, target_node_type)
if self.graph.add_edge(source, target, rel_type, confidence, provider.get_name(), raw_data):
print(f"Added relationship: {source} -> {target} ({rel_type.relationship_name})")
except (Exception, CancelledError) as e:
print(f"Provider {provider.get_name()} failed for {domain}: {e}")
for rel_type, relationships in relationships_by_type.items():
if len(relationships) > config.large_entity_threshold and rel_type == RelationshipType.SAN_CERTIFICATE:
self._handle_large_entity(domain, relationships, rel_type, provider.get_name())
else:
for source, target, rel_type, confidence, raw_data in relationships:
# Decide whether this relationship type should create a node for the target
create_node = rel_type in [
RelationshipType.A_RECORD,
RelationshipType.AAAA_RECORD,
RelationshipType.CNAME_RECORD,
RelationshipType.MX_RECORD,
RelationshipType.NS_RECORD,
RelationshipType.PTR_RECORD,
RelationshipType.SAN_CERTIFICATE
]
# Decide whether the target should be queued for recursive scanning
recurse = rel_type in [
RelationshipType.A_RECORD,
RelationshipType.AAAA_RECORD,
RelationshipType.CNAME_RECORD,
RelationshipType.MX_RECORD,
RelationshipType.SAN_CERTIFICATE
]
if create_node:
target_node_type = NodeType.IP if self._is_valid_ip(target) else NodeType.DOMAIN
self.graph.add_node(target, target_node_type)
if self.graph.add_edge(source, target, rel_type, confidence, provider.get_name(), raw_data):
print(f"Added relationship: {source} -> {target} ({rel_type.relationship_name})")
else:
# For records that don't create nodes, we still want to log the relationship
self.logger.log_relationship_discovery(
source_node=source,
target_node=target,
relationship_type=rel_type.relationship_name,
confidence_score=confidence,
provider=provider.get_name(),
raw_data=raw_data,
discovery_method=f"dns_{rel_type.name.lower()}_record"
)
if recurse:
if self._is_valid_ip(target):
discovered_ips.add(target)
elif self._is_valid_domain(target):
discovered_domains.add(target)
print(f"Domain {domain}: discovered {len(discovered_domains)} domains, {len(discovered_ips)} IPs")
return discovered_domains, discovered_ips
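# Condensed restatement of the create_node / recurse policy above, for
# illustration only: NS and PTR targets get nodes but are not recursed into,
# TXT-style records are only logged, and A/AAAA/CNAME/MX/SAN targets get both.
CREATES_NODE = {RelationshipType.A_RECORD, RelationshipType.AAAA_RECORD,
                RelationshipType.CNAME_RECORD, RelationshipType.MX_RECORD,
                RelationshipType.NS_RECORD, RelationshipType.PTR_RECORD,
                RelationshipType.SAN_CERTIFICATE}
RECURSES = CREATES_NODE - {RelationshipType.NS_RECORD, RelationshipType.PTR_RECORD}

for rel in (RelationshipType.NS_RECORD, RelationshipType.TXT_RECORD,
            RelationshipType.CNAME_RECORD):
    print(rel.relationship_name, rel in CREATES_NODE, rel in RECURSES)
# ns_record True False | txt_record False False | cname True True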
def _handle_large_entity(self, source_domain: str, relationships: list, rel_type: RelationshipType, provider_name: str):
"""
Handles the creation of a large entity node when a threshold is exceeded.
"""
print(f"Large number of {rel_type.name} relationships for {source_domain}. Creating a large entity node.")
entity_name = f"Large collection of {rel_type.name} for {source_domain}"
self.graph.add_node(entity_name, NodeType.LARGE_ENTITY, metadata={"count": len(relationships)})
self.graph.add_edge(source_domain, entity_name, rel_type, 0.9, provider_name, {"info": "Aggregated node"})
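# Illustrative effect of the aggregation above, mirrored directly against
# GraphManager (values made up; "crtsh" is the provider key used in
# graph_manager.py): 120 SAN hits collapse into one LARGE_ENTITY node and a
# single edge instead of 120 individual domain nodes. GraphManager, NodeType
# and RelationshipType are already imported at the top of this file.
g = GraphManager()
g.add_node("example.com", NodeType.DOMAIN)
entity = "Large collection of SAN_CERTIFICATE for example.com"
g.add_node(entity, NodeType.LARGE_ENTITY, metadata={"count": 120})
g.add_edge("example.com", entity, RelationshipType.SAN_CERTIFICATE,
           0.9, "crtsh", {"info": "Aggregated node"})
print(g.get_edge_count())                          # 1
print(g.get_nodes_by_type(NodeType.LARGE_ENTITY))  # [entity name]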
def _query_providers_for_ip(self, ip: str) -> None:
"""
Query all enabled providers for information about an IP address.