correlation engine
This commit is contained in:
parent cbfd40ee98
commit 12f834bb65
@@ -33,14 +33,16 @@ class Config:
        self.rate_limits = {
            'crtsh': 5,
            'shodan': 60,
            'dns': 100
            'dns': 100,
            'correlation': 1000 # Set a high limit as it's a local operation
        }

        # --- Provider Settings ---
        self.enabled_providers = {
            'crtsh': True,
            'dns': True,
            'shodan': False
            'shodan': False,
            'correlation': True # Enable the new provider by default
        }

        # --- Logging ---
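The scanner later in this diff resolves these settings through Config accessors (it calls config.get_rate_limit(provider_name) and config.is_provider_enabled(provider_name)). A minimal sketch of how such accessors could read the new 'correlation' entries; the class body and the fallback defaults are assumptions for illustration, not the project's actual Config:

import_sketch = None  # standalone illustration only

# Hypothetical accessors over the dictionaries configured above.
class ConfigSketch:
    def __init__(self):
        self.rate_limits = {'crtsh': 5, 'shodan': 60, 'dns': 100, 'correlation': 1000}
        self.enabled_providers = {'crtsh': True, 'dns': True, 'shodan': False, 'correlation': True}

    def get_rate_limit(self, provider_name: str) -> int:
        # Assumed fallback for providers with no explicit limit.
        return self.rate_limits.get(provider_name, 60)

    def is_provider_enabled(self, provider_name: str) -> bool:
        return self.enabled_providers.get(provider_name, False)

config = ConfigSketch()
assert config.is_provider_enabled('correlation')
assert config.get_rate_limit('correlation') == 1000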
@@ -40,270 +40,6 @@ class GraphManager:
        self.graph = nx.DiGraph()
        self.creation_time = datetime.now(timezone.utc).isoformat()
        self.last_modified = self.creation_time
        self.correlation_index = {}
        # Compile regex for date filtering for efficiency
        self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')

        # FIXED: Exclude cert_issuer_name since we already create proper CA relationships
        self.EXCLUDED_KEYS = [
            # Certificate metadata that creates noise or has dedicated node types
            'cert_source',                    # Always 'crtsh' for crtsh provider
            'cert_common_name',
            'cert_validity_period_days',      # Numerical, not useful for correlation
            'cert_issuer_name',              # FIXED: Has dedicated CA nodes, don't correlate
            #'cert_certificate_id',            # Unique per certificate
            #'cert_serial_number',            # Unique per certificate
            'cert_entry_timestamp',          # Timestamp, filtered by date regex anyway
            'cert_not_before',               # Date, filtered by date regex anyway
            'cert_not_after',                # Date, filtered by date regex anyway
            # DNS metadata that creates noise
            'dns_ttl',                       # TTL values are not meaningful for correlation
            # Shodan metadata that might create noise
            'timestamp',                     # Generic timestamp fields
            'last_update',                   # Generic timestamp fields
            #'org',                          # Too generic, causes false correlations
            #'isp',                          # Too generic, causes false correlations
            # Generic noisy attributes
            'updated_timestamp',             # Any timestamp field
            'discovery_timestamp',           # Any timestamp field
            'query_timestamp',               # Any timestamp field
        ]

    def __getstate__(self):
        """Prepare GraphManager for pickling, excluding compiled regex."""
        state = self.__dict__.copy()
        # Compiled regex patterns are not always picklable
        if 'date_pattern' in state:
            del state['date_pattern']
        return state

    def __setstate__(self, state):
        """Restore GraphManager state and recompile regex."""
        self.__dict__.update(state)
        self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')

    def process_correlations_for_node(self, node_id: str):
        """
        UPDATED: Process correlations for a given node with enhanced tracking.
        Now properly tracks which attribute/provider created each correlation.
        """
        if not self.graph.has_node(node_id):
            return

        node_attributes = self.graph.nodes[node_id].get('attributes', [])

        # Process each attribute for potential correlations
        for attr in node_attributes:
            attr_name = attr.get('name')
            attr_value = attr.get('value')
            attr_provider = attr.get('provider', 'unknown')

            # IMPROVED: More comprehensive exclusion logic
            should_exclude = (
                # Check against excluded keys (exact match or substring)
                any(excluded_key in attr_name or attr_name == excluded_key for excluded_key in self.EXCLUDED_KEYS) or
                # Invalid value types
                not isinstance(attr_value, (str, int, float, bool)) or
                attr_value is None or
                # Boolean values are not useful for correlation
                isinstance(attr_value, bool) or
                # String values that are too short or are dates
                (isinstance(attr_value, str) and (
                    len(attr_value) < 4 or
                    self.date_pattern.match(attr_value) or
                    # Exclude common generic values that create noise
                    attr_value.lower() in ['unknown', 'none', 'null', 'n/a', 'true', 'false', '0', '1']
                )) or
                # Numerical values that are likely to be unique identifiers
                (isinstance(attr_value, (int, float)) and (
                    attr_value == 0 or  # Zero values are not meaningful
                    attr_value == 1 or  # One values are too common
                    abs(attr_value) > 1000000  # Very large numbers are likely IDs
                ))
            )
            if should_exclude:
                continue

            # Initialize correlation tracking for this value
            if attr_value not in self.correlation_index:
                self.correlation_index[attr_value] = {
                    'nodes': set(),
                    'sources': []  # Track which provider/attribute combinations contributed
                }

            # Add this node and source information
            self.correlation_index[attr_value]['nodes'].add(node_id)

            # Track the source of this correlation value
            source_info = {
                'node_id': node_id,
                'provider': attr_provider,
                'attribute': attr_name,
                'path': f"{attr_provider}_{attr_name}"
            }

            # Add source if not already present (avoid duplicates)
            existing_sources = [s for s in self.correlation_index[attr_value]['sources']
                              if s['node_id'] == node_id and s['path'] == source_info['path']]
            if not existing_sources:
                self.correlation_index[attr_value]['sources'].append(source_info)

            # Create correlation node if we have multiple nodes with this value
            if len(self.correlation_index[attr_value]['nodes']) > 1:
                self._create_enhanced_correlation_node_and_edges(attr_value, self.correlation_index[attr_value])

    def _create_enhanced_correlation_node_and_edges(self, value, correlation_data):
        """
        UPDATED: Create correlation node and edges with raw provider data (no formatting).
        """
        correlation_node_id = f"corr_{hash(str(value)) & 0x7FFFFFFF}"
        nodes = correlation_data['nodes']
        sources = correlation_data['sources']

        # Create or update correlation node
        if not self.graph.has_node(correlation_node_id):
            # Use raw provider/attribute data - no formatting
            provider_counts = {}
            for source in sources:
                # Keep original provider and attribute names
                key = f"{source['provider']}_{source['attribute']}"
                provider_counts[key] = provider_counts.get(key, 0) + 1

            # Use the most common provider/attribute as the primary label (raw)
            primary_source = max(provider_counts.items(), key=lambda x: x[1])[0] if provider_counts else "unknown_correlation"

            metadata = {
                'value': value,
                'correlated_nodes': list(nodes),
                'sources': sources,
                'primary_source': primary_source,
                'correlation_count': len(nodes)
            }

            self.add_node(correlation_node_id, NodeType.CORRELATION_OBJECT, metadata=metadata)
            #print(f"Created correlation node {correlation_node_id} for value '{value}' with {len(nodes)} nodes")

        # Create edges from each node to the correlation node
        for source in sources:
            node_id = source['node_id']
            provider = source['provider']
            attribute = source['attribute']

            if self.graph.has_node(node_id) and not self.graph.has_edge(node_id, correlation_node_id):
                # Format relationship label as "corr_provider_attribute"
                relationship_label = f"corr_{provider}_{attribute}"

                self.add_edge(
                    source_id=node_id,
                    target_id=correlation_node_id,
                    relationship_type=relationship_label,
                    confidence_score=0.9,
                    source_provider=provider,
                    raw_data={
                        'correlation_value': value,
                        'original_attribute': attribute,
                        'correlation_type': 'attribute_matching'
                    }
                )

                #print(f"Added correlation edge: {node_id} -> {correlation_node_id} ({relationship_label})")

    def _has_direct_edge_bidirectional(self, node_a: str, node_b: str) -> bool:
        """
        Check if there's a direct edge between two nodes in either direction.
        Returns True if node_a→node_b OR node_b→node_a exists.
        """
        return (self.graph.has_edge(node_a, node_b) or
                self.graph.has_edge(node_b, node_a))

    def _correlation_value_matches_existing_node(self, correlation_value: str) -> bool:
        """
        Check if correlation value contains any existing node ID as substring.
        Returns True if match found (correlation node should NOT be created).
        """
        correlation_str = str(correlation_value).lower()

        # Check against all existing nodes
        for existing_node_id in self.graph.nodes():
            if existing_node_id.lower() in correlation_str:
                return True

        return False

    def _find_correlation_nodes_with_same_pattern(self, node_set: set) -> List[str]:
        """
        Find existing correlation nodes that have the exact same pattern of connected nodes.
        Returns list of correlation node IDs with matching patterns.
        """
        correlation_nodes = self.get_nodes_by_type(NodeType.CORRELATION_OBJECT)
        matching_nodes = []

        for corr_node_id in correlation_nodes:
            # Get all nodes connected to this correlation node
            connected_nodes = set()

            # Add all predecessors (nodes pointing TO the correlation node)
            connected_nodes.update(self.graph.predecessors(corr_node_id))

            # Add all successors (nodes pointed TO by the correlation node)
            connected_nodes.update(self.graph.successors(corr_node_id))

            # Check if the pattern matches exactly
            if connected_nodes == node_set:
                matching_nodes.append(corr_node_id)

        return matching_nodes

    def _merge_correlation_values(self, target_node_id: str, new_value: Any, corr_data: Dict) -> None:
        """
        Merge a new correlation value into an existing correlation node.
        Uses same logic as large entity merging.
        """
        if not self.graph.has_node(target_node_id):
            return

        target_metadata = self.graph.nodes[target_node_id]['metadata']

        # Get existing values (ensure it's a list)
        existing_values = target_metadata.get('values', [])
        if not isinstance(existing_values, list):
            existing_values = [existing_values]

        # Add new value if not already present
        if new_value not in existing_values:
            existing_values.append(new_value)

        # Merge sources
        existing_sources = target_metadata.get('sources', [])
        new_sources = corr_data.get('sources', [])

        # Create set of unique sources based on (node_id, path) tuples
        source_set = set()
        for source in existing_sources + new_sources:
            source_tuple = (source['node_id'], source.get('path', ''))
            source_set.add(source_tuple)

        # Convert back to list of dictionaries
        merged_sources = [{'node_id': nid, 'path': path} for nid, path in source_set]

        # Update metadata
        target_metadata.update({
            'values': existing_values,
            'sources': merged_sources,
            'correlated_nodes': list(set(target_metadata.get('correlated_nodes', []) + corr_data.get('nodes', []))),
            'merge_count': len(existing_values),
            'last_merge_timestamp': datetime.now(timezone.utc).isoformat()
        })

        # Update description to reflect merged nature
        value_count = len(existing_values)
        node_count = len(target_metadata['correlated_nodes'])
        self.graph.nodes[target_node_id]['description'] = (
            f"Correlation container with {value_count} merged values "
            f"across {node_count} nodes"
        )

    def add_node(self, node_id: str, node_type: NodeType, attributes: Optional[List[Dict[str, Any]]] = None,
                description: str = "", metadata: Optional[Dict[str, Any]] = None) -> bool:
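Both the GraphManager correlation code shown above and the new CorrelationProvider added later in this diff derive correlation node IDs as corr_{hash(str(value)) & 0x7FFFFFFF}. One property worth keeping in mind: by default Python salts str hashes per interpreter process (PYTHONHASHSEED), so these IDs are stable within a single scan but generally differ between runs. A short sketch contrasting the scheme used here with a digest-based alternative; the digest variant is only an illustration, not part of this commit:

import hashlib

def correlation_node_id_current(value) -> str:
    # Scheme used in this diff: masked to a positive 31-bit integer, but str
    # hashes are randomized per process, so the ID can differ from run to run.
    return f"corr_{hash(str(value)) & 0x7FFFFFFF}"

def correlation_node_id_stable(value) -> str:
    # Hypothetical alternative: a content digest stays stable across runs
    # and across exported or pickled graphs.
    digest = hashlib.sha256(str(value).encode('utf-8')).hexdigest()[:8]
    return f"corr_{digest}"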
@@ -416,28 +152,6 @@ class GraphManager:
        # Remove node from the graph (NetworkX handles removing connected edges)
        self.graph.remove_node(node_id)

        # Clean up the correlation index
        keys_to_delete = []
        for value, data in self.correlation_index.items():
            if isinstance(data, dict) and 'nodes' in data:
                # Updated correlation structure
                if node_id in data['nodes']:
                    data['nodes'].discard(node_id)
                    # Remove sources for this node
                    data['sources'] = [s for s in data['sources'] if s['node_id'] != node_id]
                if not data['nodes']:  # If no other nodes are associated, remove it
                    keys_to_delete.append(value)
            else:
                # Legacy correlation structure (fallback)
                if isinstance(data, set) and node_id in data:
                    data.discard(node_id)
                if not data:
                    keys_to_delete.append(value)

        for key in keys_to_delete:
            if key in self.correlation_index:
                del self.correlation_index[key]

        self.last_modified = datetime.now(timezone.utc).isoformat()
        return True

@@ -562,8 +276,7 @@ class GraphManager:
        return stats

    def clear(self) -> None:
        """Clear all nodes, edges, and indices from the graph."""
        """Clear all nodes and edges from the graph."""
        self.graph.clear()
        self.correlation_index.clear()
        self.creation_time = datetime.now(timezone.utc).isoformat()
        self.last_modified = self.creation_time
@@ -6,6 +6,7 @@ import os
import importlib
import redis
import time
import math
import random # Imported for jitter
from typing import List, Set, Dict, Any, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor
@@ -19,6 +20,7 @@ from core.provider_result import ProviderResult
from utils.helpers import _is_valid_ip, _is_valid_domain
from utils.export_manager import export_manager
from providers.base_provider import BaseProvider
from providers.correlation_provider import CorrelationProvider
from core.rate_limiter import GlobalRateLimiter

class ScanStatus:
@@ -196,12 +198,15 @@ class Scanner:
                        attribute = getattr(module, attribute_name)
                        if isinstance(attribute, type) and issubclass(attribute, BaseProvider) and attribute is not BaseProvider:
                            provider_class = attribute
                            # FIXED: Pass the 'name' argument during initialization
                            provider = provider_class(name=attribute_name, session_config=self.config)
                            provider_name = provider.get_name()

                            if self.config.is_provider_enabled(provider_name):
                                if provider.is_available():
                                    provider.set_stop_event(self.stop_event)
                                    if isinstance(provider, CorrelationProvider):
                                        provider.set_graph_manager(self.graph)
                                    self.providers.append(provider)
                except Exception as e:
                    traceback.print_exc()
@@ -336,12 +341,20 @@ class Scanner:

    def _get_priority(self, provider_name):
        rate_limit = self.config.get_rate_limit(provider_name)
        if rate_limit > 90:
            return 1 # Highest priority
        elif rate_limit > 50:
            return 2
        else:
            return 3 # Lowest priority

        # Define the logarithmic scale
        if rate_limit < 10:
            return 10  # Highest priority number (lowest priority) for very low rate limits

        # Calculate logarithmic value and map to priority levels
        # Lower rate limits get higher priority numbers (lower priority)
        log_value = math.log10(rate_limit)
        priority = 10 - int(log_value * 2)  # Scale factor to get more granular levels

        # Ensure priority is within a reasonable range (1-10)
        priority = max(1, min(10, priority))

        return priority

    def _execute_scan(self, target: str, max_depth: int) -> None:
        """
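To make the new logarithmic mapping concrete, here is a small standalone check of the formula above, applied to the rate limits set in config.py earlier in this commit:

import math

def priority_for(rate_limit: float) -> int:
    # Mirrors the logic added above: lower rate limits get larger priority
    # numbers (lower scheduling priority), clamped to the range 1-10.
    if rate_limit < 10:
        return 10
    return max(1, min(10, 10 - int(math.log10(rate_limit) * 2)))

# With the rate limits from this commit's config:
# crtsh (5) -> 10, shodan (60) -> 7, dns (100) -> 6, correlation (1000) -> 4
for name, limit in [('crtsh', 5), ('shodan', 60), ('dns', 100), ('correlation', 1000)]:
    print(name, priority_for(limit))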
@@ -420,7 +433,7 @@ class Scanner:
                    provider = next((p for p in self.providers if p.get_name() == provider_name), None)

                    if provider:
                        new_targets, _, success = self._query_single_provider_for_target(provider, target_item, depth)
                        new_targets, _, success = self._process_provider_task(provider, target_item, depth)

                        if self._is_stop_requested(): break

@@ -482,9 +495,10 @@ class Scanner:
                self.executor.shutdown(wait=False, cancel_futures=True)
                self.executor = None

    def _query_single_provider_for_target(self, provider: BaseProvider, target: str, depth: int) -> Tuple[Set[str], Set[str], bool]:
    def _process_provider_task(self, provider: BaseProvider, target: str, depth: int) -> Tuple[Set[str], Set[str], bool]:
        """
        Query a single provider and process the unified ProviderResult.
        Manages the entire process for a given target and provider.
        It uses the "worker" function to get the data and then manages the consequences.
        """
        if self._is_stop_requested():
            return set(), set(), False
@@ -500,7 +514,7 @@ class Scanner:
        provider_successful = True

        try:
            provider_result = self._query_single_provider_unified(provider, target, is_ip, depth)
            provider_result = self._execute_provider_query(provider, target, is_ip)

            if provider_result is None:
                provider_successful = False
@@ -512,16 +526,24 @@ class Scanner:
                    large_entity_members.update(discovered)
                else:
                    new_targets.update(discovered)
                self.graph.process_correlations_for_node(target)

                # After processing a provider, queue a correlation task for the target
                correlation_provider = next((p for p in self.providers if isinstance(p, CorrelationProvider)), None)
                if correlation_provider and not isinstance(provider, CorrelationProvider):
                    priority = self._get_priority(correlation_provider.get_name())
                    self.task_queue.put((time.time(), priority, (correlation_provider.get_name(), target, depth)))
                    # FIXED: Increment total tasks when a correlation task is enqueued
                    self.total_tasks_ever_enqueued += 1

        except Exception as e:
            provider_successful = False
            self._log_provider_error(target, provider.get_name(), str(e))

        return new_targets, large_entity_members, provider_successful

    def _query_single_provider_unified(self, provider: BaseProvider, target: str, is_ip: bool, current_depth: int) -> Optional[ProviderResult]:
    def _execute_provider_query(self, provider: BaseProvider, target: str, is_ip: bool) -> Optional[ProviderResult]:
        """
        Query a single provider with stop signal checking.
        The "worker" function that directly communicates with the provider to fetch data.
        """
        provider_name = provider.get_name()
        start_time = datetime.now(timezone.utc)
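One detail of the enqueue above: each work item is a (timestamp, priority, payload) tuple, and tuples compare element by element. If the task queue is priority-ordered (for example a plain queue.PriorityQueue; the actual queue type is not shown in this hunk), the enqueue timestamp dominates and the computed priority only breaks ties between items enqueued at the exact same instant. A minimal sketch of that ordering:

import queue

q = queue.PriorityQueue()
# Hypothetical entries shaped like the ones enqueued above: (timestamp, priority, payload).
q.put((100.0, 5, ('dns', 'example.com', 1)))
q.put((101.0, 1, ('correlation', 'example.com', 1)))  # higher priority, but later timestamp
q.put((100.0, 1, ('crtsh', 'example.com', 1)))

while not q.empty():
    print(q.get())
# Output order: (100.0, 1, ...), (100.0, 5, ...), (101.0, 1, ...)
# The timestamp dominates; priority only matters for identical timestamps.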
@@ -572,16 +594,15 @@ class Scanner:
            }
            attributes_by_node[attribute.target_node].append(attr_dict)

        # Add attributes to existing nodes (important for ISP nodes to get ASN attributes)
        # FIXED: Add attributes to existing nodes AND create new nodes (like correlation nodes)
        for node_id, node_attributes_list in attributes_by_node.items():
            if self.graph.graph.has_node(node_id):
                # Node already exists, just add attributes
                if _is_valid_ip(node_id):
                    node_type = NodeType.IP
                else:
                    node_type = NodeType.DOMAIN

                self.graph.add_node(node_id, node_type, attributes=node_attributes_list)
            if provider_name == 'correlation' and not self.graph.graph.has_node(node_id):
                node_type = NodeType.CORRELATION_OBJECT
            elif _is_valid_ip(node_id):
                node_type = NodeType.IP
            else:
                node_type = NodeType.DOMAIN
            self.graph.add_node(node_id, node_type, attributes=node_attributes_list)

        # Check if this should be a large entity
        if provider_result.get_relationship_count() > self.config.large_entity_threshold:
@@ -604,6 +625,8 @@ class Scanner:
                target_type = NodeType.ISP  # ISP node for Shodan organization data
            elif provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
                target_type = NodeType.CA   # CA node for certificate issuers
            elif provider_name == 'correlation':
                target_type = NodeType.CORRELATION_OBJECT
            elif _is_valid_ip(target_node):
                target_type = NodeType.IP
            else:

@@ -7,6 +7,7 @@ from .base_provider import BaseProvider
from .crtsh_provider import CrtShProvider
from .dns_provider import DNSProvider
from .shodan_provider import ShodanProvider
from .correlation_provider import CorrelationProvider
from core.rate_limiter import GlobalRateLimiter

__all__ = [
@@ -14,7 +15,8 @@ __all__ = [
    'GlobalRateLimiter',
    'CrtShProvider',
    'DNSProvider',
    'ShodanProvider'
    'ShodanProvider',
    'CorrelationProvider'
]

__version__ = "0.0.0-rc"

providers/correlation_provider.py (new file, 178 lines)
@@ -0,0 +1,178 @@
# dnsrecon/providers/correlation_provider.py

import re
from typing import Dict, Any, List

from .base_provider import BaseProvider
from core.provider_result import ProviderResult
from core.graph_manager import NodeType, GraphManager

class CorrelationProvider(BaseProvider):
    """
    A provider that finds correlations between nodes in the graph.
    """

    def __init__(self, name: str = "correlation", session_config=None):
        """
        Initialize the correlation provider.
        """
        super().__init__(name, session_config=session_config)
        self.graph: GraphManager | None = None
        self.correlation_index = {}
        self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
        self.EXCLUDED_KEYS = [
            'cert_source',
            'cert_issuer_ca_id',
            'cert_common_name',
            'cert_validity_period_days',
            'cert_issuer_name',
            'cert_entry_timestamp',
            'cert_not_before',
            'cert_not_after',
            'dns_ttl',
            'timestamp',
            'last_update',
            'updated_timestamp',
            'discovery_timestamp',
            'query_timestamp',
        ]

    def get_name(self) -> str:
        """Return the provider name."""
        return "correlation"

    def get_display_name(self) -> str:
        """Return the provider display name for the UI."""
        return "Correlation Engine"

    def requires_api_key(self) -> bool:
        """Return True if the provider requires an API key."""
        return False

    def get_eligibility(self) -> Dict[str, bool]:
        """Return a dictionary indicating if the provider can query domains and/or IPs."""
        return {'domains': True, 'ips': True}

    def is_available(self) -> bool:
        """Check if the provider is available and properly configured."""
        return True

    def query_domain(self, domain: str) -> ProviderResult:
        """
        Query the provider for information about a domain.
        """
        return self._find_correlations(domain)

    def query_ip(self, ip: str) -> ProviderResult:
        """
        Query the provider for information about an IP address.
        """
        return self._find_correlations(ip)

    def set_graph_manager(self, graph_manager: GraphManager):
        """
        Set the graph manager for the provider to use.
        """
        self.graph = graph_manager

    def _find_correlations(self, node_id: str) -> ProviderResult:
        """
        Find correlations for a given node.
        """
        result = ProviderResult()
        # FIXED: Ensure self.graph is not None before proceeding.
        if not self.graph or not self.graph.graph.has_node(node_id):
            return result

        node_attributes = self.graph.graph.nodes[node_id].get('attributes', [])

        for attr in node_attributes:
            attr_name = attr.get('name')
            attr_value = attr.get('value')
            attr_provider = attr.get('provider', 'unknown')

            should_exclude = (
                any(excluded_key in attr_name or attr_name == excluded_key for excluded_key in self.EXCLUDED_KEYS) or
                not isinstance(attr_value, (str, int, float, bool)) or
                attr_value is None or
                isinstance(attr_value, bool) or
                (isinstance(attr_value, str) and (
                    len(attr_value) < 4 or
                    self.date_pattern.match(attr_value) or
                    attr_value.lower() in ['unknown', 'none', 'null', 'n/a', 'true', 'false', '0', '1']
                )) or
                (isinstance(attr_value, (int, float)) and (
                    attr_value == 0 or
                    attr_value == 1 or
                    abs(attr_value) > 1000000
                ))
            )

            if should_exclude:
                continue

            if attr_value not in self.correlation_index:
                self.correlation_index[attr_value] = {
                    'nodes': set(),
                    'sources': []
                }

            self.correlation_index[attr_value]['nodes'].add(node_id)

            source_info = {
                'node_id': node_id,
                'provider': attr_provider,
                'attribute': attr_name,
                'path': f"{attr_provider}_{attr_name}"
            }

            existing_sources = [s for s in self.correlation_index[attr_value]['sources']
                              if s['node_id'] == node_id and s['path'] == source_info['path']]
            if not existing_sources:
                self.correlation_index[attr_value]['sources'].append(source_info)

            if len(self.correlation_index[attr_value]['nodes']) > 1:
                self._create_correlation_relationships(attr_value, self.correlation_index[attr_value], result)
        return result

    def _create_correlation_relationships(self, value: Any, correlation_data: Dict[str, Any], result: ProviderResult):
        """
        Create correlation relationships and add them to the provider result.
        """
        correlation_node_id = f"corr_{hash(str(value)) & 0x7FFFFFFF}"
        nodes = correlation_data['nodes']
        sources = correlation_data['sources']

        # Add the correlation node as an attribute to the result
        result.add_attribute(
            target_node=correlation_node_id,
            name="correlation_value",
            value=value,
            attr_type=str(type(value)),
            provider=self.name,
            confidence=0.9,
            metadata={
                'correlated_nodes': list(nodes),
                'sources': sources,
            }
        )

        for source in sources:
            node_id = source['node_id']
            provider = source['provider']
            attribute = source['attribute']
            relationship_label = f"corr_{provider}_{attribute}"

            # Add the relationship to the result
            result.add_relationship(
                source_node=node_id,
                target_node=correlation_node_id,
                relationship_type=relationship_label,
                provider=self.name,
                confidence=0.9,
                raw_data={
                    'correlation_value': value,
                    'original_attribute': attribute,
                    'correlation_type': 'attribute_matching'
                }
            )
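The should_exclude filter in _find_correlations is the gatekeeper for what ends up in the correlation index. A stripped-down, standalone copy of the same checks, run against a few hypothetical attributes, shows which values would actually be indexed; the attribute names and values below are made up for illustration:

import re

DATE_PATTERN = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
EXCLUDED_KEYS = ['dns_ttl', 'cert_issuer_name', 'timestamp']  # abbreviated copy

def is_correlatable(name, value) -> bool:
    # Stripped-down copy of the should_exclude logic in _find_correlations above.
    if any(key in name or name == key for key in EXCLUDED_KEYS):
        return False
    if value is None or isinstance(value, bool) or not isinstance(value, (str, int, float)):
        return False
    if isinstance(value, str):
        if len(value) < 4 or DATE_PATTERN.match(value):
            return False
        if value.lower() in ['unknown', 'none', 'null', 'n/a', 'true', 'false', '0', '1']:
            return False
    if isinstance(value, (int, float)) and (value in (0, 1) or abs(value) > 1000000):
        return False
    return True

# Hypothetical attributes:
assert is_correlatable('shodan_asn', 'AS13335')                              # kept, can correlate nodes
assert not is_correlatable('dns_ttl', 300)                                   # excluded key
assert not is_correlatable('cert_not_before', '2024-01-01 00:00:00')         # date string
assert not is_correlatable('shodan_port_count', 1)                           # too-common numeric value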
@@ -27,14 +27,25 @@ class ShodanProvider(BaseProvider):
        )
        self.base_url = "https://api.shodan.io"
        self.api_key = self.config.get_api_key('shodan')
        self._is_active = self._check_api_connection()

        # Initialize cache directory
        self.cache_dir = Path('cache') / 'shodan'
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _check_api_connection(self) -> bool:
        """Checks if the Shodan API is reachable."""
        if not self.api_key:
            return False
        try:
            response = self.session.get(f"{self.base_url}/api-info?key={self.api_key}", timeout=5)
            return response.status_code == 200
        except requests.exceptions.RequestException:
            return False

    def is_available(self) -> bool:
        """Check if Shodan provider is available (has valid API key in this session)."""
        return self.api_key is not None and len(self.api_key.strip()) > 0
        return self._is_active and self.api_key is not None and len(self.api_key.strip()) > 0

    def get_name(self) -> str:
        """Return the provider name."""
@@ -96,18 +107,6 @@ class ShodanProvider(BaseProvider):
        except (json.JSONDecodeError, ValueError, KeyError):
            return "stale"

    def query_domain(self, domain: str) -> ProviderResult:
        """
        Domain queries are no longer supported for the Shodan provider.

        Args:
            domain: Domain to investigate

        Returns:
            Empty ProviderResult
        """
        return ProviderResult()

    def query_ip(self, ip: str) -> ProviderResult:
        """
        Query Shodan for information about an IP address (IPv4 or IPv6), with caching of processed data.

@@ -587,26 +587,17 @@ class GraphManager {

        // Handle merged correlation objects
        if (node.type === 'correlation_object') {
            const metadata = node.metadata || {};
            const values = metadata.values || [];
            const mergeCount = metadata.merge_count || 1;
            const correlationValueAttr = this.findAttributeByName(node.attributes, 'correlation_value');
            const value = correlationValueAttr ? correlationValueAttr.value : 'Unknown';
            const displayValue = typeof value === 'string' && value.length > 20 ? value.substring(0, 17) + '...' : value;

            if (mergeCount > 1) {
                processedNode.label = `Correlations (${mergeCount})`;
                processedNode.title = `Merged correlation container with ${mergeCount} values: ${values.slice(0, 3).join(', ')}${values.length > 3 ? '...' : ''}`;
                processedNode.borderWidth = 3;
            } else {
                const value = Array.isArray(values) && values.length > 0 ? values[0] : (metadata.value || 'Unknown');
                const displayValue = typeof value === 'string' && value.length > 20 ? value.substring(0, 17) + '...' : value;
                processedNode.label = `${displayValue}`;
                processedNode.title = `Correlation: ${value}`;
            }
            processedNode.label = `${displayValue}`;
            processedNode.title = `Correlation: ${value}`;
        }

        return processedNode;
    }

    /**
     * Process edge data with styling and metadata
     * @param {Object} edge - Raw edge data

@@ -1609,15 +1609,19 @@ class DNSReconApp {
     * UPDATED: Enhanced correlation details showing the correlated attribute clearly (no formatting)
     */
    generateCorrelationDetails(node) {
        const metadata = node.metadata || {};
        const value = metadata.value;
        const attributes = node.attributes || [];
        const correlationValueAttr = attributes.find(attr => attr.name === 'correlation_value');
        const value = correlationValueAttr ? correlationValueAttr.value : 'Unknown';

        const metadataAttr = attributes.find(attr => attr.name === 'correlation_value');
        const metadata = metadataAttr ? metadataAttr.metadata : {};
        const correlatedNodes = metadata.correlated_nodes || [];
        const sources = metadata.sources || [];

        let html = '';

        // Show what attribute is being correlated (raw names)
        const primarySource = metadata.primary_source || 'unknown';
        const primarySource = sources.length > 0 ? sources[0].attribute : 'unknown';

        html += `
            <div class="modal-section">
