correlation engine
This commit is contained in:
parent
cbfd40ee98
commit
12f834bb65
@ -33,14 +33,16 @@ class Config:
|
|||||||
self.rate_limits = {
|
self.rate_limits = {
|
||||||
'crtsh': 5,
|
'crtsh': 5,
|
||||||
'shodan': 60,
|
'shodan': 60,
|
||||||
'dns': 100
|
'dns': 100,
|
||||||
|
'correlation': 1000 # Set a high limit as it's a local operation
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- Provider Settings ---
|
# --- Provider Settings ---
|
||||||
self.enabled_providers = {
|
self.enabled_providers = {
|
||||||
'crtsh': True,
|
'crtsh': True,
|
||||||
'dns': True,
|
'dns': True,
|
||||||
'shodan': False
|
'shodan': False,
|
||||||
|
'correlation': True # Enable the new provider by default
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- Logging ---
|
# --- Logging ---
|
||||||
|
|||||||
@ -40,270 +40,6 @@ class GraphManager:
|
|||||||
self.graph = nx.DiGraph()
|
self.graph = nx.DiGraph()
|
||||||
self.creation_time = datetime.now(timezone.utc).isoformat()
|
self.creation_time = datetime.now(timezone.utc).isoformat()
|
||||||
self.last_modified = self.creation_time
|
self.last_modified = self.creation_time
|
||||||
self.correlation_index = {}
|
|
||||||
# Compile regex for date filtering for efficiency
|
|
||||||
self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
|
|
||||||
|
|
||||||
# FIXED: Exclude cert_issuer_name since we already create proper CA relationships
|
|
||||||
self.EXCLUDED_KEYS = [
|
|
||||||
# Certificate metadata that creates noise or has dedicated node types
|
|
||||||
'cert_source', # Always 'crtsh' for crtsh provider
|
|
||||||
'cert_common_name',
|
|
||||||
'cert_validity_period_days', # Numerical, not useful for correlation
|
|
||||||
'cert_issuer_name', # FIXED: Has dedicated CA nodes, don't correlate
|
|
||||||
#'cert_certificate_id', # Unique per certificate
|
|
||||||
#'cert_serial_number', # Unique per certificate
|
|
||||||
'cert_entry_timestamp', # Timestamp, filtered by date regex anyway
|
|
||||||
'cert_not_before', # Date, filtered by date regex anyway
|
|
||||||
'cert_not_after', # Date, filtered by date regex anyway
|
|
||||||
# DNS metadata that creates noise
|
|
||||||
'dns_ttl', # TTL values are not meaningful for correlation
|
|
||||||
# Shodan metadata that might create noise
|
|
||||||
'timestamp', # Generic timestamp fields
|
|
||||||
'last_update', # Generic timestamp fields
|
|
||||||
#'org', # Too generic, causes false correlations
|
|
||||||
#'isp', # Too generic, causes false correlations
|
|
||||||
# Generic noisy attributes
|
|
||||||
'updated_timestamp', # Any timestamp field
|
|
||||||
'discovery_timestamp', # Any timestamp field
|
|
||||||
'query_timestamp', # Any timestamp field
|
|
||||||
]
|
|
||||||
|
|
||||||
def __getstate__(self):
|
|
||||||
"""Prepare GraphManager for pickling, excluding compiled regex."""
|
|
||||||
state = self.__dict__.copy()
|
|
||||||
# Compiled regex patterns are not always picklable
|
|
||||||
if 'date_pattern' in state:
|
|
||||||
del state['date_pattern']
|
|
||||||
return state
|
|
||||||
|
|
||||||
def __setstate__(self, state):
|
|
||||||
"""Restore GraphManager state and recompile regex."""
|
|
||||||
self.__dict__.update(state)
|
|
||||||
self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
|
|
||||||
|
|
||||||
def process_correlations_for_node(self, node_id: str):
    """
    Index a node's attributes by value and link nodes that share a value.

    Every attribute of ``node_id`` that passes the noise filter is recorded
    in ``self.correlation_index`` under its value, together with the
    provider/attribute that produced it. As soon as a value is known for
    more than one node, ``_create_enhanced_correlation_node_and_edges`` is
    called for it.

    Args:
        node_id: ID of the node whose attributes are processed. Unknown IDs
            are ignored.
    """
    if not self.graph.has_node(node_id):
        return

    node_attributes = self.graph.nodes[node_id].get('attributes', [])
    # Placeholder strings that would correlate otherwise unrelated nodes.
    noise_values = ('unknown', 'none', 'null', 'n/a', 'true', 'false', '0', '1')

    for attr in node_attributes:
        attr_name = attr.get('name')
        attr_value = attr.get('value')
        attr_provider = attr.get('provider', 'unknown')

        # An attribute without a string name cannot be checked against the
        # exclusion list (the substring test would raise TypeError), so it
        # is skipped instead of aborting the whole node.
        if not isinstance(attr_name, str):
            continue

        # Substring match: an excluded key anywhere in the name excludes it.
        if any(excluded_key in attr_name for excluded_key in self.EXCLUDED_KEYS):
            continue

        # bool is a subclass of int, so it has to be rejected before the
        # numeric branch below; booleans are not useful for correlation.
        if isinstance(attr_value, bool):
            continue

        if isinstance(attr_value, str):
            # Too short, timestamp-like, or a generic placeholder.
            if (len(attr_value) < 4
                    or self.date_pattern.match(attr_value)
                    or attr_value.lower() in noise_values):
                continue
        elif isinstance(attr_value, (int, float)):
            # 0 and 1 are too common; very large numbers are likely IDs.
            if attr_value == 0 or attr_value == 1 or abs(attr_value) > 1000000:
                continue
        else:
            # None, containers and any other type are not correlated.
            continue

        entry = self.correlation_index.setdefault(
            attr_value,
            {
                'nodes': set(),
                'sources': [],  # provider/attribute combinations that contributed
            },
        )
        entry['nodes'].add(node_id)

        # Record where this value came from, once per (node, path).
        source_path = f"{attr_provider}_{attr_name}"
        already_recorded = any(
            s['node_id'] == node_id and s['path'] == source_path
            for s in entry['sources']
        )
        if not already_recorded:
            entry['sources'].append({
                'node_id': node_id,
                'provider': attr_provider,
                'attribute': attr_name,
                'path': source_path,
            })

        # A value seen on more than one node becomes a correlation node.
        if len(entry['nodes']) > 1:
            self._create_enhanced_correlation_node_and_edges(attr_value, entry)
|
|
||||||
|
|
||||||
def _create_enhanced_correlation_node_and_edges(self, value, correlation_data):
|
|
||||||
"""
|
|
||||||
UPDATED: Create correlation node and edges with raw provider data (no formatting).
|
|
||||||
"""
|
|
||||||
correlation_node_id = f"corr_{hash(str(value)) & 0x7FFFFFFF}"
|
|
||||||
nodes = correlation_data['nodes']
|
|
||||||
sources = correlation_data['sources']
|
|
||||||
|
|
||||||
# Create or update correlation node
|
|
||||||
if not self.graph.has_node(correlation_node_id):
|
|
||||||
# Use raw provider/attribute data - no formatting
|
|
||||||
provider_counts = {}
|
|
||||||
for source in sources:
|
|
||||||
# Keep original provider and attribute names
|
|
||||||
key = f"{source['provider']}_{source['attribute']}"
|
|
||||||
provider_counts[key] = provider_counts.get(key, 0) + 1
|
|
||||||
|
|
||||||
# Use the most common provider/attribute as the primary label (raw)
|
|
||||||
primary_source = max(provider_counts.items(), key=lambda x: x[1])[0] if provider_counts else "unknown_correlation"
|
|
||||||
|
|
||||||
metadata = {
|
|
||||||
'value': value,
|
|
||||||
'correlated_nodes': list(nodes),
|
|
||||||
'sources': sources,
|
|
||||||
'primary_source': primary_source,
|
|
||||||
'correlation_count': len(nodes)
|
|
||||||
}
|
|
||||||
|
|
||||||
self.add_node(correlation_node_id, NodeType.CORRELATION_OBJECT, metadata=metadata)
|
|
||||||
#print(f"Created correlation node {correlation_node_id} for value '{value}' with {len(nodes)} nodes")
|
|
||||||
|
|
||||||
# Create edges from each node to the correlation node
|
|
||||||
for source in sources:
|
|
||||||
node_id = source['node_id']
|
|
||||||
provider = source['provider']
|
|
||||||
attribute = source['attribute']
|
|
||||||
|
|
||||||
if self.graph.has_node(node_id) and not self.graph.has_edge(node_id, correlation_node_id):
|
|
||||||
# Format relationship label as "corr_provider_attribute"
|
|
||||||
relationship_label = f"corr_{provider}_{attribute}"
|
|
||||||
|
|
||||||
self.add_edge(
|
|
||||||
source_id=node_id,
|
|
||||||
target_id=correlation_node_id,
|
|
||||||
relationship_type=relationship_label,
|
|
||||||
confidence_score=0.9,
|
|
||||||
source_provider=provider,
|
|
||||||
raw_data={
|
|
||||||
'correlation_value': value,
|
|
||||||
'original_attribute': attribute,
|
|
||||||
'correlation_type': 'attribute_matching'
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
#print(f"Added correlation edge: {node_id} -> {correlation_node_id} ({relationship_label})")
|
|
||||||
|
|
||||||
|
|
||||||
def _has_direct_edge_bidirectional(self, node_a: str, node_b: str) -> bool:
|
|
||||||
"""
|
|
||||||
Check if there's a direct edge between two nodes in either direction.
|
|
||||||
Returns True if node_aâ†'node_b OR node_bâ†'node_a exists.
|
|
||||||
"""
|
|
||||||
return (self.graph.has_edge(node_a, node_b) or
|
|
||||||
self.graph.has_edge(node_b, node_a))
|
|
||||||
|
|
||||||
def _correlation_value_matches_existing_node(self, correlation_value: str) -> bool:
|
|
||||||
"""
|
|
||||||
Check if correlation value contains any existing node ID as substring.
|
|
||||||
Returns True if match found (correlation node should NOT be created).
|
|
||||||
"""
|
|
||||||
correlation_str = str(correlation_value).lower()
|
|
||||||
|
|
||||||
# Check against all existing nodes
|
|
||||||
for existing_node_id in self.graph.nodes():
|
|
||||||
if existing_node_id.lower() in correlation_str:
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _find_correlation_nodes_with_same_pattern(self, node_set: set) -> List[str]:
    """
    Collect the correlation nodes whose neighbourhood equals ``node_set``.

    A correlation node's neighbourhood is the union of its predecessors
    (nodes with an edge pointing to it) and its successors (nodes it points
    to). Only an exact set match counts.

    Returns the list of matching correlation node IDs, in the order
    ``get_nodes_by_type`` yields them.
    """
    def neighbourhood(corr_node_id):
        # Edges in both directions count as a connection.
        incoming = set(self.graph.predecessors(corr_node_id))
        outgoing = set(self.graph.successors(corr_node_id))
        return incoming | outgoing

    return [
        corr_node_id
        for corr_node_id in self.get_nodes_by_type(NodeType.CORRELATION_OBJECT)
        if neighbourhood(corr_node_id) == node_set
    ]
|
|
||||||
|
|
||||||
def _merge_correlation_values(self, target_node_id: str, new_value: Any, corr_data: Dict) -> None:
|
|
||||||
"""
|
|
||||||
Merge a new correlation value into an existing correlation node.
|
|
||||||
Uses same logic as large entity merging.
|
|
||||||
"""
|
|
||||||
if not self.graph.has_node(target_node_id):
|
|
||||||
return
|
|
||||||
|
|
||||||
target_metadata = self.graph.nodes[target_node_id]['metadata']
|
|
||||||
|
|
||||||
# Get existing values (ensure it's a list)
|
|
||||||
existing_values = target_metadata.get('values', [])
|
|
||||||
if not isinstance(existing_values, list):
|
|
||||||
existing_values = [existing_values]
|
|
||||||
|
|
||||||
# Add new value if not already present
|
|
||||||
if new_value not in existing_values:
|
|
||||||
existing_values.append(new_value)
|
|
||||||
|
|
||||||
# Merge sources
|
|
||||||
existing_sources = target_metadata.get('sources', [])
|
|
||||||
new_sources = corr_data.get('sources', [])
|
|
||||||
|
|
||||||
# Create set of unique sources based on (node_id, path) tuples
|
|
||||||
source_set = set()
|
|
||||||
for source in existing_sources + new_sources:
|
|
||||||
source_tuple = (source['node_id'], source.get('path', ''))
|
|
||||||
source_set.add(source_tuple)
|
|
||||||
|
|
||||||
# Convert back to list of dictionaries
|
|
||||||
merged_sources = [{'node_id': nid, 'path': path} for nid, path in source_set]
|
|
||||||
|
|
||||||
# Update metadata
|
|
||||||
target_metadata.update({
|
|
||||||
'values': existing_values,
|
|
||||||
'sources': merged_sources,
|
|
||||||
'correlated_nodes': list(set(target_metadata.get('correlated_nodes', []) + corr_data.get('nodes', []))),
|
|
||||||
'merge_count': len(existing_values),
|
|
||||||
'last_merge_timestamp': datetime.now(timezone.utc).isoformat()
|
|
||||||
})
|
|
||||||
|
|
||||||
# Update description to reflect merged nature
|
|
||||||
value_count = len(existing_values)
|
|
||||||
node_count = len(target_metadata['correlated_nodes'])
|
|
||||||
self.graph.nodes[target_node_id]['description'] = (
|
|
||||||
f"Correlation container with {value_count} merged values "
|
|
||||||
f"across {node_count} nodes"
|
|
||||||
)
|
|
||||||
|
|
||||||
def add_node(self, node_id: str, node_type: NodeType, attributes: Optional[List[Dict[str, Any]]] = None,
|
def add_node(self, node_id: str, node_type: NodeType, attributes: Optional[List[Dict[str, Any]]] = None,
|
||||||
description: str = "", metadata: Optional[Dict[str, Any]] = None) -> bool:
|
description: str = "", metadata: Optional[Dict[str, Any]] = None) -> bool:
|
||||||
@ -416,28 +152,6 @@ class GraphManager:
|
|||||||
# Remove node from the graph (NetworkX handles removing connected edges)
|
# Remove node from the graph (NetworkX handles removing connected edges)
|
||||||
self.graph.remove_node(node_id)
|
self.graph.remove_node(node_id)
|
||||||
|
|
||||||
# Clean up the correlation index
|
|
||||||
keys_to_delete = []
|
|
||||||
for value, data in self.correlation_index.items():
|
|
||||||
if isinstance(data, dict) and 'nodes' in data:
|
|
||||||
# Updated correlation structure
|
|
||||||
if node_id in data['nodes']:
|
|
||||||
data['nodes'].discard(node_id)
|
|
||||||
# Remove sources for this node
|
|
||||||
data['sources'] = [s for s in data['sources'] if s['node_id'] != node_id]
|
|
||||||
if not data['nodes']: # If no other nodes are associated, remove it
|
|
||||||
keys_to_delete.append(value)
|
|
||||||
else:
|
|
||||||
# Legacy correlation structure (fallback)
|
|
||||||
if isinstance(data, set) and node_id in data:
|
|
||||||
data.discard(node_id)
|
|
||||||
if not data:
|
|
||||||
keys_to_delete.append(value)
|
|
||||||
|
|
||||||
for key in keys_to_delete:
|
|
||||||
if key in self.correlation_index:
|
|
||||||
del self.correlation_index[key]
|
|
||||||
|
|
||||||
self.last_modified = datetime.now(timezone.utc).isoformat()
|
self.last_modified = datetime.now(timezone.utc).isoformat()
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -562,8 +276,7 @@ class GraphManager:
|
|||||||
return stats
|
return stats
|
||||||
|
|
||||||
def clear(self) -> None:
|
def clear(self) -> None:
|
||||||
"""Clear all nodes, edges, and indices from the graph."""
|
"""Clear all nodes and edges from the graph."""
|
||||||
self.graph.clear()
|
self.graph.clear()
|
||||||
self.correlation_index.clear()
|
|
||||||
self.creation_time = datetime.now(timezone.utc).isoformat()
|
self.creation_time = datetime.now(timezone.utc).isoformat()
|
||||||
self.last_modified = self.creation_time
|
self.last_modified = self.creation_time
|
||||||
@ -6,6 +6,7 @@ import os
|
|||||||
import importlib
|
import importlib
|
||||||
import redis
|
import redis
|
||||||
import time
|
import time
|
||||||
|
import math
|
||||||
import random # Imported for jitter
|
import random # Imported for jitter
|
||||||
from typing import List, Set, Dict, Any, Tuple, Optional
|
from typing import List, Set, Dict, Any, Tuple, Optional
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
@ -19,6 +20,7 @@ from core.provider_result import ProviderResult
|
|||||||
from utils.helpers import _is_valid_ip, _is_valid_domain
|
from utils.helpers import _is_valid_ip, _is_valid_domain
|
||||||
from utils.export_manager import export_manager
|
from utils.export_manager import export_manager
|
||||||
from providers.base_provider import BaseProvider
|
from providers.base_provider import BaseProvider
|
||||||
|
from providers.correlation_provider import CorrelationProvider
|
||||||
from core.rate_limiter import GlobalRateLimiter
|
from core.rate_limiter import GlobalRateLimiter
|
||||||
|
|
||||||
class ScanStatus:
|
class ScanStatus:
|
||||||
@ -196,12 +198,15 @@ class Scanner:
|
|||||||
attribute = getattr(module, attribute_name)
|
attribute = getattr(module, attribute_name)
|
||||||
if isinstance(attribute, type) and issubclass(attribute, BaseProvider) and attribute is not BaseProvider:
|
if isinstance(attribute, type) and issubclass(attribute, BaseProvider) and attribute is not BaseProvider:
|
||||||
provider_class = attribute
|
provider_class = attribute
|
||||||
|
# FIXED: Pass the 'name' argument during initialization
|
||||||
provider = provider_class(name=attribute_name, session_config=self.config)
|
provider = provider_class(name=attribute_name, session_config=self.config)
|
||||||
provider_name = provider.get_name()
|
provider_name = provider.get_name()
|
||||||
|
|
||||||
if self.config.is_provider_enabled(provider_name):
|
if self.config.is_provider_enabled(provider_name):
|
||||||
if provider.is_available():
|
if provider.is_available():
|
||||||
provider.set_stop_event(self.stop_event)
|
provider.set_stop_event(self.stop_event)
|
||||||
|
if isinstance(provider, CorrelationProvider):
|
||||||
|
provider.set_graph_manager(self.graph)
|
||||||
self.providers.append(provider)
|
self.providers.append(provider)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
@ -336,12 +341,20 @@ class Scanner:
|
|||||||
|
|
||||||
def _get_priority(self, provider_name):
|
def _get_priority(self, provider_name):
|
||||||
rate_limit = self.config.get_rate_limit(provider_name)
|
rate_limit = self.config.get_rate_limit(provider_name)
|
||||||
if rate_limit > 90:
|
|
||||||
return 1 # Highest priority
|
# Define the logarithmic scale
|
||||||
elif rate_limit > 50:
|
if rate_limit < 10:
|
||||||
return 2
|
return 10 # Highest priority number (lowest priority) for very low rate limits
|
||||||
else:
|
|
||||||
return 3 # Lowest priority
|
# Calculate logarithmic value and map to priority levels
|
||||||
|
# Lower rate limits get higher priority numbers (lower priority)
|
||||||
|
log_value = math.log10(rate_limit)
|
||||||
|
priority = 10 - int(log_value * 2) # Scale factor to get more granular levels
|
||||||
|
|
||||||
|
# Ensure priority is within a reasonable range (1-10)
|
||||||
|
priority = max(1, min(10, priority))
|
||||||
|
|
||||||
|
return priority
|
||||||
|
|
||||||
def _execute_scan(self, target: str, max_depth: int) -> None:
|
def _execute_scan(self, target: str, max_depth: int) -> None:
|
||||||
"""
|
"""
|
||||||
@ -420,7 +433,7 @@ class Scanner:
|
|||||||
provider = next((p for p in self.providers if p.get_name() == provider_name), None)
|
provider = next((p for p in self.providers if p.get_name() == provider_name), None)
|
||||||
|
|
||||||
if provider:
|
if provider:
|
||||||
new_targets, _, success = self._query_single_provider_for_target(provider, target_item, depth)
|
new_targets, _, success = self._process_provider_task(provider, target_item, depth)
|
||||||
|
|
||||||
if self._is_stop_requested(): break
|
if self._is_stop_requested(): break
|
||||||
|
|
||||||
@ -482,9 +495,10 @@ class Scanner:
|
|||||||
self.executor.shutdown(wait=False, cancel_futures=True)
|
self.executor.shutdown(wait=False, cancel_futures=True)
|
||||||
self.executor = None
|
self.executor = None
|
||||||
|
|
||||||
def _query_single_provider_for_target(self, provider: BaseProvider, target: str, depth: int) -> Tuple[Set[str], Set[str], bool]:
|
def _process_provider_task(self, provider: BaseProvider, target: str, depth: int) -> Tuple[Set[str], Set[str], bool]:
|
||||||
"""
|
"""
|
||||||
Query a single provider and process the unified ProviderResult.
|
Manages the entire process for a given target and provider.
|
||||||
|
It uses the "worker" function to get the data and then manages the consequences.
|
||||||
"""
|
"""
|
||||||
if self._is_stop_requested():
|
if self._is_stop_requested():
|
||||||
return set(), set(), False
|
return set(), set(), False
|
||||||
@ -500,7 +514,7 @@ class Scanner:
|
|||||||
provider_successful = True
|
provider_successful = True
|
||||||
|
|
||||||
try:
|
try:
|
||||||
provider_result = self._query_single_provider_unified(provider, target, is_ip, depth)
|
provider_result = self._execute_provider_query(provider, target, is_ip)
|
||||||
|
|
||||||
if provider_result is None:
|
if provider_result is None:
|
||||||
provider_successful = False
|
provider_successful = False
|
||||||
@ -512,16 +526,24 @@ class Scanner:
|
|||||||
large_entity_members.update(discovered)
|
large_entity_members.update(discovered)
|
||||||
else:
|
else:
|
||||||
new_targets.update(discovered)
|
new_targets.update(discovered)
|
||||||
self.graph.process_correlations_for_node(target)
|
|
||||||
|
# After processing a provider, queue a correlation task for the target
|
||||||
|
correlation_provider = next((p for p in self.providers if isinstance(p, CorrelationProvider)), None)
|
||||||
|
if correlation_provider and not isinstance(provider, CorrelationProvider):
|
||||||
|
priority = self._get_priority(correlation_provider.get_name())
|
||||||
|
self.task_queue.put((time.time(), priority, (correlation_provider.get_name(), target, depth)))
|
||||||
|
# FIXED: Increment total tasks when a correlation task is enqueued
|
||||||
|
self.total_tasks_ever_enqueued += 1
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
provider_successful = False
|
provider_successful = False
|
||||||
self._log_provider_error(target, provider.get_name(), str(e))
|
self._log_provider_error(target, provider.get_name(), str(e))
|
||||||
|
|
||||||
return new_targets, large_entity_members, provider_successful
|
return new_targets, large_entity_members, provider_successful
|
||||||
|
|
||||||
def _query_single_provider_unified(self, provider: BaseProvider, target: str, is_ip: bool, current_depth: int) -> Optional[ProviderResult]:
|
def _execute_provider_query(self, provider: BaseProvider, target: str, is_ip: bool) -> Optional[ProviderResult]:
|
||||||
"""
|
"""
|
||||||
Query a single provider with stop signal checking.
|
The "worker" function that directly communicates with the provider to fetch data.
|
||||||
"""
|
"""
|
||||||
provider_name = provider.get_name()
|
provider_name = provider.get_name()
|
||||||
start_time = datetime.now(timezone.utc)
|
start_time = datetime.now(timezone.utc)
|
||||||
@ -572,15 +594,14 @@ class Scanner:
|
|||||||
}
|
}
|
||||||
attributes_by_node[attribute.target_node].append(attr_dict)
|
attributes_by_node[attribute.target_node].append(attr_dict)
|
||||||
|
|
||||||
# Add attributes to existing nodes (important for ISP nodes to get ASN attributes)
|
# FIXED: Add attributes to existing nodes AND create new nodes (like correlation nodes)
|
||||||
for node_id, node_attributes_list in attributes_by_node.items():
|
for node_id, node_attributes_list in attributes_by_node.items():
|
||||||
if self.graph.graph.has_node(node_id):
|
if provider_name == 'correlation' and not self.graph.graph.has_node(node_id):
|
||||||
# Node already exists, just add attributes
|
node_type = NodeType.CORRELATION_OBJECT
|
||||||
if _is_valid_ip(node_id):
|
elif _is_valid_ip(node_id):
|
||||||
node_type = NodeType.IP
|
node_type = NodeType.IP
|
||||||
else:
|
else:
|
||||||
node_type = NodeType.DOMAIN
|
node_type = NodeType.DOMAIN
|
||||||
|
|
||||||
self.graph.add_node(node_id, node_type, attributes=node_attributes_list)
|
self.graph.add_node(node_id, node_type, attributes=node_attributes_list)
|
||||||
|
|
||||||
# Check if this should be a large entity
|
# Check if this should be a large entity
|
||||||
@ -604,6 +625,8 @@ class Scanner:
|
|||||||
target_type = NodeType.ISP # ISP node for Shodan organization data
|
target_type = NodeType.ISP # ISP node for Shodan organization data
|
||||||
elif provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
|
elif provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
|
||||||
target_type = NodeType.CA # CA node for certificate issuers
|
target_type = NodeType.CA # CA node for certificate issuers
|
||||||
|
elif provider_name == 'correlation':
|
||||||
|
target_type = NodeType.CORRELATION_OBJECT
|
||||||
elif _is_valid_ip(target_node):
|
elif _is_valid_ip(target_node):
|
||||||
target_type = NodeType.IP
|
target_type = NodeType.IP
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -7,6 +7,7 @@ from .base_provider import BaseProvider
|
|||||||
from .crtsh_provider import CrtShProvider
|
from .crtsh_provider import CrtShProvider
|
||||||
from .dns_provider import DNSProvider
|
from .dns_provider import DNSProvider
|
||||||
from .shodan_provider import ShodanProvider
|
from .shodan_provider import ShodanProvider
|
||||||
|
from .correlation_provider import CorrelationProvider
|
||||||
from core.rate_limiter import GlobalRateLimiter
|
from core.rate_limiter import GlobalRateLimiter
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@ -14,7 +15,8 @@ __all__ = [
|
|||||||
'GlobalRateLimiter',
|
'GlobalRateLimiter',
|
||||||
'CrtShProvider',
|
'CrtShProvider',
|
||||||
'DNSProvider',
|
'DNSProvider',
|
||||||
'ShodanProvider'
|
'ShodanProvider',
|
||||||
|
'CorrelationProvider'
|
||||||
]
|
]
|
||||||
|
|
||||||
__version__ = "0.0.0-rc"
|
__version__ = "0.0.0-rc"
|
||||||
178
providers/correlation_provider.py
Normal file
178
providers/correlation_provider.py
Normal file
@ -0,0 +1,178 @@
|
|||||||
|
# dnsrecon/providers/correlation_provider.py
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import Dict, Any, List
|
||||||
|
|
||||||
|
from .base_provider import BaseProvider
|
||||||
|
from core.provider_result import ProviderResult
|
||||||
|
from core.graph_manager import NodeType, GraphManager
|
||||||
|
|
||||||
|
class CorrelationProvider(BaseProvider):
|
||||||
|
"""
|
||||||
|
A provider that finds correlations between nodes in the graph.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, name: str = "correlation", session_config=None):
|
||||||
|
"""
|
||||||
|
Initialize the correlation provider.
|
||||||
|
"""
|
||||||
|
super().__init__(name, session_config=session_config)
|
||||||
|
self.graph: GraphManager | None = None
|
||||||
|
self.correlation_index = {}
|
||||||
|
self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
|
||||||
|
self.EXCLUDED_KEYS = [
|
||||||
|
'cert_source',
|
||||||
|
'cert_issuer_ca_id',
|
||||||
|
'cert_common_name',
|
||||||
|
'cert_validity_period_days',
|
||||||
|
'cert_issuer_name',
|
||||||
|
'cert_entry_timestamp',
|
||||||
|
'cert_not_before',
|
||||||
|
'cert_not_after',
|
||||||
|
'dns_ttl',
|
||||||
|
'timestamp',
|
||||||
|
'last_update',
|
||||||
|
'updated_timestamp',
|
||||||
|
'discovery_timestamp',
|
||||||
|
'query_timestamp',
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_name(self) -> str:
|
||||||
|
"""Return the provider name."""
|
||||||
|
return "correlation"
|
||||||
|
|
||||||
|
def get_display_name(self) -> str:
|
||||||
|
"""Return the provider display name for the UI."""
|
||||||
|
return "Correlation Engine"
|
||||||
|
|
||||||
|
def requires_api_key(self) -> bool:
|
||||||
|
"""Return True if the provider requires an API key."""
|
||||||
|
return False
|
||||||
|
|
||||||
|
def get_eligibility(self) -> Dict[str, bool]:
|
||||||
|
"""Return a dictionary indicating if the provider can query domains and/or IPs."""
|
||||||
|
return {'domains': True, 'ips': True}
|
||||||
|
|
||||||
|
def is_available(self) -> bool:
|
||||||
|
"""Check if the provider is available and properly configured."""
|
||||||
|
return True
|
||||||
|
|
||||||
|
def query_domain(self, domain: str) -> ProviderResult:
|
||||||
|
"""
|
||||||
|
Query the provider for information about a domain.
|
||||||
|
"""
|
||||||
|
return self._find_correlations(domain)
|
||||||
|
|
||||||
|
def query_ip(self, ip: str) -> ProviderResult:
|
||||||
|
"""
|
||||||
|
Query the provider for information about an IP address.
|
||||||
|
"""
|
||||||
|
return self._find_correlations(ip)
|
||||||
|
|
||||||
|
def set_graph_manager(self, graph_manager: GraphManager):
|
||||||
|
"""
|
||||||
|
Set the graph manager for the provider to use.
|
||||||
|
"""
|
||||||
|
self.graph = graph_manager
|
||||||
|
|
||||||
|
def _find_correlations(self, node_id: str) -> ProviderResult:
|
||||||
|
"""
|
||||||
|
Find correlations for a given node.
|
||||||
|
"""
|
||||||
|
result = ProviderResult()
|
||||||
|
# FIXED: Ensure self.graph is not None before proceeding.
|
||||||
|
if not self.graph or not self.graph.graph.has_node(node_id):
|
||||||
|
return result
|
||||||
|
|
||||||
|
node_attributes = self.graph.graph.nodes[node_id].get('attributes', [])
|
||||||
|
|
||||||
|
for attr in node_attributes:
|
||||||
|
attr_name = attr.get('name')
|
||||||
|
attr_value = attr.get('value')
|
||||||
|
attr_provider = attr.get('provider', 'unknown')
|
||||||
|
|
||||||
|
should_exclude = (
|
||||||
|
any(excluded_key in attr_name or attr_name == excluded_key for excluded_key in self.EXCLUDED_KEYS) or
|
||||||
|
not isinstance(attr_value, (str, int, float, bool)) or
|
||||||
|
attr_value is None or
|
||||||
|
isinstance(attr_value, bool) or
|
||||||
|
(isinstance(attr_value, str) and (
|
||||||
|
len(attr_value) < 4 or
|
||||||
|
self.date_pattern.match(attr_value) or
|
||||||
|
attr_value.lower() in ['unknown', 'none', 'null', 'n/a', 'true', 'false', '0', '1']
|
||||||
|
)) or
|
||||||
|
(isinstance(attr_value, (int, float)) and (
|
||||||
|
attr_value == 0 or
|
||||||
|
attr_value == 1 or
|
||||||
|
abs(attr_value) > 1000000
|
||||||
|
))
|
||||||
|
)
|
||||||
|
|
||||||
|
if should_exclude:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if attr_value not in self.correlation_index:
|
||||||
|
self.correlation_index[attr_value] = {
|
||||||
|
'nodes': set(),
|
||||||
|
'sources': []
|
||||||
|
}
|
||||||
|
|
||||||
|
self.correlation_index[attr_value]['nodes'].add(node_id)
|
||||||
|
|
||||||
|
source_info = {
|
||||||
|
'node_id': node_id,
|
||||||
|
'provider': attr_provider,
|
||||||
|
'attribute': attr_name,
|
||||||
|
'path': f"{attr_provider}_{attr_name}"
|
||||||
|
}
|
||||||
|
|
||||||
|
existing_sources = [s for s in self.correlation_index[attr_value]['sources']
|
||||||
|
if s['node_id'] == node_id and s['path'] == source_info['path']]
|
||||||
|
if not existing_sources:
|
||||||
|
self.correlation_index[attr_value]['sources'].append(source_info)
|
||||||
|
|
||||||
|
if len(self.correlation_index[attr_value]['nodes']) > 1:
|
||||||
|
self._create_correlation_relationships(attr_value, self.correlation_index[attr_value], result)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def _create_correlation_relationships(self, value: Any, correlation_data: Dict[str, Any], result: ProviderResult):
    """
    Create correlation relationships and add them to the provider result.

    One attribute named ``correlation_value`` is attached to a synthetic
    correlation node, and one relationship is added from every recorded
    source node to that correlation node.

    Args:
        value: The attribute value the correlation is keyed on.
        correlation_data: Mapping with a ``'nodes'`` collection of node ids
            and a ``'sources'`` list of dicts carrying ``'node_id'``,
            ``'provider'`` and ``'attribute'``.
        result: ProviderResult that receives the attribute and relationships.
    """
    import hashlib  # local import so the method does not depend on file-level imports

    # The built-in hash() of a str is salted per interpreter process, so it
    # would hand the same value a different node id on every run. A digest
    # keeps the id reproducible while preserving the "corr_<31-bit int>" shape.
    digest = hashlib.sha256(str(value).encode('utf-8', 'surrogatepass')).digest()
    correlation_node_id = f"corr_{int.from_bytes(digest[:4], 'big') & 0x7FFFFFFF}"

    nodes = correlation_data['nodes']
    sources = correlation_data['sources']

    # Add the correlation node as an attribute to the result
    result.add_attribute(
        target_node=correlation_node_id,
        name="correlation_value",
        value=value,
        attr_type=str(type(value)),
        provider=self.name,
        confidence=0.9,
        metadata={
            'correlated_nodes': list(nodes),
            'sources': sources,
        }
    )

    for source in sources:
        attribute = source['attribute']

        # Add the relationship to the result; the label records which
        # provider/attribute pair produced the match.
        result.add_relationship(
            source_node=source['node_id'],
            target_node=correlation_node_id,
            relationship_type=f"corr_{source['provider']}_{attribute}",
            provider=self.name,
            confidence=0.9,
            raw_data={
                'correlation_value': value,
                'original_attribute': attribute,
                'correlation_type': 'attribute_matching'
            }
        )
|
||||||
@ -27,14 +27,25 @@ class ShodanProvider(BaseProvider):
|
|||||||
)
|
)
|
||||||
self.base_url = "https://api.shodan.io"
|
self.base_url = "https://api.shodan.io"
|
||||||
self.api_key = self.config.get_api_key('shodan')
|
self.api_key = self.config.get_api_key('shodan')
|
||||||
|
self._is_active = self._check_api_connection()
|
||||||
|
|
||||||
# Initialize cache directory
|
# Initialize cache directory
|
||||||
self.cache_dir = Path('cache') / 'shodan'
|
self.cache_dir = Path('cache') / 'shodan'
|
||||||
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
def _check_api_connection(self) -> bool:
    """
    Probe the Shodan ``api-info`` endpoint with the configured API key.

    Returns:
        True only when a key is set and the endpoint answers with HTTP 200;
        False when no key is set or the request raises a ``requests``
        exception.
    """
    if self.api_key:
        try:
            probe = self.session.get(f"{self.base_url}/api-info?key={self.api_key}", timeout=5)
        except requests.exceptions.RequestException:
            # Any transport-level failure counts as "not reachable".
            return False
        return probe.status_code == 200
    return False
|
||||||
|
|
||||||
def is_available(self) -> bool:
|
def is_available(self) -> bool:
|
||||||
"""Check if Shodan provider is available (has valid API key in this session)."""
|
"""Check if Shodan provider is available (has valid API key in this session)."""
|
||||||
return self.api_key is not None and len(self.api_key.strip()) > 0
|
return self._is_active and self.api_key is not None and len(self.api_key.strip()) > 0
|
||||||
|
|
||||||
def get_name(self) -> str:
|
def get_name(self) -> str:
|
||||||
"""Return the provider name."""
|
"""Return the provider name."""
|
||||||
@ -96,18 +107,6 @@ class ShodanProvider(BaseProvider):
|
|||||||
except (json.JSONDecodeError, ValueError, KeyError):
|
except (json.JSONDecodeError, ValueError, KeyError):
|
||||||
return "stale"
|
return "stale"
|
||||||
|
|
||||||
def query_domain(self, domain: str) -> ProviderResult:
    """
    Domain lookups are not supported by the Shodan provider.

    Args:
        domain: Domain to investigate (not used).

    Returns:
        A newly constructed ProviderResult with nothing added to it.
    """
    empty_result = ProviderResult()
    return empty_result
|
|
||||||
|
|
||||||
def query_ip(self, ip: str) -> ProviderResult:
|
def query_ip(self, ip: str) -> ProviderResult:
|
||||||
"""
|
"""
|
||||||
Query Shodan for information about an IP address (IPv4 or IPv6), with caching of processed data.
|
Query Shodan for information about an IP address (IPv4 or IPv6), with caching of processed data.
|
||||||
|
|||||||
@ -587,26 +587,17 @@ class GraphManager {
|
|||||||
|
|
||||||
// Handle merged correlation objects
|
// Handle merged correlation objects
|
||||||
if (node.type === 'correlation_object') {
|
if (node.type === 'correlation_object') {
|
||||||
const metadata = node.metadata || {};
|
const correlationValueAttr = this.findAttributeByName(node.attributes, 'correlation_value');
|
||||||
const values = metadata.values || [];
|
const value = correlationValueAttr ? correlationValueAttr.value : 'Unknown';
|
||||||
const mergeCount = metadata.merge_count || 1;
|
|
||||||
|
|
||||||
if (mergeCount > 1) {
|
|
||||||
processedNode.label = `Correlations (${mergeCount})`;
|
|
||||||
processedNode.title = `Merged correlation container with ${mergeCount} values: ${values.slice(0, 3).join(', ')}${values.length > 3 ? '...' : ''}`;
|
|
||||||
processedNode.borderWidth = 3;
|
|
||||||
} else {
|
|
||||||
const value = Array.isArray(values) && values.length > 0 ? values[0] : (metadata.value || 'Unknown');
|
|
||||||
const displayValue = typeof value === 'string' && value.length > 20 ? value.substring(0, 17) + '...' : value;
|
const displayValue = typeof value === 'string' && value.length > 20 ? value.substring(0, 17) + '...' : value;
|
||||||
|
|
||||||
processedNode.label = `${displayValue}`;
|
processedNode.label = `${displayValue}`;
|
||||||
processedNode.title = `Correlation: ${value}`;
|
processedNode.title = `Correlation: ${value}`;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return processedNode;
|
return processedNode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Process edge data with styling and metadata
|
* Process edge data with styling and metadata
|
||||||
* @param {Object} edge - Raw edge data
|
* @param {Object} edge - Raw edge data
|
||||||
|
|||||||
@ -1609,15 +1609,19 @@ class DNSReconApp {
|
|||||||
* UPDATED: Enhanced correlation details showing the correlated attribute clearly (no formatting)
|
* UPDATED: Enhanced correlation details showing the correlated attribute clearly (no formatting)
|
||||||
*/
|
*/
|
||||||
generateCorrelationDetails(node) {
|
generateCorrelationDetails(node) {
|
||||||
const metadata = node.metadata || {};
|
const attributes = node.attributes || [];
|
||||||
const value = metadata.value;
|
const correlationValueAttr = attributes.find(attr => attr.name === 'correlation_value');
|
||||||
|
const value = correlationValueAttr ? correlationValueAttr.value : 'Unknown';
|
||||||
|
|
||||||
|
const metadataAttr = attributes.find(attr => attr.name === 'correlation_value');
|
||||||
|
const metadata = metadataAttr ? metadataAttr.metadata : {};
|
||||||
const correlatedNodes = metadata.correlated_nodes || [];
|
const correlatedNodes = metadata.correlated_nodes || [];
|
||||||
const sources = metadata.sources || [];
|
const sources = metadata.sources || [];
|
||||||
|
|
||||||
let html = '';
|
let html = '';
|
||||||
|
|
||||||
// Show what attribute is being correlated (raw names)
|
// Show what attribute is being correlated (raw names)
|
||||||
const primarySource = metadata.primary_source || 'unknown';
|
const primarySource = sources.length > 0 ? sources[0].attribute : 'unknown';
|
||||||
|
|
||||||
html += `
|
html += `
|
||||||
<div class="modal-section">
|
<div class="modal-section">
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user