This commit is contained in:
overcuriousity
2025-09-14 01:21:38 +02:00
parent b7a57f1552
commit 2185177a84
4 changed files with 267 additions and 58 deletions

View File

@@ -131,7 +131,7 @@ class GraphManager:
return all_correlations
def add_node(self, node_id: str, node_type: NodeType, attributes: Optional[Dict[str, Any]] = None,
description: str = "", metadata: Optional[Dict[str, Any]] = None) -> bool:
description: str = "", metadata: Optional[Dict[str, Any]] = None) -> bool:
"""Add a node to the graph, update attributes, and process correlations."""
is_new_node = not self.graph.has_node(node_id)
if is_new_node:
@@ -157,51 +157,162 @@ class GraphManager:
correlations = self._check_for_correlations(node_id, attributes)
for corr in correlations:
value = corr['value']
found_major_node_id = None
if isinstance(value, str):
# Check if the value contains ANY existing major node ID from the entire graph
for existing_node in self.graph.nodes():
# Ensure the existing_node is a major type (domain/ip/asn) and is a substring of the correlation value
if (self.graph.nodes[existing_node].get('type') in [NodeType.DOMAIN.value, NodeType.IP.value, NodeType.ASN.value] and
existing_node in value):
found_major_node_id = existing_node
break # Found a major node, no need to check further
if found_major_node_id:
# An existing major node is part of the value; link to it directly.
for c_node_id in set(corr['nodes']):
if self.graph.has_node(c_node_id) and c_node_id != found_major_node_id:
# STEP 1: Substring check against all existing nodes
if self._correlation_value_matches_existing_node(value):
# Skip creating correlation node - would be redundant
continue
# STEP 2: Filter out node pairs that already have direct edges
eligible_nodes = self._filter_nodes_without_direct_edges(set(corr['nodes']))
if len(eligible_nodes) < 2:
# Need at least 2 nodes to create a correlation
continue
# STEP 3: Check for existing correlation node with same connection pattern
correlation_nodes_with_pattern = self._find_correlation_nodes_with_same_pattern(eligible_nodes)
if correlation_nodes_with_pattern:
# STEP 4: Merge with existing correlation node
target_correlation_node = correlation_nodes_with_pattern[0]
self._merge_correlation_values(target_correlation_node, value, corr)
else:
# STEP 5: Create new correlation node for eligible nodes only
correlation_node_id = f"corr_{abs(hash(str(sorted(eligible_nodes))))}"
self.add_node(correlation_node_id, NodeType.CORRELATION_OBJECT,
metadata={'values': [value], 'sources': corr['sources'],
'correlated_nodes': list(eligible_nodes)})
# Create edges from eligible nodes to this correlation node
for c_node_id in eligible_nodes:
if self.graph.has_node(c_node_id):
attribute = corr['sources'][0]['path'].split('.')[-1]
relationship_type = f"c_{attribute}"
self.add_edge(c_node_id, found_major_node_id, relationship_type, confidence_score=0.9)
continue # Skip creating a redundant correlation node
# Proceed to create a new correlation node if no major node was found.
correlation_node_id = f"{value}"
if not self.graph.has_node(correlation_node_id):
self.add_node(correlation_node_id, NodeType.CORRELATION_OBJECT,
metadata={'value': value, 'sources': corr['sources'],
'correlated_nodes': list(set(corr['nodes']))})
else: # Update existing correlation node
existing_meta = self.graph.nodes[correlation_node_id]['metadata']
existing_nodes = set(existing_meta.get('correlated_nodes', []))
existing_meta['correlated_nodes'] = list(existing_nodes.union(set(corr['nodes'])))
existing_sources = {(s['node_id'], s['path']) for s in existing_meta.get('sources', [])}
for s in corr['sources']:
existing_sources.add((s['node_id'], s['path']))
existing_meta['sources'] = [{'node_id': nid, 'path': p} for nid, p in existing_sources]
for c_node_id in set(corr['nodes']):
attribute = corr['sources'][0]['path'].split('.')[-1]
relationship_type = f"c_{attribute}"
self.add_edge(c_node_id, correlation_node_id, relationship_type, confidence_score=0.9)
self.add_edge(c_node_id, correlation_node_id, relationship_type, confidence_score=0.9)
self._update_correlation_index(node_id, attributes)
self.last_modified = datetime.now(timezone.utc).isoformat()
return is_new_node
def _filter_nodes_without_direct_edges(self, node_set: set) -> set:
"""
Filter out nodes that already have direct edges between them.
Returns set of nodes that should be included in correlation.
"""
nodes_list = list(node_set)
eligible_nodes = set(node_set) # Start with all nodes
# Check all pairs of nodes
for i in range(len(nodes_list)):
for j in range(i + 1, len(nodes_list)):
node_a = nodes_list[i]
node_b = nodes_list[j]
# Check if direct edge exists in either direction
if self._has_direct_edge_bidirectional(node_a, node_b):
# Remove both nodes from eligible set since they're already connected
eligible_nodes.discard(node_a)
eligible_nodes.discard(node_b)
return eligible_nodes
def _has_direct_edge_bidirectional(self, node_a: str, node_b: str) -> bool:
"""
Check if there's a direct edge between two nodes in either direction.
Returns True if node_a→node_b OR node_b→node_a exists.
"""
return (self.graph.has_edge(node_a, node_b) or
self.graph.has_edge(node_b, node_a))
def _correlation_value_matches_existing_node(self, correlation_value: str) -> bool:
"""
Check if correlation value contains any existing node ID as substring.
Returns True if match found (correlation node should NOT be created).
"""
correlation_str = str(correlation_value).lower()
# Check against all existing nodes
for existing_node_id in self.graph.nodes():
if existing_node_id.lower() in correlation_str:
return True
return False
def _find_correlation_nodes_with_same_pattern(self, node_set: set) -> List[str]:
"""
Find existing correlation nodes that have the exact same pattern of connected nodes.
Returns list of correlation node IDs with matching patterns.
"""
correlation_nodes = self.get_nodes_by_type(NodeType.CORRELATION_OBJECT)
matching_nodes = []
for corr_node_id in correlation_nodes:
# Get all nodes connected to this correlation node
connected_nodes = set()
# Add all predecessors (nodes pointing TO the correlation node)
connected_nodes.update(self.graph.predecessors(corr_node_id))
# Add all successors (nodes pointed TO by the correlation node)
connected_nodes.update(self.graph.successors(corr_node_id))
# Check if the pattern matches exactly
if connected_nodes == node_set:
matching_nodes.append(corr_node_id)
return matching_nodes
def _merge_correlation_values(self, target_node_id: str, new_value: Any, corr_data: Dict) -> None:
"""
Merge a new correlation value into an existing correlation node.
Uses same logic as large entity merging.
"""
if not self.graph.has_node(target_node_id):
return
target_metadata = self.graph.nodes[target_node_id]['metadata']
# Get existing values (ensure it's a list)
existing_values = target_metadata.get('values', [])
if not isinstance(existing_values, list):
existing_values = [existing_values]
# Add new value if not already present
if new_value not in existing_values:
existing_values.append(new_value)
# Merge sources
existing_sources = target_metadata.get('sources', [])
new_sources = corr_data.get('sources', [])
# Create set of unique sources based on (node_id, path) tuples
source_set = set()
for source in existing_sources + new_sources:
source_tuple = (source['node_id'], source['path'])
source_set.add(source_tuple)
# Convert back to list of dictionaries
merged_sources = [{'node_id': nid, 'path': path} for nid, path in source_set]
# Update metadata
target_metadata.update({
'values': existing_values,
'sources': merged_sources,
'correlated_nodes': list(set(target_metadata.get('correlated_nodes', []) + corr_data.get('nodes', []))),
'merge_count': len(existing_values),
'last_merge_timestamp': datetime.now(timezone.utc).isoformat()
})
# Update description to reflect merged nature
value_count = len(existing_values)
node_count = len(target_metadata['correlated_nodes'])
self.graph.nodes[target_node_id]['description'] = (
f"Correlation container with {value_count} merged values "
f"across {node_count} nodes"
)
def add_edge(self, source_id: str, target_id: str, relationship_type: str,
confidence_score: float = 0.5, source_provider: str = "unknown",
raw_data: Optional[Dict[str, Any]] = None) -> bool: