new data model refinement
This commit is contained in:
@@ -40,6 +40,7 @@ class GraphManager:
|
||||
self.correlation_index = {}
|
||||
# Compile regex for date filtering for efficiency
|
||||
self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
|
||||
self.EXCLUDED_KEYS = ['confidence', 'provider', 'timestamp', 'type']
|
||||
|
||||
def __getstate__(self):
|
||||
"""Prepare GraphManager for pickling, excluding compiled regex."""
|
||||
@@ -54,145 +55,44 @@ class GraphManager:
|
||||
self.__dict__.update(state)
|
||||
self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
|
||||
|
||||
def _update_correlation_index(self, node_id: str, data: Any, path: List[str] = [], parent_attr: str = ""):
|
||||
"""Recursively traverse metadata and add hashable values to the index with better path tracking."""
|
||||
if path is None:
|
||||
path = []
|
||||
|
||||
if isinstance(data, dict):
|
||||
for key, value in data.items():
|
||||
self._update_correlation_index(node_id, value, path + [key], key)
|
||||
elif isinstance(data, list):
|
||||
for i, item in enumerate(data):
|
||||
# Instead of just using [i], include the parent attribute context
|
||||
list_path_component = f"[{i}]" if not parent_attr else f"{parent_attr}[{i}]"
|
||||
self._update_correlation_index(node_id, item, path + [list_path_component], parent_attr)
|
||||
else:
|
||||
self._add_to_correlation_index(node_id, data, ".".join(path), parent_attr)
|
||||
|
||||
def _add_to_correlation_index(self, node_id: str, value: Any, path_str: str, parent_attr: str = ""):
|
||||
"""Add a hashable value to the correlation index, filtering out noise."""
|
||||
if not isinstance(value, (str, int, float, bool)) or value is None:
|
||||
def process_correlations_for_node(self, node_id: str):
|
||||
"""Process correlations for a given node based on its attributes."""
|
||||
if not self.graph.has_node(node_id):
|
||||
return
|
||||
|
||||
# Ignore certain paths that contain noisy, non-unique identifiers
|
||||
if any(keyword in path_str.lower() for keyword in ['count', 'total', 'timestamp', 'date']):
|
||||
return
|
||||
node_attributes = self.graph.nodes[node_id].get('attributes', [])
|
||||
for attr in node_attributes:
|
||||
attr_name = attr.get('name')
|
||||
attr_value = attr.get('value')
|
||||
|
||||
# Filter out common low-entropy values and date-like strings
|
||||
if isinstance(value, str):
|
||||
# FIXED: Prevent correlation on date/time strings.
|
||||
if self.date_pattern.match(value):
|
||||
return
|
||||
if len(value) < 4 or value.lower() in ['true', 'false', 'unknown', 'none', 'crt.sh']:
|
||||
return
|
||||
elif isinstance(value, int) and (abs(value) < 1024 or abs(value) > 65535):
|
||||
return # Ignore small integers and common port numbers
|
||||
elif isinstance(value, bool):
|
||||
return # Ignore boolean values
|
||||
if attr_name in self.EXCLUDED_KEYS or not isinstance(attr_value, (str, int, float, bool)) or attr_value is None:
|
||||
continue
|
||||
|
||||
# Add the valuable correlation data to the index
|
||||
if value not in self.correlation_index:
|
||||
self.correlation_index[value] = {}
|
||||
if node_id not in self.correlation_index[value]:
|
||||
self.correlation_index[value][node_id] = []
|
||||
|
||||
# Store both the full path and the parent attribute for better edge labeling
|
||||
correlation_entry = {
|
||||
'path': path_str,
|
||||
'parent_attr': parent_attr,
|
||||
'meaningful_attr': self._extract_meaningful_attribute(path_str, parent_attr)
|
||||
}
|
||||
|
||||
if correlation_entry not in self.correlation_index[value][node_id]:
|
||||
self.correlation_index[value][node_id].append(correlation_entry)
|
||||
|
||||
def _extract_meaningful_attribute(self, path_str: str, parent_attr: str = "") -> str:
|
||||
"""Extract the most meaningful attribute name from a path string."""
|
||||
if not path_str:
|
||||
return "unknown"
|
||||
|
||||
path_parts = path_str.split('.')
|
||||
|
||||
# Look for the last non-array-index part
|
||||
for part in reversed(path_parts):
|
||||
# Skip array indices like [0], [1], etc.
|
||||
if not (part.startswith('[') and part.endswith(']') and part[1:-1].isdigit()):
|
||||
# Clean up compound names like "hostnames[0]" to just "hostnames"
|
||||
clean_part = re.sub(r'\[\d+\]$', '', part)
|
||||
if clean_part:
|
||||
return clean_part
|
||||
|
||||
# Fallback to parent attribute if available
|
||||
if parent_attr:
|
||||
return parent_attr
|
||||
|
||||
# Last resort - use the first meaningful part
|
||||
for part in path_parts:
|
||||
if not (part.startswith('[') and part.endswith(']') and part[1:-1].isdigit()):
|
||||
clean_part = re.sub(r'\[\d+\]$', '', part)
|
||||
if clean_part:
|
||||
return clean_part
|
||||
|
||||
return "correlation"
|
||||
|
||||
def _check_for_correlations(self, new_node_id: str, data: Any, path: List[str] = [], parent_attr: str = "") -> List[Dict]:
|
||||
"""Recursively traverse metadata to find correlations with existing data."""
|
||||
if path is None:
|
||||
path = []
|
||||
|
||||
all_correlations = []
|
||||
if isinstance(data, dict):
|
||||
for key, value in data.items():
|
||||
if key == 'source': # Avoid correlating on the provider name
|
||||
continue
|
||||
all_correlations.extend(self._check_for_correlations(new_node_id, value, path + [key], key))
|
||||
elif isinstance(data, list):
|
||||
for i, item in enumerate(data):
|
||||
list_path_component = f"[{i}]" if not parent_attr else f"{parent_attr}[{i}]"
|
||||
all_correlations.extend(self._check_for_correlations(new_node_id, item, path + [list_path_component], parent_attr))
|
||||
else:
|
||||
value = data
|
||||
if value in self.correlation_index:
|
||||
existing_nodes_with_paths = self.correlation_index[value]
|
||||
unique_nodes = set(existing_nodes_with_paths.keys())
|
||||
unique_nodes.add(new_node_id)
|
||||
|
||||
if len(unique_nodes) < 2:
|
||||
return all_correlations # Correlation must involve at least two distinct nodes
|
||||
|
||||
new_source = {
|
||||
'node_id': new_node_id,
|
||||
'path': ".".join(path),
|
||||
'parent_attr': parent_attr,
|
||||
'meaningful_attr': self._extract_meaningful_attribute(".".join(path), parent_attr)
|
||||
}
|
||||
all_sources = [new_source]
|
||||
if isinstance(attr_value, bool):
|
||||
continue
|
||||
|
||||
for node_id, path_entries in existing_nodes_with_paths.items():
|
||||
for entry in path_entries:
|
||||
if isinstance(entry, dict):
|
||||
all_sources.append({
|
||||
'node_id': node_id,
|
||||
'path': entry['path'],
|
||||
'parent_attr': entry.get('parent_attr', ''),
|
||||
'meaningful_attr': entry.get('meaningful_attr', self._extract_meaningful_attribute(entry['path'], entry.get('parent_attr', '')))
|
||||
})
|
||||
else:
|
||||
# Handle legacy string-only entries
|
||||
all_sources.append({
|
||||
'node_id': node_id,
|
||||
'path': str(entry),
|
||||
'parent_attr': '',
|
||||
'meaningful_attr': self._extract_meaningful_attribute(str(entry))
|
||||
})
|
||||
if isinstance(attr_value, str) and (len(attr_value) < 4 or self.date_pattern.match(attr_value)):
|
||||
continue
|
||||
|
||||
if attr_value not in self.correlation_index:
|
||||
self.correlation_index[attr_value] = set()
|
||||
|
||||
self.correlation_index[attr_value].add(node_id)
|
||||
|
||||
if len(self.correlation_index[attr_value]) > 1:
|
||||
self._create_correlation_node_and_edges(attr_value, self.correlation_index[attr_value])
|
||||
|
||||
def _create_correlation_node_and_edges(self, value, nodes):
    """Ensure a correlation node for ``value`` exists and link ``nodes`` to it.

    The correlation node id is ``corr_<value>``. It is added (with the value
    and the list of correlated nodes as metadata) only when the graph does not
    already contain it. Afterwards every entry of ``nodes`` that is present in
    the graph and not yet connected to the correlation node gets a
    ``"correlation"`` edge pointing at it.
    """
    corr_id = f"corr_{value}"

    if not self.graph.has_node(corr_id):
        node_metadata = {'value': value, 'correlated_nodes': list(nodes)}
        self.add_node(corr_id, NodeType.CORRELATION_OBJECT, metadata=node_metadata)

    for member_id in nodes:
        # Only link members that are actually in the graph.
        if not self.graph.has_node(member_id):
            continue
        # Avoid adding the same edge twice.
        if self.graph.has_edge(member_id, corr_id):
            continue
        self.add_edge(member_id, corr_id, "correlation", confidence_score=0.9)
|
||||
|
||||
all_correlations.append({
|
||||
'value': value,
|
||||
'sources': all_sources,
|
||||
'nodes': list(unique_nodes)
|
||||
})
|
||||
return all_correlations
|
||||
|
||||
def add_node(self, node_id: str, node_type: NodeType, attributes: Optional[List[Dict[str, Any]]] = None,
|
||||
description: str = "", metadata: Optional[Dict[str, Any]] = None) -> bool:
|
||||
@@ -232,78 +132,9 @@ class GraphManager:
|
||||
existing_metadata.update(metadata)
|
||||
self.graph.nodes[node_id]['metadata'] = existing_metadata
|
||||
|
||||
if attributes and node_type != NodeType.CORRELATION_OBJECT:
|
||||
correlations = self._check_for_correlations(node_id, attributes)
|
||||
for corr in correlations:
|
||||
value = corr['value']
|
||||
|
||||
# STEP 1: Substring check against all existing nodes
|
||||
if self._correlation_value_matches_existing_node(value):
|
||||
# Skip creating correlation node - would be redundant
|
||||
continue
|
||||
|
||||
eligible_nodes = set(corr['nodes'])
|
||||
|
||||
if len(eligible_nodes) < 2:
|
||||
# Need at least 2 nodes to create a correlation
|
||||
continue
|
||||
|
||||
# STEP 3: Check for existing correlation node with same connection pattern
|
||||
correlation_nodes_with_pattern = self._find_correlation_nodes_with_same_pattern(eligible_nodes)
|
||||
|
||||
if correlation_nodes_with_pattern:
|
||||
# STEP 4: Merge with existing correlation node
|
||||
target_correlation_node = correlation_nodes_with_pattern[0]
|
||||
self._merge_correlation_values(target_correlation_node, value, corr)
|
||||
else:
|
||||
# STEP 5: Create new correlation node for eligible nodes only
|
||||
correlation_node_id = f"corr_{abs(hash(str(sorted(eligible_nodes))))}"
|
||||
self.add_node(correlation_node_id, NodeType.CORRELATION_OBJECT,
|
||||
metadata={'values': [value], 'sources': corr['sources'],
|
||||
'correlated_nodes': list(eligible_nodes)})
|
||||
|
||||
# Create edges from eligible nodes to this correlation node with better labeling
|
||||
for c_node_id in eligible_nodes:
|
||||
if self.graph.has_node(c_node_id):
|
||||
# Find the best attribute name for this node
|
||||
meaningful_attr = self._find_best_attribute_name_for_node(c_node_id, corr['sources'])
|
||||
relationship_type = f"c_{meaningful_attr}"
|
||||
self.add_edge(c_node_id, correlation_node_id, relationship_type, confidence_score=0.9)
|
||||
|
||||
self._update_correlation_index(node_id, attributes)
|
||||
|
||||
self.last_modified = datetime.now(timezone.utc).isoformat()
|
||||
return is_new_node
|
||||
|
||||
def _find_best_attribute_name_for_node(self, node_id: str, sources: List[Dict]) -> str:
|
||||
"""Find the best attribute name for a correlation edge by looking at the sources."""
|
||||
node_sources = [s for s in sources if s['node_id'] == node_id]
|
||||
|
||||
if not node_sources:
|
||||
return "correlation"
|
||||
|
||||
# Use the meaningful_attr if available
|
||||
for source in node_sources:
|
||||
meaningful_attr = source.get('meaningful_attr')
|
||||
if meaningful_attr and meaningful_attr != "unknown":
|
||||
return meaningful_attr
|
||||
|
||||
# Fallback to parent_attr
|
||||
for source in node_sources:
|
||||
parent_attr = source.get('parent_attr')
|
||||
if parent_attr:
|
||||
return parent_attr
|
||||
|
||||
# Last resort - extract from path
|
||||
for source in node_sources:
|
||||
path = source.get('path', '')
|
||||
if path:
|
||||
extracted = self._extract_meaningful_attribute(path)
|
||||
if extracted != "unknown":
|
||||
return extracted
|
||||
|
||||
return "correlation"
|
||||
|
||||
def _has_direct_edge_bidirectional(self, node_a: str, node_b: str) -> bool:
|
||||
"""
|
||||
Check if there's a direct edge between two nodes in either direction.
|
||||
|
||||
@@ -506,6 +506,7 @@ class Scanner:
|
||||
large_entity_members.update(discovered)
|
||||
else:
|
||||
new_targets.update(discovered)
|
||||
self.graph.process_correlations_for_node(target)
|
||||
else:
|
||||
print(f"Stop requested after processing results from {provider.get_name()}")
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user