new data model refinement

2025-09-16 21:23:02 +02:00
parent 97aa18f788
commit 733e1da640
6 changed files with 129 additions and 647 deletions
--- a/core/graph_manager.py
+++ b/core/graph_manager.py
@@ -40,6 +40,7 @@ class GraphManager:
        self.correlation_index = {}
        # Compile regex for date filtering for efficiency
        self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
+        self.EXCLUDED_KEYS = ['confidence', 'provider', 'timestamp', 'type']

    def __getstate__(self):
        """Prepare GraphManager for pickling, excluding compiled regex."""
@@ -54,145 +55,44 @@ class GraphManager:
        self.__dict__.update(state)
        self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')

-    def _update_correlation_index(self, node_id: str, data: Any, path: List[str] = [], parent_attr: str = ""):
-        """Recursively traverse metadata and add hashable values to the index with better path tracking."""
-        if path is None:
-            path = []
-
-        if isinstance(data, dict):
-            for key, value in data.items():
-                self._update_correlation_index(node_id, value, path + [key], key)
-        elif isinstance(data, list):
-            for i, item in enumerate(data):
-                # Instead of just using [i], include the parent attribute context
-                list_path_component = f"[{i}]" if not parent_attr else f"{parent_attr}[{i}]"
-                self._update_correlation_index(node_id, item, path + [list_path_component], parent_attr)
-        else:
-            self._add_to_correlation_index(node_id, data, ".".join(path), parent_attr)
-
-    def _add_to_correlation_index(self, node_id: str, value: Any, path_str: str, parent_attr: str = ""):
-        """Add a hashable value to the correlation index, filtering out noise."""
-        if not isinstance(value, (str, int, float, bool)) or value is None:
+    def process_correlations_for_node(self, node_id: str):
+        """Process correlations for a given node based on its attributes."""
+        if not self.graph.has_node(node_id):
            return

-        # Ignore certain paths that contain noisy, non-unique identifiers
-        if any(keyword in path_str.lower() for keyword in ['count', 'total', 'timestamp', 'date']):
-            return
+        node_attributes = self.graph.nodes[node_id].get('attributes', [])
+        for attr in node_attributes:
+            attr_name = attr.get('name')
+            attr_value = attr.get('value')

-        # Filter out common low-entropy values and date-like strings
-        if isinstance(value, str):
-            # FIXED: Prevent correlation on date/time strings.
-            if self.date_pattern.match(value):
-                return
-            if len(value) < 4 or value.lower() in ['true', 'false', 'unknown', 'none', 'crt.sh']:
-                return
-        elif isinstance(value, int) and (abs(value) < 1024 or abs(value) > 65535):
-            return  # Ignore small integers and common port numbers
-        elif isinstance(value, bool):
-            return  # Ignore boolean values
+            if attr_name in self.EXCLUDED_KEYS or not isinstance(attr_value, (str, int, float, bool)) or attr_value is None:
+                continue

-        # Add the valuable correlation data to the index
-        if value not in self.correlation_index:
-            self.correlation_index[value] = {}
-        if node_id not in self.correlation_index[value]:
-            self.correlation_index[value][node_id] = []
-        
-        # Store both the full path and the parent attribute for better edge labeling
-        correlation_entry = {
-            'path': path_str,
-            'parent_attr': parent_attr,
-            'meaningful_attr': self._extract_meaningful_attribute(path_str, parent_attr)
-        }
-        
-        if correlation_entry not in self.correlation_index[value][node_id]:
-            self.correlation_index[value][node_id].append(correlation_entry)
-
-    def _extract_meaningful_attribute(self, path_str: str, parent_attr: str = "") -> str:
-        """Extract the most meaningful attribute name from a path string."""
-        if not path_str:
-            return "unknown"
-        
-        path_parts = path_str.split('.')
-        
-        # Look for the last non-array-index part
-        for part in reversed(path_parts):
-            # Skip array indices like [0], [1], etc.
-            if not (part.startswith('[') and part.endswith(']') and part[1:-1].isdigit()):
-                # Clean up compound names like "hostnames[0]" to just "hostnames"
-                clean_part = re.sub(r'\[\d+\]$', '', part)
-                if clean_part:
-                    return clean_part
-        
-        # Fallback to parent attribute if available
-        if parent_attr:
-            return parent_attr
-        
-        # Last resort - use the first meaningful part
-        for part in path_parts:
-            if not (part.startswith('[') and part.endswith(']') and part[1:-1].isdigit()):
-                clean_part = re.sub(r'\[\d+\]$', '', part)
-                if clean_part:
-                    return clean_part
-        
-        return "correlation"
-
-    def _check_for_correlations(self, new_node_id: str, data: Any, path: List[str] = [], parent_attr: str = "") -> List[Dict]:
-        """Recursively traverse metadata to find correlations with existing data."""
-        if path is None:
-            path = []
-
-        all_correlations = []
-        if isinstance(data, dict):
-            for key, value in data.items():
-                if key == 'source':  # Avoid correlating on the provider name
-                    continue
-                all_correlations.extend(self._check_for_correlations(new_node_id, value, path + [key], key))
-        elif isinstance(data, list):
-            for i, item in enumerate(data):
-                list_path_component = f"[{i}]" if not parent_attr else f"{parent_attr}[{i}]"
-                all_correlations.extend(self._check_for_correlations(new_node_id, item, path + [list_path_component], parent_attr))
-        else:
-            value = data
-            if value in self.correlation_index:
-                existing_nodes_with_paths = self.correlation_index[value]
-                unique_nodes = set(existing_nodes_with_paths.keys())
-                unique_nodes.add(new_node_id)
-
-                if len(unique_nodes) < 2:
-                    return all_correlations # Correlation must involve at least two distinct nodes
-
-                new_source = {
-                    'node_id': new_node_id, 
-                    'path': ".".join(path),
-                    'parent_attr': parent_attr,
-                    'meaningful_attr': self._extract_meaningful_attribute(".".join(path), parent_attr)
-                }
-                all_sources = [new_source]
+            if isinstance(attr_value, bool):
+                continue
                
-                for node_id, path_entries in existing_nodes_with_paths.items():
-                    for entry in path_entries:
-                        if isinstance(entry, dict):
-                            all_sources.append({
-                                'node_id': node_id,
-                                'path': entry['path'],
-                                'parent_attr': entry.get('parent_attr', ''),
-                                'meaningful_attr': entry.get('meaningful_attr', self._extract_meaningful_attribute(entry['path'], entry.get('parent_attr', '')))
-                            })
-                        else:
-                            # Handle legacy string-only entries
-                            all_sources.append({
-                                'node_id': node_id,
-                                'path': str(entry),
-                                'parent_attr': '',
-                                'meaningful_attr': self._extract_meaningful_attribute(str(entry))
-                            })
+            if isinstance(attr_value, str) and (len(attr_value) < 4 or self.date_pattern.match(attr_value)):
+                continue
+
+            if attr_value not in self.correlation_index:
+                self.correlation_index[attr_value] = set()
+
+            self.correlation_index[attr_value].add(node_id)
+
+            if len(self.correlation_index[attr_value]) > 1:
+                self._create_correlation_node_and_edges(attr_value, self.correlation_index[attr_value])
+
+    def _create_correlation_node_and_edges(self, value, nodes):
+        """Create a correlation node and edges to the correlated nodes."""
+        correlation_node_id = f"corr_{value}"
+        if not self.graph.has_node(correlation_node_id):
+            self.add_node(correlation_node_id, NodeType.CORRELATION_OBJECT,
+                        metadata={'value': value, 'correlated_nodes': list(nodes)})
+
+        for node_id in nodes:
+            if self.graph.has_node(node_id) and not self.graph.has_edge(node_id, correlation_node_id):
+                self.add_edge(node_id, correlation_node_id, "correlation", confidence_score=0.9)

-                all_correlations.append({
-                    'value': value,
-                    'sources': all_sources,
-                    'nodes': list(unique_nodes)
-                })
-        return all_correlations

    def add_node(self, node_id: str, node_type: NodeType, attributes: Optional[List[Dict[str, Any]]] = None,
                description: str = "", metadata: Optional[Dict[str, Any]] = None) -> bool:
@@ -232,78 +132,9 @@ class GraphManager:
                existing_metadata.update(metadata)
                self.graph.nodes[node_id]['metadata'] = existing_metadata

-        if attributes and node_type != NodeType.CORRELATION_OBJECT:
-            correlations = self._check_for_correlations(node_id, attributes)
-            for corr in correlations:
-                value = corr['value']
-                
-                # STEP 1: Substring check against all existing nodes
-                if self._correlation_value_matches_existing_node(value):
-                    # Skip creating correlation node - would be redundant
-                    continue
-                
-                eligible_nodes = set(corr['nodes'])
-                
-                if len(eligible_nodes) < 2:
-                    # Need at least 2 nodes to create a correlation
-                    continue
-                    
-                # STEP 3: Check for existing correlation node with same connection pattern
-                correlation_nodes_with_pattern = self._find_correlation_nodes_with_same_pattern(eligible_nodes)
-                
-                if correlation_nodes_with_pattern:
-                    # STEP 4: Merge with existing correlation node
-                    target_correlation_node = correlation_nodes_with_pattern[0]
-                    self._merge_correlation_values(target_correlation_node, value, corr)
-                else:
-                    # STEP 5: Create new correlation node for eligible nodes only
-                    correlation_node_id = f"corr_{abs(hash(str(sorted(eligible_nodes))))}"
-                    self.add_node(correlation_node_id, NodeType.CORRELATION_OBJECT,
-                                metadata={'values': [value], 'sources': corr['sources'],
-                                            'correlated_nodes': list(eligible_nodes)})
-                    
-                    # Create edges from eligible nodes to this correlation node with better labeling
-                    for c_node_id in eligible_nodes:
-                        if self.graph.has_node(c_node_id):
-                            # Find the best attribute name for this node
-                            meaningful_attr = self._find_best_attribute_name_for_node(c_node_id, corr['sources'])
-                            relationship_type = f"c_{meaningful_attr}"
-                            self.add_edge(c_node_id, correlation_node_id, relationship_type, confidence_score=0.9)
-
-            self._update_correlation_index(node_id, attributes)
-
        self.last_modified = datetime.now(timezone.utc).isoformat()
        return is_new_node

-    def _find_best_attribute_name_for_node(self, node_id: str, sources: List[Dict]) -> str:
-        """Find the best attribute name for a correlation edge by looking at the sources."""
-        node_sources = [s for s in sources if s['node_id'] == node_id]
-        
-        if not node_sources:
-            return "correlation"
-        
-        # Use the meaningful_attr if available
-        for source in node_sources:
-            meaningful_attr = source.get('meaningful_attr')
-            if meaningful_attr and meaningful_attr != "unknown":
-                return meaningful_attr
-        
-        # Fallback to parent_attr
-        for source in node_sources:
-            parent_attr = source.get('parent_attr')
-            if parent_attr:
-                return parent_attr
-        
-        # Last resort - extract from path
-        for source in node_sources:
-            path = source.get('path', '')
-            if path:
-                extracted = self._extract_meaningful_attribute(path)
-                if extracted != "unknown":
-                    return extracted
-        
-        return "correlation"
-
    def _has_direct_edge_bidirectional(self, node_a: str, node_b: str) -> bool:
        """
        Check if there's a direct edge between two nodes in either direction.
--- a/core/scanner.py
+++ b/core/scanner.py
@@ -506,6 +506,7 @@ class Scanner:
                    large_entity_members.update(discovered)
                else:
                    new_targets.update(discovered)
+                self.graph.process_correlations_for_node(target)
            else:
                print(f"Stop requested after processing results from {provider.get_name()}")
        except Exception as e: