refinements for correlations running logic

2025-09-20 20:31:56 +02:00
parent 4a82c279ef
commit 602739246f
2 changed files with 206 additions and 69 deletions
--- a/providers/correlation_provider.py
+++ b/providers/correlation_provider.py
@@ -78,90 +78,166 @@ class CorrelationProvider(BaseProvider):

    def _find_correlations(self, node_id: str) -> ProviderResult:
        """
-        Find correlations for a given node.
+        Find correlations for a given node with enhanced filtering and error handling.
        """
        result = ProviderResult()
-        # FIXED: Ensure self.graph is not None before proceeding.
+        
+        # Enhanced safety checks
        if not self.graph or not self.graph.graph.has_node(node_id):
            return result

-        node_attributes = self.graph.graph.nodes[node_id].get('attributes', [])
+        try:
+            node_attributes = self.graph.graph.nodes[node_id].get('attributes', [])
+            
+            # Ensure attributes is a list (handle legacy data)
+            if not isinstance(node_attributes, list):
+                return result
+                
+            correlations_found = 0
+            
+            for attr in node_attributes:
+                if not isinstance(attr, dict):
+                    continue
+                    
+                attr_name = attr.get('name', '')
+                attr_value = attr.get('value')
+                attr_provider = attr.get('provider', 'unknown')

-        for attr in node_attributes:
-            attr_name = attr.get('name')
-            attr_value = attr.get('value')
-            attr_provider = attr.get('provider', 'unknown')
+                # Enhanced filtering logic
+                should_exclude = self._should_exclude_attribute(attr_name, attr_value)
+                
+                if should_exclude:
+                    continue

-            should_exclude = (
-                any(excluded_key in attr_name or attr_name == excluded_key for excluded_key in self.EXCLUDED_KEYS) or
-                not isinstance(attr_value, (str, int, float, bool)) or
-                attr_value is None or
-                isinstance(attr_value, bool) or
-                (isinstance(attr_value, str) and (
-                    len(attr_value) < 4 or
-                    self.date_pattern.match(attr_value) or
-                    attr_value.lower() in ['unknown', 'none', 'null', 'n/a', 'true', 'false', '0', '1']
-                )) or
-                (isinstance(attr_value, (int, float)) and (
-                    attr_value == 0 or
-                    attr_value == 1 or
-                    abs(attr_value) > 1000000
-                ))
-            )
+                # Build correlation index
+                if attr_value not in self.correlation_index:
+                    self.correlation_index[attr_value] = {
+                        'nodes': set(),
+                        'sources': []
+                    }

-            if should_exclude:
-                continue
+                self.correlation_index[attr_value]['nodes'].add(node_id)

-            if attr_value not in self.correlation_index:
-                self.correlation_index[attr_value] = {
-                    'nodes': set(),
-                    'sources': []
+                source_info = {
+                    'node_id': node_id,
+                    'provider': attr_provider,
+                    'attribute': attr_name,
+                    'path': f"{attr_provider}_{attr_name}"
                }

-            self.correlation_index[attr_value]['nodes'].add(node_id)
+                # Avoid duplicate sources
+                existing_sources = [s for s in self.correlation_index[attr_value]['sources']
+                                if s['node_id'] == node_id and s['path'] == source_info['path']]
+                if not existing_sources:
+                    self.correlation_index[attr_value]['sources'].append(source_info)

-            source_info = {
-                'node_id': node_id,
-                'provider': attr_provider,
-                'attribute': attr_name,
-                'path': f"{attr_provider}_{attr_name}"
-            }
-
-            existing_sources = [s for s in self.correlation_index[attr_value]['sources']
-                              if s['node_id'] == node_id and s['path'] == source_info['path']]
-            if not existing_sources:
-                self.correlation_index[attr_value]['sources'].append(source_info)
-
-            if len(self.correlation_index[attr_value]['nodes']) > 1:
-                self._create_correlation_relationships(attr_value, self.correlation_index[attr_value], result)
+                # Create correlation if we have multiple nodes with this value
+                if len(self.correlation_index[attr_value]['nodes']) > 1:
+                    self._create_correlation_relationships(attr_value, self.correlation_index[attr_value], result)
+                    correlations_found += 1
+                    
+            # Log correlation results
+            if correlations_found > 0:
+                self.logger.logger.info(f"Found {correlations_found} correlations for node {node_id}")
+                
+        except Exception as e:
+            self.logger.logger.error(f"Error finding correlations for {node_id}: {e}")
+            
        return result
-
+    
+    def _should_exclude_attribute(self, attr_name: str, attr_value: Any) -> bool:
+        """
+        Enhanced logic to determine if an attribute should be excluded from correlation.
+        """
+        # Check against excluded keys (exact match or substring)
+        if any(excluded_key in attr_name or attr_name == excluded_key for excluded_key in self.EXCLUDED_KEYS):
+            return True
+        
+        # Value type filtering
+        if not isinstance(attr_value, (str, int, float, bool)) or attr_value is None:
+            return True
+        
+        # Boolean values are not useful for correlation
+        if isinstance(attr_value, bool):
+            return True
+            
+        # String value filtering
+        if isinstance(attr_value, str):                
+            # Date/timestamp strings
+            if self.date_pattern.match(attr_value):
+                return True
+                
+            # Common non-useful values
+            if attr_value.lower() in ['unknown', 'none', 'null', 'n/a', 'true', 'false', '0', '1']:
+                return True
+                
+            # Very long strings that are likely unique (> 100 chars)
+            if len(attr_value) > 100:
+                return True
+        
+        # Numeric value filtering  
+        if isinstance(attr_value, (int, float)):
+            # Very common values
+            if attr_value in [0, 1]:
+                return True
+                
+            # Very large numbers (likely timestamps or unique IDs)
+            if abs(attr_value) > 1000000:
+                return True
+        
+        return False
+    
    def _create_correlation_relationships(self, value: Any, correlation_data: Dict[str, Any], result: ProviderResult):
        """
-        Create correlation relationships and add them to the provider result.
+        Create correlation relationships with enhanced deduplication and validation.
        """
        correlation_node_id = f"corr_{hash(str(value)) & 0x7FFFFFFF}"
        nodes = correlation_data['nodes']
        sources = correlation_data['sources']
+        
+        # Only create correlations if we have meaningful nodes (more than 1)
+        if len(nodes) < 2:
+            return
+            
+        # Limit correlation size to prevent overly large correlation objects
+        MAX_CORRELATION_SIZE = 50
+        if len(nodes) > MAX_CORRELATION_SIZE:
+            # Sample the nodes to keep correlation manageable
+            import random
+            sampled_nodes = random.sample(list(nodes), MAX_CORRELATION_SIZE)
+            nodes = set(sampled_nodes)
+            # Filter sources to match sampled nodes
+            sources = [s for s in sources if s['node_id'] in nodes]

        # Add the correlation node as an attribute to the result
        result.add_attribute(
            target_node=correlation_node_id,
            name="correlation_value",
            value=value,
-            attr_type=str(type(value)),
+            attr_type=str(type(value).__name__),
            provider=self.name,
            confidence=0.9,
            metadata={
                'correlated_nodes': list(nodes),
                'sources': sources,
+                'correlation_size': len(nodes),
+                'value_type': type(value).__name__
            }
        )

+        # Create relationships with source validation
+        created_relationships = set()
+        
        for source in sources:
            node_id = source['node_id']
            provider = source['provider']
            attribute = source['attribute']
+            
+            # Skip if we've already created this relationship
+            relationship_key = (node_id, correlation_node_id)
+            if relationship_key in created_relationships:
+                continue
+                
            relationship_label = f"corr_{provider}_{attribute}"

            # Add the relationship to the result
@@ -174,6 +250,9 @@ class CorrelationProvider(BaseProvider):
                raw_data={
                    'correlation_value': value,
                    'original_attribute': attribute,
-                    'correlation_type': 'attribute_matching'
+                    'correlation_type': 'attribute_matching',
+                    'correlation_size': len(nodes)
                }
-            )
+            )
+            
+            created_relationships.add(relationship_key)