UX improvements

overcuriousity
2025-09-17 21:12:11 +02:00
parent d0ee415f0d
commit 8ae4fdbf80
11 changed files with 905 additions and 360 deletions

View File

@@ -4,7 +4,7 @@
Graph data model for DNSRecon using NetworkX.
Manages in-memory graph storage with confidence scoring and forensic metadata.
Now fully compatible with the unified ProviderResult data model.
UPDATED: Fixed certificate styling and correlation edge labeling.
UPDATED: Fixed correlation exclusion keys to match actual attribute names.
"""
import re
from datetime import datetime, timezone
@@ -41,7 +41,30 @@ class GraphManager:
self.correlation_index = {}
# Compile regex for date filtering for efficiency
self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
self.EXCLUDED_KEYS = ['crtsh_cert_validity_period_days','crtsh_cert_source','crtsh_cert_common_name']
# These are the actual attribute names created in providers, WITHOUT provider prefix
self.EXCLUDED_KEYS = [
# Certificate metadata that creates noise
'cert_source', # Always 'crtsh' for crtsh provider
'cert_common_name',
'cert_validity_period_days', # Numerical, not useful for correlation
#'cert_certificate_id', # Unique per certificate
#'cert_serial_number', # Unique per certificate
'cert_entry_timestamp', # Timestamp, filtered by date regex anyway
'cert_not_before', # Date, filtered by date regex anyway
'cert_not_after', # Date, filtered by date regex anyway
# DNS metadata that creates noise
'dns_ttl', # TTL values are not meaningful for correlation
# Shodan metadata that might create noise
'timestamp', # Generic timestamp fields
'last_update', # Generic timestamp fields
#'org', # Too generic, causes false correlations
#'isp', # Too generic, causes false correlations
# Generic noisy attributes
'updated_timestamp', # Any timestamp field
'discovery_timestamp', # Any timestamp field
'query_timestamp', # Any timestamp field
]
def __getstate__(self):
"""Prepare GraphManager for pickling, excluding compiled regex."""
@@ -72,14 +95,31 @@ class GraphManager:
attr_value = attr.get('value')
attr_provider = attr.get('provider', 'unknown')
# Skip excluded attributes and invalid values
if any(excluded_key in attr_name for excluded_key in self.EXCLUDED_KEYS) or not isinstance(attr_value, (str, int, float, bool)) or attr_value is None:
continue
# IMPROVED: More comprehensive exclusion logic
should_exclude = (
# Check against excluded keys (exact match or substring)
any(excluded_key in attr_name or attr_name == excluded_key for excluded_key in self.EXCLUDED_KEYS) or
# Invalid value types
not isinstance(attr_value, (str, int, float, bool)) or
attr_value is None or
# Boolean values are not useful for correlation
isinstance(attr_value, bool) or
# String values that are too short or are dates
(isinstance(attr_value, str) and (
len(attr_value) < 4 or
self.date_pattern.match(attr_value) or
# Exclude common generic values that create noise
attr_value.lower() in ['unknown', 'none', 'null', 'n/a', 'true', 'false', '0', '1']
)) or
# Numerical values that are likely to be unique identifiers
(isinstance(attr_value, (int, float)) and (
attr_value == 0 or # Zero values are not meaningful
attr_value == 1 or # One values are too common
abs(attr_value) > 1000000 # Very large numbers are likely IDs
))
)
if isinstance(attr_value, bool):
continue
if isinstance(attr_value, str) and (len(attr_value) < 4 or self.date_pattern.match(attr_value)):
if should_exclude:
continue
# Initialize correlation tracking for this value
@@ -149,7 +189,7 @@ class GraphManager:
if self.graph.has_node(node_id) and not self.graph.has_edge(node_id, correlation_node_id):
# Format relationship label as "corr_provider_attribute"
relationship_label = f"{provider}_{attribute}"
relationship_label = f"corr_{provider}_{attribute}"
self.add_edge(
source_id=node_id,
@@ -170,7 +210,7 @@ class GraphManager:
def _has_direct_edge_bidirectional(self, node_a: str, node_b: str) -> bool:
"""
Check if there's a direct edge between two nodes in either direction.
Returns True if node_a→node_b OR node_b→node_a exists.
Returns True if node_a→node_b OR node_b→node_a exists.
"""
return (self.graph.has_edge(node_a, node_b) or
self.graph.has_edge(node_b, node_a))
@@ -410,12 +450,6 @@ class GraphManager:
"""Get all nodes of a specific type."""
return [n for n, d in self.graph.nodes(data=True) if d.get('type') == node_type.value]
def get_neighbors(self, node_id: str) -> List[str]:
"""Get all unique neighbors (predecessors and successors) for a node."""
if not self.graph.has_node(node_id):
return []
return list(set(self.graph.predecessors(node_id)) | set(self.graph.successors(node_id)))
def get_high_confidence_edges(self, min_confidence: float = 0.8) -> List[Tuple[str, str, Dict]]:
"""Get edges with confidence score above a given threshold."""
return [(u, v, d) for u, v, d in self.graph.edges(data=True)
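
For context on what the tightened filter now skips, here is a minimal standalone sketch of the same exclusion logic. It is a reproduction for illustration only, not the committed GraphManager method; the abbreviated EXCLUDED_KEYS list and the sample attribute names and values below are made up.

```python
# Minimal sketch of the new correlation-exclusion logic, for illustration only.
import re

EXCLUDED_KEYS = ['cert_source', 'cert_validity_period_days', 'dns_ttl', 'timestamp']
DATE_PATTERN = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
GENERIC_VALUES = {'unknown', 'none', 'null', 'n/a', 'true', 'false', '0', '1'}

def should_exclude(attr_name, attr_value):
    """Return True when an attribute must not participate in correlation."""
    if any(key in attr_name for key in EXCLUDED_KEYS):
        return True                      # excluded key (substring match, as in the diff)
    if not isinstance(attr_value, (str, int, float, bool)) or attr_value is None:
        return True                      # unusable value type
    if isinstance(attr_value, bool):
        return True                      # booleans are too coarse to correlate on
    if isinstance(attr_value, str):
        return (len(attr_value) < 4
                or bool(DATE_PATTERN.match(attr_value))
                or attr_value.lower() in GENERIC_VALUES)
    # int/float: zeros, ones, and very large ID-like numbers create noise
    return attr_value in (0, 1) or abs(attr_value) > 1000000

print(should_exclude('dns_ttl', 300))                            # True  (excluded key)
print(should_exclude('cert_not_before', '2025-09-17 21:12:11'))  # True  (date value)
print(should_exclude('shodan_org', 'none'))                      # True  (generic value)
print(should_exclude('cert_issuer_name', 'Example CA GmbH'))     # False (correlatable)
```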

View File

@@ -101,6 +101,7 @@ class ProviderResult:
"""Get the total number of attributes in this result."""
return len(self.attributes)
def is_large_entity(self, threshold: int) -> bool:
"""Check if this result qualifies as a large entity based on relationship count."""
return self.get_relationship_count() > threshold
##TODO
#def is_large_entity(self, threshold: int) -> bool:
# """Check if this result qualifies as a large entity based on relationship count."""
# return self.get_relationship_count() > threshold
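
A standalone illustration of the threshold check that is_large_entity() performs, using a stand-in class: this only mirrors the logic shown above, it is not the committed ProviderResult, and the threshold of 100 is an example rather than a project default.

```python
# Stand-in that mirrors the is_large_entity() check shown in this diff;
# the relationship payloads and the threshold are illustrative.
from dataclasses import dataclass, field
from typing import List, Tuple

@dataclass
class ResultStub:
    relationships: List[Tuple[str, str]] = field(default_factory=list)

    def get_relationship_count(self) -> int:
        return len(self.relationships)

    def is_large_entity(self, threshold: int) -> bool:
        """Check if this result qualifies as a large entity based on relationship count."""
        return self.get_relationship_count() > threshold

stub = ResultStub([('example.com', f'sub{i}.example.com') for i in range(150)])
print(stub.is_large_entity(threshold=100))  # True -> caller can collapse into one node
```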

View File

@@ -370,6 +370,7 @@ class Scanner:
task_tuple = (provider_name, target_item)
if task_tuple in processed_tasks:
self.tasks_skipped += 1
self.indicators_completed +=1
continue
if depth > max_depth:
@@ -405,7 +406,7 @@ class Scanner:
if self.target_retries[task_tuple] <= self.config.max_retries_per_target:
self.task_queue.put((priority, (provider_name, target_item, depth)))
self.tasks_re_enqueued += 1
self.total_tasks_ever_enqueued += 1
#self.total_tasks_ever_enqueued += 1
else:
self.scan_failed_due_to_retries = True
self._log_target_processing_error(str(task_tuple), "Max retries exceeded")
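
The two accounting tweaks above keep progress reporting honest: a skipped duplicate now counts toward completion, and a retried task is no longer added to the running total a second time. A toy illustration, assuming progress is reported as indicators_completed over total_tasks_ever_enqueued (the actual reporting code lives elsewhere in the scanner):

```python
# Toy progress accounting; task names are made up, only the counting pattern
# reflects this commit (skipped duplicates increment the completed counter).
tasks = ['dns:a.example.com', 'dns:b.example.com', 'dns:a.example.com']  # one duplicate
total_tasks_ever_enqueued = len(tasks)

processed, indicators_completed = set(), 0
for task in tasks:
    if task in processed:
        indicators_completed += 1   # the fix: a skipped duplicate still counts as done
        continue
    processed.add(task)
    indicators_completed += 1       # normal completion path

print(f'{indicators_completed}/{total_tasks_ever_enqueued}')  # 3/3 -> progress reaches 100%
# Without the skipped-task increment the scan would appear stuck at 2/3.
```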

View File

@@ -108,64 +108,6 @@ class SessionManager:
print(f"ERROR: Failed to create session {session_id}: {e}")
raise
def clone_session_preserving_config(self, source_session_id: str) -> str:
"""
FIXED: Create a new session that preserves the configuration (including API keys) from an existing session.
This is used when we need a fresh scanner but want to keep user configuration.
"""
with self.creation_lock:
print(f"=== CLONING SESSION {source_session_id} (PRESERVING CONFIG) ===")
try:
# Get the source session data
source_session_data = self._get_session_data(source_session_id)
if not source_session_data:
print(f"ERROR: Source session {source_session_id} not found for cloning")
return self.create_session() # Fallback to new session
# Create new session ID
new_session_id = str(uuid.uuid4())
# Get the preserved configuration
preserved_config = source_session_data.get('config')
if not preserved_config:
print(f"WARNING: No config found in source session, creating new")
from core.session_config import create_session_config
preserved_config = create_session_config()
print(f"Preserving config with API keys: {list(preserved_config.api_keys.keys())}")
# Create new scanner with preserved config
new_scanner = Scanner(session_config=preserved_config)
new_scanner.session_id = new_session_id
new_session_data = {
'scanner': new_scanner,
'config': preserved_config,
'created_at': time.time(),
'last_activity': time.time(),
'status': 'active',
'cloned_from': source_session_id
}
# Store in Redis
serialized_data = pickle.dumps(new_session_data)
session_key = self._get_session_key(new_session_id)
self.redis_client.setex(session_key, self.session_timeout, serialized_data)
# Initialize stop signal
stop_key = self._get_stop_signal_key(new_session_id)
self.redis_client.setex(stop_key, self.session_timeout, b'0')
print(f"Cloned session {new_session_id} with preserved configuration")
return new_session_id
except Exception as e:
print(f"ERROR: Failed to clone session {source_session_id}: {e}")
# Fallback to creating a new session
return self.create_session()
def set_stop_signal(self, session_id: str) -> bool:
"""
Set the stop signal for a session (cross-process safe).