extract node from large entity feature

This commit is contained in:
overcuriousity
2025-09-16 00:01:24 +02:00
parent fc098aed28
commit 2c48316477
7 changed files with 182 additions and 15 deletions


@@ -413,6 +413,30 @@ class GraphManager:
                              raw_data=raw_data or {})
        self.last_modified = datetime.now(timezone.utc).isoformat()
        return True

    def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool:
        """
        Removes a node from a large entity's internal lists and updates its count.
        This prepares the large entity for the node's promotion to a regular node.
        """
        if not self.graph.has_node(large_entity_id):
            return False

        node_data = self.graph.nodes[large_entity_id]
        attributes = node_data.get('attributes', {})

        # Remove from the list of member nodes
        if 'nodes' in attributes and node_id_to_extract in attributes['nodes']:
            attributes['nodes'].remove(node_id_to_extract)
            # Update the count
            attributes['count'] = len(attributes['nodes'])
        else:
            # This can happen if the node was already extracted, which is not an error.
            print(f"Warning: Node {node_id_to_extract} not found in the 'nodes' list of {large_entity_id}.")
            return True  # Proceed as if successful

        self.last_modified = datetime.now(timezone.utc).isoformat()
        return True

    def remove_node(self, node_id: str) -> bool:
        """Remove a node and its connected edges from the graph."""

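
A minimal sketch of what this mutation does to the container node, assuming a large entity keeps its members under attributes['nodes'] with a matching 'count' (the structure implied above); the graph and IDs here are hypothetical:

import networkx as nx

# Hypothetical container state before extraction.
graph = nx.DiGraph()
graph.add_node('large_entity_1', attributes={'nodes': ['a.example.com', 'b.example.com'], 'count': 2})

# What extract_node_from_large_entity effectively does to the stored attributes:
attrs = graph.nodes['large_entity_1']['attributes']
attrs['nodes'].remove('a.example.com')
attrs['count'] = len(attrs['nodes'])

assert attrs == {'nodes': ['b.example.com'], 'count': 1}
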

@@ -162,12 +162,15 @@ class Scanner:
        self.stop_event = threading.Event()
        self.scan_thread = None
        self.executor = None
        self.processing_lock = threading.Lock()
        self.task_queue = PriorityQueue()
        self.rate_limiter = GlobalRateLimiter(redis.StrictRedis(db=0))
        self.logger = get_forensic_logger()

        # This ensures the scanner has access to providers for actions like node extraction.
        print("Re-initializing providers after loading session...")
        self._initialize_providers()

        # **NEW**: Reset processing tracking
        if not hasattr(self, 'currently_processing'):
            self.currently_processing = set()
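
This block rebuilds the members that cannot survive serialization (threads, locks, queues, the Redis connection) after a saved session is loaded. A minimal sketch of the pattern, assuming restoration happens in __setstate__; the class body is abbreviated:

import threading
from queue import PriorityQueue

class Scanner:
    def __setstate__(self, state):
        # Restore the picklable attributes saved at serialization time...
        self.__dict__.update(state)
        # ...then recreate everything that cannot be pickled.
        self.stop_event = threading.Event()
        self.scan_thread = None
        self.processing_lock = threading.Lock()
        self.task_queue = PriorityQueue()
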
@@ -792,6 +795,7 @@ class Scanner:
        elif _is_valid_ip(targets[0]):
            node_type = 'ip'

        # We still create the nodes so they exist in the graph, they are just not processed for edges yet.
        for target in targets:
            self.graph.add_node(target, NodeType.DOMAIN if node_type == 'domain' else NodeType.IP)
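
The helper _is_valid_ip is not part of this diff; a plausible stand-in built on the standard library, for readers following along:

import ipaddress

def _is_valid_ip(value: str) -> bool:
    """Return True if value parses as an IPv4 or IPv6 address."""
    try:
        ipaddress.ip_address(value)
        return True
    except ValueError:
        return False
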
@@ -802,6 +806,7 @@ class Scanner:
            'source_provider': provider_name,
            'discovery_depth': current_depth,
            'threshold_exceeded': self.config.large_entity_threshold,
            # <<< FIX: Removed 'raw_results'. It's inefficient and unnecessary.
        }

        description = f'Large entity created due to {len(targets)} results from {provider_name}'
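
Taken together with the GraphManager method above, a large entity's stored attributes end up looking roughly like this (all values hypothetical):

attributes = {
    'nodes': ['a.example.com', 'b.example.com'],  # members awaiting promotion
    'count': 2,                                   # kept in sync by extract_node_from_large_entity
    'source_provider': 'crtsh',                   # hypothetical provider name
    'discovery_depth': 1,
    'threshold_exceeded': 100,                    # config.large_entity_threshold
}
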
@@ -816,6 +821,53 @@ class Scanner:
print(f"Created large entity {entity_id} for {len(targets)} {node_type}s from {provider_name}")
return set(targets)
def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool:
"""
Extracts a node from a large entity by re-adding it to the main processing queue.
This is a much cleaner approach than storing and replaying raw results.
"""
if not self.graph.graph.has_node(large_entity_id):
print(f"ERROR: Large entity {large_entity_id} not found.")
return False
# 1. Modify the graph data structure first
# This removes the node from the container's internal list.
success = self.graph.extract_node_from_large_entity(large_entity_id, node_id_to_extract)
if not success:
print(f"ERROR: Node {node_id_to_extract} could not be removed from {large_entity_id}'s attributes.")
return False
# 2. Re-queue the extracted node for full processing by all eligible providers.
# This is the same logic used for any newly discovered node.
print(f"Re-queueing extracted node {node_id_to_extract} for full reconnaissance...")
is_ip = _is_valid_ip(node_id_to_extract)
current_depth = self.graph.graph.nodes[large_entity_id].get('attributes', {}).get('discovery_depth', 0)
eligible_providers = self._get_eligible_providers(node_id_to_extract, is_ip, False)
for provider in eligible_providers:
provider_name = provider.get_name()
# Add the task to the main queue with the correct depth.
self.task_queue.put((self._get_priority(provider_name), (provider_name, node_id_to_extract, current_depth)))
self.total_tasks_ever_enqueued += 1
# 3. If the scanner is not running, we need to kickstart it to process this one item.
if self.status != ScanStatus.RUNNING:
print("Scanner is idle. Starting a mini-scan to process the extracted node.")
self.status = ScanStatus.RUNNING
self._update_session_state()
# Start a new thread for the scan execution if one isn't running
if not self.scan_thread or not self.scan_thread.is_alive():
self.scan_thread = threading.Thread(
target=self._execute_scan,
args=(self.current_target, self.max_depth), # Use existing target/depth
daemon=True
)
self.scan_thread.start()
print(f"Successfully extracted and re-queued {node_id_to_extract} from {large_entity_id}.")
return True
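
End to end, a caller (for example a UI action handler) would drive the feature roughly like this; the entity and node IDs are hypothetical and scanner is the active session's instance:

# Promote one member out of a collapsed large entity and let the normal
# pipeline rediscover its edges.
if scanner.extract_node_from_large_entity('large_entity_1', 'a.example.com'):
    # The node is now queued for every eligible provider; if the scanner was
    # idle, a background scan thread has been started to drain the queue.
    print(f"{scanner.task_queue.qsize()} tasks pending")
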
    def _collect_node_attributes(self, node_id: str, provider_name: str, rel_type: str,
                                 target: str, raw_data: Dict[str, Any], attributes: Dict[str, Any]) -> None: