From 2c483164779a9686ed667d255f53e9ff66fca64e Mon Sep 17 00:00:00 2001
From: overcuriousity
Date: Tue, 16 Sep 2025 00:01:24 +0200
Subject: [PATCH] extract node from large entity feature

---
 app.py                      | 27 ++++++++++++++
 core/graph_manager.py       | 24 +++++++++++++
 core/scanner.py             | 56 +++++++++++++++++++++++++++--
 providers/base_provider.py  |  2 --
 providers/crtsh_provider.py | 14 ++++++--
 static/js/graph.js          |  4 +--
 static/js/main.js           | 70 +++++++++++++++++++++++++++++++++----
 7 files changed, 182 insertions(+), 15 deletions(-)

diff --git a/app.py b/app.py
index aff5ee5..b2233ff 100644
--- a/app.py
+++ b/app.py
@@ -282,6 +282,33 @@ def get_graph_data():
         }
     }), 500
 
+@app.route('/api/graph/large-entity/extract', methods=['POST'])
+def extract_from_large_entity():
+    """Extract a node from a large entity, making it a standalone node."""
+    try:
+        data = request.get_json()
+        large_entity_id = data.get('large_entity_id')
+        node_id = data.get('node_id')
+
+        if not large_entity_id or not node_id:
+            return jsonify({'success': False, 'error': 'Missing required parameters'}), 400
+
+        user_session_id, scanner = get_user_scanner()
+        if not scanner:
+            return jsonify({'success': False, 'error': 'No active session found'}), 404
+
+        success = scanner.extract_node_from_large_entity(large_entity_id, node_id)
+
+        if success:
+            session_manager.update_session_scanner(user_session_id, scanner)
+            return jsonify({'success': True, 'message': f'Node {node_id} extracted successfully.'})
+        else:
+            return jsonify({'success': False, 'error': f'Failed to extract node {node_id}.'}), 500
+
+    except Exception as e:
+        print(f"ERROR: Exception in extract_from_large_entity endpoint: {e}")
+        traceback.print_exc()
+        return jsonify({'success': False, 'error': f'Internal server error: {str(e)}'}), 500
 
 @app.route('/api/graph/node/<node_id>', methods=['DELETE'])
 def delete_graph_node(node_id):
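
Note: a minimal way to exercise the new endpoint once the patch is applied. This is a sketch, not part of the change: the host, port, and both IDs are assumptions (a default Flask dev server and a hypothetical large-entity container), and the request must carry the session cookie of the scan that created the container.

    # Hedged example: call the new extract endpoint on a local dev server.
    import requests

    session = requests.Session()  # one Session object so the Flask session cookie persists
    resp = session.post(
        "http://127.0.0.1:5000/api/graph/large-entity/extract",
        json={
            "large_entity_id": "large_entity_crtsh_1",  # hypothetical container id
            "node_id": "sub.example.com",               # hypothetical member node
        },
    )
    print(resp.status_code, resp.json())  # expect {'success': True, ...} on success
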
+ print(f"Warning: Node {node_id_to_extract} not found in the 'nodes' list of {large_entity_id}.") + return True # Proceed as if successful + + self.last_modified = datetime.now(timezone.utc).isoformat() + return True def remove_node(self, node_id: str) -> bool: """Remove a node and its connected edges from the graph.""" diff --git a/core/scanner.py b/core/scanner.py index fd7e4e6..b59059e 100644 --- a/core/scanner.py +++ b/core/scanner.py @@ -162,12 +162,15 @@ class Scanner: self.stop_event = threading.Event() self.scan_thread = None self.executor = None - self.processing_lock = threading.Lock() # **NEW**: Recreate processing lock + self.processing_lock = threading.Lock() self.task_queue = PriorityQueue() self.rate_limiter = GlobalRateLimiter(redis.StrictRedis(db=0)) self.logger = get_forensic_logger() + + # This ensures the scanner has access to providers for actions like node extraction. + print("Re-initializing providers after loading session...") + self._initialize_providers() - # **NEW**: Reset processing tracking if not hasattr(self, 'currently_processing'): self.currently_processing = set() @@ -792,6 +795,7 @@ class Scanner: elif _is_valid_ip(targets[0]): node_type = 'ip' + # We still create the nodes so they exist in the graph, they are just not processed for edges yet. for target in targets: self.graph.add_node(target, NodeType.DOMAIN if node_type == 'domain' else NodeType.IP) @@ -802,6 +806,7 @@ class Scanner: 'source_provider': provider_name, 'discovery_depth': current_depth, 'threshold_exceeded': self.config.large_entity_threshold, + # <<< FIX: Removed 'raw_results'. It's inefficient and unnecessary. } description = f'Large entity created due to {len(targets)} results from {provider_name}' @@ -816,6 +821,53 @@ class Scanner: print(f"Created large entity {entity_id} for {len(targets)} {node_type}s from {provider_name}") return set(targets) + + def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool: + """ + Extracts a node from a large entity by re-adding it to the main processing queue. + This is a much cleaner approach than storing and replaying raw results. + """ + if not self.graph.graph.has_node(large_entity_id): + print(f"ERROR: Large entity {large_entity_id} not found.") + return False + + # 1. Modify the graph data structure first + # This removes the node from the container's internal list. + success = self.graph.extract_node_from_large_entity(large_entity_id, node_id_to_extract) + if not success: + print(f"ERROR: Node {node_id_to_extract} could not be removed from {large_entity_id}'s attributes.") + return False + + # 2. Re-queue the extracted node for full processing by all eligible providers. + # This is the same logic used for any newly discovered node. + print(f"Re-queueing extracted node {node_id_to_extract} for full reconnaissance...") + is_ip = _is_valid_ip(node_id_to_extract) + current_depth = self.graph.graph.nodes[large_entity_id].get('attributes', {}).get('discovery_depth', 0) + + eligible_providers = self._get_eligible_providers(node_id_to_extract, is_ip, False) + for provider in eligible_providers: + provider_name = provider.get_name() + # Add the task to the main queue with the correct depth. + self.task_queue.put((self._get_priority(provider_name), (provider_name, node_id_to_extract, current_depth))) + self.total_tasks_ever_enqueued += 1 + + # 3. If the scanner is not running, we need to kickstart it to process this one item. + if self.status != ScanStatus.RUNNING: + print("Scanner is idle. 
diff --git a/providers/base_provider.py b/providers/base_provider.py
index b0ce905..7941fb6 100644
--- a/providers/base_provider.py
+++ b/providers/base_provider.py
@@ -3,7 +3,6 @@
 import time
 import requests
 import threading
-import redis
 from abc import ABC, abstractmethod
 from typing import List, Dict, Any, Optional, Tuple
@@ -36,7 +35,6 @@ class BaseProvider(ABC):
             # Fallback to global config for backwards compatibility
             from config import config as global_config
             self.config = global_config
-            actual_rate_limit = rate_limit
             actual_timeout = timeout
 
         self.name = name
diff --git a/providers/crtsh_provider.py b/providers/crtsh_provider.py
index f70b09a..7b061a2 100644
--- a/providers/crtsh_provider.py
+++ b/providers/crtsh_provider.py
@@ -514,12 +514,20 @@ class CrtShProvider(BaseProvider):
         shared = []
 
         # Create a set of certificate IDs from the first list for quick lookup
-        cert1_ids = {cert.get('certificate_id') for cert in certs1 if cert.get('certificate_id')}
+        # <<< FIX: Added robust type checking to handle potentially malformed API data
+        cert1_ids = set()
+        for cert in certs1:
+            cert_id = cert.get('certificate_id')
+            # Ensure the ID is not None and is a hashable type before adding to the set
+            if cert_id and isinstance(cert_id, (int, str, float, bool, tuple)):
+                cert1_ids.add(cert_id)
 
         # Find certificates in the second list that match
         for cert in certs2:
-            if cert.get('certificate_id') in cert1_ids:
-                shared.append(cert)
+            cert_id = cert.get('certificate_id')
+            if cert_id and isinstance(cert_id, (int, str, float, bool, tuple)):
+                if cert_id in cert1_ids:
+                    shared.append(cert)
 
         return shared
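
Note: the crt.sh hardening above can be checked in isolation. A self-contained sketch with fabricated records: entries whose certificate_id is missing, None, or unhashable are skipped instead of raising TypeError when building or probing the set.

    HASHABLE = (int, str, float, bool, tuple)

    def find_shared(certs1, certs2):
        ids1 = set()
        for cert in certs1:
            cert_id = cert.get('certificate_id')
            if cert_id and isinstance(cert_id, HASHABLE):  # skip None/unhashable ids
                ids1.add(cert_id)
        shared = []
        for cert in certs2:
            cert_id = cert.get('certificate_id')
            if cert_id and isinstance(cert_id, HASHABLE) and cert_id in ids1:
                shared.append(cert)
        return shared

    certs_a = [{'certificate_id': 101}, {'certificate_id': ['bad']}, {}]  # fabricated data
    certs_b = [{'certificate_id': 101}, {'certificate_id': 202}]
    print(find_shared(certs_a, certs_b))  # -> [{'certificate_id': 101}]
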
diff --git a/static/js/graph.js b/static/js/graph.js
index 9a52f1e..07e4609 100644
--- a/static/js/graph.js
+++ b/static/js/graph.js
@@ -389,8 +389,8 @@ class GraphManager {
         });
 
         const filteredNodes = graphData.nodes.filter(node => {
-            // Only include nodes that are NOT members of large entities
-            return !this.largeEntityMembers.has(node.id);
+            // Only include nodes that are NOT members of large entities, but always include the container itself
+            return !this.largeEntityMembers.has(node.id) || node.type === 'large_entity';
         });
 
         console.log(`Filtered ${graphData.nodes.length - filteredNodes.length} large entity member nodes from visualization`);
diff --git a/static/js/main.js b/static/js/main.js
index 3a2ecdd..547b59d 100644
--- a/static/js/main.js
+++ b/static/js/main.js
@@ -189,7 +189,7 @@ class DNSReconApp {
             this.elements.resetApiKeys.addEventListener('click', () => this.resetApiKeys());
         }
 
-        // ** FIX: Listen for the custom event from the graph **
+        // Listen for the custom event from the graph
         document.addEventListener('nodeSelected', (e) => {
             this.showNodeModal(e.detail.node);
         });
@@ -1092,8 +1092,6 @@ class DNSReconApp {
         return html;
     }
 
-
-
     generateLargeEntityDetails(node) {
         const attributes = node.attributes || {};
         const nodes = attributes.nodes || [];
@@ -1123,16 +1121,23 @@ class DNSReconApp {
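
Note: a Python rendering of the graph.js filter change above, to make the intent explicit: members of a large entity are hidden, but the container node itself must survive the filter even if its id ends up in the member set. All ids and types here are hypothetical.

    members = {"large_entity_crtsh_1", "a.example.com"}  # hypothetical member ids

    def keep(node):
        # Mirror of: !largeEntityMembers.has(node.id) || node.type === 'large_entity'
        return node["id"] not in members or node["type"] == "large_entity"

    nodes = [
        {"id": "large_entity_crtsh_1", "type": "large_entity"},
        {"id": "a.example.com", "type": "domain"},
        {"id": "mail.example.com", "type": "domain"},
    ]
    print([n["id"] for n in nodes if keep(n)])
    # -> ['large_entity_crtsh_1', 'mail.example.com']
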