From b2c5d2331c48e1b8a8cd4e71a3b932329ccdd3f5 Mon Sep 17 00:00:00 2001
From: overcuriousity
Date: Sat, 20 Sep 2025 20:56:31 +0200
Subject: [PATCH] Fix large entity extraction: backend validation, edge re-routing, and frontend sync

---
 app.py             |  61 ++++++++++++++---
 core/scanner.py    | 160 +++++++++++++++++++++++++++++++++++++-------
 static/js/graph.js |  17 ++---
 static/js/main.js  |  69 ++++++++++++--------
 4 files changed, 245 insertions(+), 62 deletions(-)

diff --git a/app.py b/app.py
index 1dd1693..7ab2466 100644
--- a/app.py
+++ b/app.py
@@ -187,7 +187,11 @@ def get_graph_data():
 
 @app.route('/api/graph/large-entity/extract', methods=['POST'])
 def extract_from_large_entity():
-    """Extract a node from a large entity."""
+    """
+    FIXED: Extract a node from a large entity with proper error handling.
+    """
     try:
-        data = request.get_json()
+        data = request.get_json(silent=True)
+        if not isinstance(data, dict):
+            return jsonify({'success': False, 'error': 'Invalid JSON in request body'}), 400
         large_entity_id = data.get('large_entity_id')
@@ -200,17 +204,64 @@ def extract_from_large_entity():
         if not scanner:
             return jsonify({'success': False, 'error': 'No active session found'}), 404
 
+        # FIXED: Check if node exists and provide better error messages
+        if not scanner.graph.graph.has_node(node_id):
+            return jsonify({
+                'success': False,
+                'error': f'Node {node_id} not found in graph'
+            }), 404
+
+        # FIXED: Check if node is actually part of the large entity
+        node_data = scanner.graph.graph.nodes[node_id]
+        metadata = node_data.get('metadata', {})
+        current_large_entity = metadata.get('large_entity_id')
+
+        if not current_large_entity:
+            return jsonify({
+                'success': False,
+                'error': f'Node {node_id} is not part of any large entity'
+            }), 400
+
+        if current_large_entity != large_entity_id:
+            return jsonify({
+                'success': False,
+                'error': f'Node {node_id} belongs to large entity {current_large_entity}, not {large_entity_id}'
+            }), 400
+
+        # FIXED: Check if large entity exists
+        if not scanner.graph.graph.has_node(large_entity_id):
+            return jsonify({
+                'success': False,
+                'error': f'Large entity {large_entity_id} not found'
+            }), 404
+
+        # Perform the extraction
         success = scanner.extract_node_from_large_entity(large_entity_id, node_id)
 
         if success:
+            # Force immediate session state update
             session_manager.update_session_scanner(user_session_id, scanner)
-            return jsonify({'success': True, 'message': f'Node {node_id} extracted successfully.'})
+
+            return jsonify({
+                'success': True,
+                'message': f'Node {node_id} extracted successfully from {large_entity_id}.',
+                'extracted_node': node_id,
+                'large_entity': large_entity_id
+            })
         else:
-            return jsonify({'success': False, 'error': f'Failed to extract node {node_id}.'}), 500
+            # This should not happen with the improved checks above, but handle it gracefully
+            return jsonify({
+                'success': False,
+                'error': f'Failed to extract node {node_id} from {large_entity_id}. Node may have already been extracted.'
+            }), 409
     except Exception as e:
         traceback.print_exc()
-        return jsonify({'success': False, 'error': f'Internal server error: {str(e)}'}), 500
+        return jsonify({
+            'success': False,
+            'error': f'Internal server error: {str(e)}',
+            'error_type': type(e).__name__
+        }), 500
 
 
 @app.route('/api/graph/node/<node_id>', methods=['DELETE'])
 def delete_graph_node(node_id):
diff --git a/core/scanner.py b/core/scanner.py
index 5ebc1a4..236b7ca 100644
--- a/core/scanner.py
+++ b/core/scanner.py
@@ -860,7 +860,7 @@ class Scanner:
 
     def extract_node_from_large_entity(self, large_entity_id: str, node_id: str) -> bool:
         """
-        Removes a node from a large entity, allowing it to be processed normally.
+        FIXED: Extract a node from a large entity with proper backend updates and edge re-routing.
         """
         if not self.graph.graph.has_node(node_id):
             return False
@@ -868,31 +868,147 @@ class Scanner:
         node_data = self.graph.graph.nodes[node_id]
         metadata = node_data.get('metadata', {})
 
-        if metadata.get('large_entity_id') == large_entity_id:
-            # Remove the large entity tag
-            del metadata['large_entity_id']
-            self.graph.add_node(node_id, NodeType(node_data['type']), metadata=metadata)
+        if metadata.get('large_entity_id') != large_entity_id:
+            return False
+
+        # FIXED: Update the large entity's attributes to remove the extracted node
+        if self.graph.graph.has_node(large_entity_id):
+            le_node_data = self.graph.graph.nodes[large_entity_id]
+            le_attributes = le_node_data.get('attributes', [])
 
-        # Re-enqueue the node for full processing
-        is_ip = _is_valid_ip(node_id)
-        eligible_providers = self._get_eligible_providers(node_id, is_ip, False)
-        for provider in eligible_providers:
-            provider_name = provider.get_name()
-            priority = self._get_priority(provider_name)
-            # Use current depth of the large entity if available, else 0
-            depth = 0
-            if self.graph.graph.has_node(large_entity_id):
-                le_attrs = self.graph.graph.nodes[large_entity_id].get('attributes', [])
-                depth_attr = next((a for a in le_attrs if a['name'] == 'discovery_depth'), None)
-                if depth_attr:
-                    depth = depth_attr['value']
+            # Update the 'nodes' attribute to remove extracted node
+            nodes_attr = next((attr for attr in le_attributes if attr['name'] == 'nodes'), None)
+            if nodes_attr and isinstance(nodes_attr['value'], list):
+                if node_id in nodes_attr['value']:
+                    nodes_attr['value'].remove(node_id)
+
+            # Update the 'count' attribute
+            count_attr = next((attr for attr in le_attributes if attr['name'] == 'count'), None)
+            if count_attr and isinstance(count_attr['value'], (int, float)):
+                count_attr['value'] = max(0, count_attr['value'] - 1)
+
+            # Update the large entity node
+            self.graph.add_node(
+                large_entity_id,
+                NodeType.LARGE_ENTITY,
+                attributes=le_attributes,
+                description=le_node_data.get('description', ''),
+                metadata=le_node_data.get('metadata', {})
+            )
+
+        # Remove the large entity tag from extracted node
+        updated_metadata = metadata.copy()
+        del updated_metadata['large_entity_id']
+
+        # Add extraction history for forensic integrity
+        extraction_record = {
+            'extracted_at': datetime.now(timezone.utc).isoformat(),
+            'extracted_from': large_entity_id,
+            'extraction_method': 'manual'
+        }
+
+        if 'extraction_history' not in updated_metadata:
+            updated_metadata['extraction_history'] = []
+        updated_metadata['extraction_history'].append(extraction_record)
+
+        # Update the extracted node
+        self.graph.add_node(node_id, NodeType(node_data['type']),
metadata=updated_metadata) + + # FIXED: Re-route edges that were pointing to the large entity + self._reroute_large_entity_edges(large_entity_id, node_id) + + # Re-enqueue the node for full processing + is_ip = _is_valid_ip(node_id) + eligible_providers = self._get_eligible_providers(node_id, is_ip, False) + for provider in eligible_providers: + provider_name = provider.get_name() + priority = self._get_priority(provider_name) + + # Use current depth of the large entity if available, else 0 + depth = 0 + if self.graph.graph.has_node(large_entity_id): + le_attrs = self.graph.graph.nodes[large_entity_id].get('attributes', []) + depth_attr = next((a for a in le_attrs if a['name'] == 'discovery_depth'), None) + if depth_attr: + depth = depth_attr['value'] - self.task_queue.put((time.time(), priority, (provider_name, node_id, depth))) - self.total_tasks_ever_enqueued += 1 + self.task_queue.put((time.time(), priority, (provider_name, node_id, depth))) + self.total_tasks_ever_enqueued += 1 + + # Force session state update for immediate frontend sync + self._update_session_state() + + return True + + def _reroute_large_entity_edges(self, large_entity_id: str, extracted_node_id: str) -> None: + """ + FIXED: Re-route edges from large entity to extracted node where appropriate. + """ + if not self.graph.graph.has_node(large_entity_id) or not self.graph.graph.has_node(extracted_node_id): + return + + edges_to_reroute = [] + + # Find edges pointing TO the large entity that should point to the extracted node + for source, target, edge_data in self.graph.graph.in_edges(large_entity_id, data=True): + # Check if this edge was originally meant for the extracted node + raw_data = edge_data.get('raw_data', {}) - return True + # If the raw data suggests this edge was for the extracted node, re-route it + if (raw_data.get('original_target') == extracted_node_id or + self._should_reroute_edge(edge_data, extracted_node_id)): + edges_to_reroute.append(('in', source, target, edge_data)) + + # Find edges pointing FROM the large entity that should point from the extracted node + for source, target, edge_data in self.graph.graph.out_edges(large_entity_id, data=True): + raw_data = edge_data.get('raw_data', {}) - return False + if (raw_data.get('original_source') == extracted_node_id or + self._should_reroute_edge(edge_data, extracted_node_id)): + edges_to_reroute.append(('out', source, target, edge_data)) + + # Re-route the edges + for direction, source, target, edge_data in edges_to_reroute: + # Remove old edge + self.graph.graph.remove_edge(source, target) + + # Add new edge with extracted node + if direction == 'in': + new_target = extracted_node_id + new_source = source + else: # direction == 'out' + new_source = extracted_node_id + new_target = target + + # Add the re-routed edge + self.graph.add_edge( + source_id=new_source, + target_id=new_target, + relationship_type=edge_data.get('relationship_type', 'unknown'), + confidence_score=edge_data.get('confidence_score', 0.5), + source_provider=edge_data.get('source_provider', 'rerouted'), + raw_data=dict(edge_data.get('raw_data', {}), **{'rerouted_from_large_entity': large_entity_id}) + ) + + def _should_reroute_edge(self, edge_data: dict, extracted_node_id: str) -> bool: + """ + Determine if an edge should be re-routed to an extracted node. + This is a heuristic-based approach since we don't store original targets. 
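+        For example, an inbound 'dns_a_record' edge that was collapsed onto the
+        large entity describes a single member host, so it is re-routed to the
+        extracted node; grouping edges that describe the entity as a whole stay.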
+ """ + relationship_type = edge_data.get('relationship_type', '') + + # For now, re-route DNS and certificate-based relationships + # These are likely to be node-specific rather than entity-wide + reroutable_types = [ + 'dns_a_record', 'dns_aaaa_record', 'dns_cname_record', + 'dns_mx_record', 'dns_ptr_record', + 'crtsh_san_certificate', 'crtsh_cert_issuer' + ] + + return any(rtype in relationship_type for rtype in reroutable_types) def _process_provider_result_unified(self, target: str, provider: BaseProvider, provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]: diff --git a/static/js/graph.js b/static/js/graph.js index 89efeae..d2b8ae6 100644 --- a/static/js/graph.js +++ b/static/js/graph.js @@ -353,9 +353,6 @@ class GraphManager { }); } - /** - * @param {Object} graphData - Graph data from backend - */ updateGraph(graphData) { if (!graphData || !graphData.nodes || !graphData.edges) { console.warn('Invalid graph data received'); @@ -382,16 +379,18 @@ class GraphManager { const nodeMap = new Map(graphData.nodes.map(node => [node.id, node])); - // Filter out hidden nodes before processing for rendering - const filteredNodes = graphData.nodes.filter(node => - !(node.metadata && node.metadata.large_entity_id) - ); - + // FIXED: Process all nodes first, then apply hiding logic correctly const processedNodes = graphData.nodes.map(node => { const processed = this.processNode(node); + + // FIXED: Only hide if node is still a large entity member if (node.metadata && node.metadata.large_entity_id) { processed.hidden = true; + } else { + // FIXED: Ensure extracted nodes are visible + processed.hidden = false; } + return processed; }); @@ -401,6 +400,7 @@ class GraphManager { let fromId = edge.from; let toId = edge.to; + // FIXED: Only re-route if nodes are STILL in large entities if (fromNode && fromNode.metadata && fromNode.metadata.large_entity_id) { fromId = fromNode.metadata.large_entity_id; } @@ -423,6 +423,7 @@ class GraphManager { const newNodes = processedNodes.filter(node => !existingNodeIds.includes(node.id)); const newEdges = processedEdges.filter(edge => !existingEdgeIds.includes(edge.id)); + // FIXED: Update all nodes to ensure extracted nodes become visible this.nodes.update(processedNodes); this.edges.update(processedEdges); diff --git a/static/js/main.js b/static/js/main.js index 1d4b63a..0721d53 100644 --- a/static/js/main.js +++ b/static/js/main.js @@ -2023,6 +2023,16 @@ class DNSReconApp { async extractNode(largeEntityId, nodeId) { try { + console.log(`Extracting node ${nodeId} from large entity ${largeEntityId}`); + + // Show immediate feedback + const button = document.querySelector(`[data-node-id="${nodeId}"][data-large-entity-id="${largeEntityId}"]`); + if (button) { + const originalContent = button.innerHTML; + button.innerHTML = '[...]'; + button.disabled = true; + } + const response = await this.apiCall('/api/graph/large-entity/extract', 'POST', { large_entity_id: largeEntityId, node_id: nodeId, @@ -2031,41 +2041,46 @@ class DNSReconApp { if (response.success) { this.showSuccess(response.message); - // If the scanner was idle, it's now running. Start polling to see the new node appear. - if (this.scanStatus === 'idle') { - this.startPolling(1000); - } else { - // If already scanning, force a quick graph update to see the change sooner. 
-                    setTimeout(() => this.updateGraph(), 500);
-                }
-
-                // Immediately update the modal view
-                if (this.graphManager) {
-                    const largeEntityNode = this.graphManager.nodes.get(largeEntityId);
-                    if (largeEntityNode && largeEntityNode.attributes) {
-
-                        // Find and update the 'nodes' attribute
-                        const nodesAttribute = largeEntityNode.attributes.find(attr => attr.name === 'nodes');
-                        if (nodesAttribute && Array.isArray(nodesAttribute.value)) {
-                            nodesAttribute.value = nodesAttribute.value.filter(id => id !== nodeId);
+                // FIXED: Don't update local modal data - let backend be source of truth
+                // Force immediate graph update to get fresh backend data
+                console.log('Extraction successful, updating graph with fresh backend data');
+                await this.updateGraph();
+
+                // FIXED: Re-fetch graph data instead of manipulating local state
+                setTimeout(async () => {
+                    try {
+                        const graphResponse = await this.apiCall('/api/graph');
+                        if (graphResponse.success) {
+                            this.graphManager.updateGraph(graphResponse.graph);
+
+                            // Update modal with fresh data if still open
+                            if (this.elements.nodeModal && this.elements.nodeModal.style.display === 'block') {
+                                if (this.graphManager.nodes) {
+                                    const updatedLargeEntity = this.graphManager.nodes.get(largeEntityId);
+                                    if (updatedLargeEntity) {
+                                        this.showNodeModal(updatedLargeEntity);
+                                    }
+                                }
+                            }
                         }
-
-                    // Find and update the 'count' attribute
-                    const countAttribute = largeEntityNode.attributes.find(attr => attr.name === 'count');
-                    if (countAttribute) {
-                        countAttribute.value = (countAttribute.value || 0) - 1;
-                    }
-
-                    // Re-render the modal with the updated data
-                    this.showNodeModal(largeEntityNode);
+                    } catch (error) {
+                        console.error('Error refreshing graph after extraction:', error);
                     }
-                }
+                }, 100);
+
             } else {
                 throw new Error(response.error || 'Extraction failed on the server.');
             }
         } catch (error) {
             console.error('Failed to extract node:', error);
             this.showError(`Extraction failed: ${error.message}`);
+
+            // Restore button state on error
+            const button = document.querySelector(`[data-node-id="${nodeId}"][data-large-entity-id="${largeEntityId}"]`);
+            if (button) {
+                button.innerHTML = button.dataset.originalContent || '[+]';
+                button.disabled = false;
+            }
         }
     }
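---
Usage sketch (not part of the patch): a minimal client-side check of the new
endpoint and its status codes. The base URL, session handling, and node IDs
are assumptions for illustration; only the route, the request payload, and the
200/400/404/409 responses come from the patch above.

    import requests

    BASE = 'http://127.0.0.1:5000'  # assumed dev server address

    session = requests.Session()  # assumed to carry the scanner's session cookie

    resp = session.post(
        f'{BASE}/api/graph/large-entity/extract',
        json={
            'large_entity_id': 'large_entity_example',  # hypothetical IDs
            'node_id': 'www.example.com',
        },
    )
    body = resp.json()

    if resp.status_code == 200 and body.get('success'):
        print(f"extracted {body['extracted_node']} from {body['large_entity']}")
    elif resp.status_code == 409:
        print('node may have already been extracted')  # new conflict path
    else:
        print(f"extraction failed ({resp.status_code}): {body.get('error')}")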