work on large entity extraction

overcuriousity 2025-09-20 20:56:31 +02:00
parent 602739246f
commit b2c5d2331c
4 changed files with 241 additions and 61 deletions

app.py (59 changes)
View File

@@ -187,7 +187,9 @@ def get_graph_data():
 @app.route('/api/graph/large-entity/extract', methods=['POST'])
 def extract_from_large_entity():
-    """Extract a node from a large entity."""
+    """
+    FIXED: Extract a node from a large entity with proper error handling.
+    """
     try:
         data = request.get_json()
         large_entity_id = data.get('large_entity_id')
@@ -200,17 +202,66 @@ def extract_from_large_entity():
         if not scanner:
             return jsonify({'success': False, 'error': 'No active session found'}), 404
 
+        # FIXED: Check if node exists and provide better error messages
+        if not scanner.graph.graph.has_node(node_id):
+            return jsonify({
+                'success': False,
+                'error': f'Node {node_id} not found in graph'
+            }), 404
+
+        # FIXED: Check if node is actually part of the large entity
+        node_data = scanner.graph.graph.nodes[node_id]
+        metadata = node_data.get('metadata', {})
+        current_large_entity = metadata.get('large_entity_id')
+        if not current_large_entity:
+            return jsonify({
+                'success': False,
+                'error': f'Node {node_id} is not part of any large entity'
+            }), 400
+        if current_large_entity != large_entity_id:
+            return jsonify({
+                'success': False,
+                'error': f'Node {node_id} belongs to large entity {current_large_entity}, not {large_entity_id}'
+            }), 400
+
+        # FIXED: Check if large entity exists
+        if not scanner.graph.graph.has_node(large_entity_id):
+            return jsonify({
+                'success': False,
+                'error': f'Large entity {large_entity_id} not found'
+            }), 404
+
+        # Perform the extraction
         success = scanner.extract_node_from_large_entity(large_entity_id, node_id)
 
         if success:
+            # Force immediate session state update
             session_manager.update_session_scanner(user_session_id, scanner)
-            return jsonify({'success': True, 'message': f'Node {node_id} extracted successfully.'})
+            return jsonify({
+                'success': True,
+                'message': f'Node {node_id} extracted successfully from {large_entity_id}.',
+                'extracted_node': node_id,
+                'large_entity': large_entity_id
+            })
         else:
-            return jsonify({'success': False, 'error': f'Failed to extract node {node_id}.'}), 500
+            # This should not happen with the improved checks above, but handle it gracefully
+            return jsonify({
+                'success': False,
+                'error': f'Failed to extract node {node_id} from {large_entity_id}. Node may have already been extracted.'
+            }), 409
 
+    except json.JSONDecodeError:
+        return jsonify({'success': False, 'error': 'Invalid JSON in request body'}), 400
     except Exception as e:
         traceback.print_exc()
-        return jsonify({'success': False, 'error': f'Internal server error: {str(e)}'}), 500
+        return jsonify({
+            'success': False,
+            'error': f'Internal server error: {str(e)}',
+            'error_type': type(e).__name__
+        }), 500
 
 @app.route('/api/graph/node/<node_id>', methods=['DELETE'])
 def delete_graph_node(node_id):
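
The endpoint now distinguishes validation failures (400 and 404), extraction conflicts (409), and server errors (500), each with structured JSON. For illustration only (not part of this commit), a minimal Python client sketch of handling those codes; the base URL and IDs are invented:

import requests

BASE_URL = 'http://127.0.0.1:5000'  # assumed local dev server

# Hypothetical IDs for illustration; real values come from the graph data.
payload = {'large_entity_id': 'large_entity_example', 'node_id': 'sub.example.com'}

resp = requests.post(f'{BASE_URL}/api/graph/large-entity/extract', json=payload, timeout=10)
body = resp.json()

if resp.status_code == 200 and body.get('success'):
    print(f"extracted {body['extracted_node']} from {body['large_entity']}")
elif resp.status_code in (400, 404):
    # Validation failures: unknown node/entity or wrong membership.
    print(f"validation failed: {body.get('error')}")
elif resp.status_code == 409:
    # Conflict: the node may already have been extracted.
    print(f"conflict: {body.get('error')}")
else:
    print(f"server error: {body.get('error')} ({body.get('error_type')})")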

View File

@@ -860,7 +860,7 @@ class Scanner:
     def extract_node_from_large_entity(self, large_entity_id: str, node_id: str) -> bool:
         """
-        Removes a node from a large entity, allowing it to be processed normally.
+        FIXED: Extract a node from a large entity with proper backend updates and edge re-routing.
         """
         if not self.graph.graph.has_node(node_id):
             return False
@@ -868,31 +868,144 @@ class Scanner:
         node_data = self.graph.graph.nodes[node_id]
         metadata = node_data.get('metadata', {})
 
-        if metadata.get('large_entity_id') == large_entity_id:
-            # Remove the large entity tag
-            del metadata['large_entity_id']
-            self.graph.add_node(node_id, NodeType(node_data['type']), metadata=metadata)
-
-            # Re-enqueue the node for full processing
-            is_ip = _is_valid_ip(node_id)
-            eligible_providers = self._get_eligible_providers(node_id, is_ip, False)
-            for provider in eligible_providers:
-                provider_name = provider.get_name()
-                priority = self._get_priority(provider_name)
-                # Use current depth of the large entity if available, else 0
-                depth = 0
-                if self.graph.graph.has_node(large_entity_id):
-                    le_attrs = self.graph.graph.nodes[large_entity_id].get('attributes', [])
-                    depth_attr = next((a for a in le_attrs if a['name'] == 'discovery_depth'), None)
-                    if depth_attr:
-                        depth = depth_attr['value']
-                self.task_queue.put((time.time(), priority, (provider_name, node_id, depth)))
-                self.total_tasks_ever_enqueued += 1
-            return True
-        return False
+        if metadata.get('large_entity_id') != large_entity_id:
+            return False
+
+        # FIXED: Update the large entity's attributes to remove the extracted node
+        if self.graph.graph.has_node(large_entity_id):
+            le_node_data = self.graph.graph.nodes[large_entity_id]
+            le_attributes = le_node_data.get('attributes', [])
+
+            # Update the 'nodes' attribute to remove extracted node
+            nodes_attr = next((attr for attr in le_attributes if attr['name'] == 'nodes'), None)
+            if nodes_attr and isinstance(nodes_attr['value'], list):
+                if node_id in nodes_attr['value']:
+                    nodes_attr['value'].remove(node_id)
+
+            # Update the 'count' attribute
+            count_attr = next((attr for attr in le_attributes if attr['name'] == 'count'), None)
+            if count_attr and isinstance(count_attr['value'], (int, float)):
+                count_attr['value'] = max(0, count_attr['value'] - 1)
+
+            # Update the large entity node
+            self.graph.add_node(
+                large_entity_id,
+                NodeType.LARGE_ENTITY,
+                attributes=le_attributes,
+                description=le_node_data.get('description', ''),
+                metadata=le_node_data.get('metadata', {})
+            )
+
+        # Remove the large entity tag from extracted node
+        updated_metadata = metadata.copy()
+        del updated_metadata['large_entity_id']
+
+        # Add extraction history for forensic integrity
+        extraction_record = {
+            'extracted_at': datetime.now(timezone.utc).isoformat(),
+            'extracted_from': large_entity_id,
+            'extraction_method': 'manual'
+        }
+        if 'extraction_history' not in updated_metadata:
+            updated_metadata['extraction_history'] = []
+        updated_metadata['extraction_history'].append(extraction_record)
+
+        # Update the extracted node
+        self.graph.add_node(node_id, NodeType(node_data['type']), metadata=updated_metadata)
+
+        # FIXED: Re-route edges that were pointing to the large entity
+        self._reroute_large_entity_edges(large_entity_id, node_id)
+
+        # Re-enqueue the node for full processing
+        is_ip = _is_valid_ip(node_id)
+        eligible_providers = self._get_eligible_providers(node_id, is_ip, False)
+        for provider in eligible_providers:
+            provider_name = provider.get_name()
+            priority = self._get_priority(provider_name)
+            # Use current depth of the large entity if available, else 0
+            depth = 0
+            if self.graph.graph.has_node(large_entity_id):
+                le_attrs = self.graph.graph.nodes[large_entity_id].get('attributes', [])
+                depth_attr = next((a for a in le_attrs if a['name'] == 'discovery_depth'), None)
+                if depth_attr:
+                    depth = depth_attr['value']
+            self.task_queue.put((time.time(), priority, (provider_name, node_id, depth)))
+            self.total_tasks_ever_enqueued += 1
+
+        # Force session state update for immediate frontend sync
+        self._update_session_state()
+
+        return True
+
+    def _reroute_large_entity_edges(self, large_entity_id: str, extracted_node_id: str) -> None:
+        """
+        FIXED: Re-route edges from large entity to extracted node where appropriate.
+        """
+        if not self.graph.graph.has_node(large_entity_id) or not self.graph.graph.has_node(extracted_node_id):
+            return
+
+        edges_to_reroute = []
+
+        # Find edges pointing TO the large entity that should point to the extracted node
+        for source, target, edge_data in self.graph.graph.in_edges(large_entity_id, data=True):
+            # Check if this edge was originally meant for the extracted node
+            raw_data = edge_data.get('raw_data', {})
+            # If the raw data suggests this edge was for the extracted node, re-route it
+            if (raw_data.get('original_target') == extracted_node_id or
+                    self._should_reroute_edge(edge_data, extracted_node_id)):
+                edges_to_reroute.append(('in', source, target, edge_data))
+
+        # Find edges pointing FROM the large entity that should point from the extracted node
+        for source, target, edge_data in self.graph.graph.out_edges(large_entity_id, data=True):
+            raw_data = edge_data.get('raw_data', {})
+            if (raw_data.get('original_source') == extracted_node_id or
+                    self._should_reroute_edge(edge_data, extracted_node_id)):
+                edges_to_reroute.append(('out', source, target, edge_data))
+
+        # Re-route the edges
+        for direction, source, target, edge_data in edges_to_reroute:
+            # Remove old edge
+            self.graph.graph.remove_edge(source, target)
+
+            # Add new edge with extracted node
+            if direction == 'in':
+                new_target = extracted_node_id
+                new_source = source
+            else:  # direction == 'out'
+                new_source = extracted_node_id
+                new_target = target
+
+            # Add the re-routed edge
+            self.graph.add_edge(
+                source_id=new_source,
+                target_id=new_target,
+                relationship_type=edge_data.get('relationship_type', 'unknown'),
+                confidence_score=edge_data.get('confidence_score', 0.5),
+                source_provider=edge_data.get('source_provider', 'rerouted'),
+                raw_data=dict(edge_data.get('raw_data', {}), **{'rerouted_from_large_entity': large_entity_id})
+            )
+
+    def _should_reroute_edge(self, edge_data: dict, extracted_node_id: str) -> bool:
+        """
+        Determine if an edge should be re-routed to an extracted node.
+        This is a heuristic-based approach since we don't store original targets.
+        """
+        relationship_type = edge_data.get('relationship_type', '')
+
+        # For now, re-route DNS and certificate-based relationships
+        # These are likely to be node-specific rather than entity-wide
+        reroutable_types = [
+            'dns_a_record', 'dns_aaaa_record', 'dns_cname_record',
+            'dns_mx_record', 'dns_ptr_record',
+            'crtsh_san_certificate', 'crtsh_cert_issuer'
+        ]
+
+        return any(rtype in relationship_type for rtype in reroutable_types)
 
     def _process_provider_result_unified(self, target: str, provider: BaseProvider,
                                          provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
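
Because _should_reroute_edge looks only at the relationship type (its extracted_node_id parameter is currently unused), the heuristic can be checked in isolation. A standalone sketch, not from this commit, with invented sample edges:

def should_reroute_edge(edge_data: dict) -> bool:
    # Mirrors Scanner._should_reroute_edge: substring match on relationship type.
    relationship_type = edge_data.get('relationship_type', '')
    reroutable_types = [
        'dns_a_record', 'dns_aaaa_record', 'dns_cname_record',
        'dns_mx_record', 'dns_ptr_record',
        'crtsh_san_certificate', 'crtsh_cert_issuer',
    ]
    return any(rtype in relationship_type for rtype in reroutable_types)

# Node-specific DNS/certificate edges are re-routed; anything else stays on the entity.
assert should_reroute_edge({'relationship_type': 'dns_a_record'})
assert should_reroute_edge({'relationship_type': 'crtsh_san_certificate'})
assert not should_reroute_edge({'relationship_type': 'whois_registrar'})  # invented type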

View File

@@ -353,9 +353,6 @@ class GraphManager {
         });
     }
 
-    /**
-     * @param {Object} graphData - Graph data from backend
-     */
    updateGraph(graphData) {
         if (!graphData || !graphData.nodes || !graphData.edges) {
             console.warn('Invalid graph data received');
@@ -382,16 +379,18 @@ class GraphManager {
         const nodeMap = new Map(graphData.nodes.map(node => [node.id, node]));
 
-        // Filter out hidden nodes before processing for rendering
-        const filteredNodes = graphData.nodes.filter(node =>
-            !(node.metadata && node.metadata.large_entity_id)
-        );
-
+        // FIXED: Process all nodes first, then apply hiding logic correctly
         const processedNodes = graphData.nodes.map(node => {
             const processed = this.processNode(node);
+            // FIXED: Only hide if node is still a large entity member
             if (node.metadata && node.metadata.large_entity_id) {
                 processed.hidden = true;
+            } else {
+                // FIXED: Ensure extracted nodes are visible
+                processed.hidden = false;
             }
             return processed;
         });
@@ -401,6 +400,7 @@ class GraphManager {
         let fromId = edge.from;
         let toId = edge.to;
 
+        // FIXED: Only re-route if nodes are STILL in large entities
         if (fromNode && fromNode.metadata && fromNode.metadata.large_entity_id) {
             fromId = fromNode.metadata.large_entity_id;
         }
@@ -423,6 +423,7 @@ class GraphManager {
         const newNodes = processedNodes.filter(node => !existingNodeIds.includes(node.id));
         const newEdges = processedEdges.filter(edge => !existingEdgeIds.includes(edge.id));
 
+        // FIXED: Update all nodes to ensure extracted nodes become visible
         this.nodes.update(processedNodes);
         this.edges.update(processedEdges);

View File

@@ -2023,6 +2023,16 @@ class DNSReconApp {
     async extractNode(largeEntityId, nodeId) {
         try {
+            console.log(`Extracting node ${nodeId} from large entity ${largeEntityId}`);
+
+            // Show immediate feedback
+            const button = document.querySelector(`[data-node-id="${nodeId}"][data-large-entity-id="${largeEntityId}"]`);
+            if (button) {
+                const originalContent = button.innerHTML;
+                button.innerHTML = '[...]';
+                button.disabled = true;
+            }
+
             const response = await this.apiCall('/api/graph/large-entity/extract', 'POST', {
                 large_entity_id: largeEntityId,
                 node_id: nodeId,
@@ -2031,41 +2041,46 @@ class DNSReconApp {
             if (response.success) {
                 this.showSuccess(response.message);
 
-                // If the scanner was idle, it's now running. Start polling to see the new node appear.
-                if (this.scanStatus === 'idle') {
-                    this.startPolling(1000);
-                } else {
-                    // If already scanning, force a quick graph update to see the change sooner.
-                    setTimeout(() => this.updateGraph(), 500);
-                }
-
-                // Immediately update the modal view
-                if (this.graphManager) {
-                    const largeEntityNode = this.graphManager.nodes.get(largeEntityId);
-                    if (largeEntityNode && largeEntityNode.attributes) {
-
-                        // Find and update the 'nodes' attribute
-                        const nodesAttribute = largeEntityNode.attributes.find(attr => attr.name === 'nodes');
-                        if (nodesAttribute && Array.isArray(nodesAttribute.value)) {
-                            nodesAttribute.value = nodesAttribute.value.filter(id => id !== nodeId);
-                        }
-
-                        // Find and update the 'count' attribute
-                        const countAttribute = largeEntityNode.attributes.find(attr => attr.name === 'count');
-                        if (countAttribute) {
-                            countAttribute.value = (countAttribute.value || 0) - 1;
-                        }
-
-                        // Re-render the modal with the updated data
-                        this.showNodeModal(largeEntityNode);
-                    }
-                }
+                // FIXED: Don't update local modal data - let backend be source of truth
+                // Force immediate graph update to get fresh backend data
+                console.log('Extraction successful, updating graph with fresh backend data');
+                await this.updateGraph();
+
+                // FIXED: Re-fetch graph data instead of manipulating local state
+                setTimeout(async () => {
+                    try {
+                        const graphResponse = await this.apiCall('/api/graph');
+                        if (graphResponse.success) {
+                            this.graphManager.updateGraph(graphResponse.graph);
+
+                            // Update modal with fresh data if still open
+                            if (this.elements.nodeModal && this.elements.nodeModal.style.display === 'block') {
+                                if (this.graphManager.nodes) {
+                                    const updatedLargeEntity = this.graphManager.nodes.get(largeEntityId);
+                                    if (updatedLargeEntity) {
+                                        this.showNodeModal(updatedLargeEntity);
+                                    }
+                                }
+                            }
+                        }
+                    } catch (error) {
+                        console.error('Error refreshing graph after extraction:', error);
+                    }
+                }, 100);
             } else {
                 throw new Error(response.error || 'Extraction failed on the server.');
             }
         } catch (error) {
             console.error('Failed to extract node:', error);
             this.showError(`Extraction failed: ${error.message}`);
+
+            // Restore button state on error
+            const button = document.querySelector(`[data-node-id="${nodeId}"][data-large-entity-id="${largeEntityId}"]`);
+            if (button) {
+                button.innerHTML = '[+]';
+                button.disabled = false;
+            }
         }
     }
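
End to end, the intended flow is: POST the extraction, let the backend mutate the graph and re-enqueue the node, then re-fetch /api/graph rather than patching local state. A hedged Python sketch of that sequence, not from this commit; the base URL, the GET method, and the {'success': ..., 'graph': ...} response shape are assumptions drawn from the JS above:

import time
import requests

BASE_URL = 'http://127.0.0.1:5000'  # assumed local dev server

def extract_and_refresh(large_entity_id: str, node_id: str) -> dict:
    # Step 1: ask the backend to extract the node; it is the source of truth.
    resp = requests.post(
        f'{BASE_URL}/api/graph/large-entity/extract',
        json={'large_entity_id': large_entity_id, 'node_id': node_id},
        timeout=10,
    )
    resp.raise_for_status()

    # Step 2: brief pause, mirroring the frontend's 100 ms setTimeout.
    time.sleep(0.1)

    # Step 3: re-fetch the whole graph instead of patching local state.
    graph_resp = requests.get(f'{BASE_URL}/api/graph', timeout=10).json()
    return graph_resp.get('graph', {})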