From b2c5d2331c48e1b8a8cd4e71a3b932329ccdd3f5 Mon Sep 17 00:00:00 2001
From: overcuriousity
Date: Sat, 20 Sep 2025 20:56:31 +0200
Subject: [PATCH] Fix large entity extraction: backend validation, edge re-routing, and frontend sync

---
 app.py             |  61 ++++++++++++++---
 core/scanner.py    | 160 +++++++++++++++++++++++++++++++++++++-------
 static/js/graph.js |  17 ++---
 static/js/main.js  |  69 ++++++++++++--------
 4 files changed, 245 insertions(+), 62 deletions(-)

diff --git a/app.py b/app.py
index 1dd1693..7ab2466 100644
--- a/app.py
+++ b/app.py
@@ -187,7 +187,11 @@ def get_graph_data():
 
 @app.route('/api/graph/large-entity/extract', methods=['POST'])
 def extract_from_large_entity():
-    """Extract a node from a large entity."""
+    """
+    FIXED: Extract a node from a large entity with proper error handling.
+    """
     try:
-        data = request.get_json()
+        data = request.get_json(silent=True)
+        if not isinstance(data, dict):
+            return jsonify({'success': False, 'error': 'Invalid JSON in request body'}), 400
         large_entity_id = data.get('large_entity_id')
@@ -200,17 +204,64 @@ def extract_from_large_entity():
         if not scanner:
             return jsonify({'success': False, 'error': 'No active session found'}), 404
 
+        # FIXED: Check if node exists and provide better error messages
+        if not scanner.graph.graph.has_node(node_id):
+            return jsonify({
+                'success': False,
+                'error': f'Node {node_id} not found in graph'
+            }), 404
+
+        # FIXED: Check if node is actually part of the large entity
+        node_data = scanner.graph.graph.nodes[node_id]
+        metadata = node_data.get('metadata', {})
+        current_large_entity = metadata.get('large_entity_id')
+
+        if not current_large_entity:
+            return jsonify({
+                'success': False,
+                'error': f'Node {node_id} is not part of any large entity'
+            }), 400
+
+        if current_large_entity != large_entity_id:
+            return jsonify({
+                'success': False,
+                'error': f'Node {node_id} belongs to large entity {current_large_entity}, not {large_entity_id}'
+            }), 400
+
+        # FIXED: Check if large entity exists
+        if not scanner.graph.graph.has_node(large_entity_id):
+            return jsonify({
+                'success': False,
+                'error': f'Large entity {large_entity_id} not found'
+            }), 404
+
+        # Perform the extraction
         success = scanner.extract_node_from_large_entity(large_entity_id, node_id)
 
         if success:
+            # Force immediate session state update
             session_manager.update_session_scanner(user_session_id, scanner)
-            return jsonify({'success': True, 'message': f'Node {node_id} extracted successfully.'})
+
+            return jsonify({
+                'success': True,
+                'message': f'Node {node_id} extracted successfully from {large_entity_id}.',
+                'extracted_node': node_id,
+                'large_entity': large_entity_id
+            })
         else:
-            return jsonify({'success': False, 'error': f'Failed to extract node {node_id}.'}), 500
+            # This should not happen with the improved checks above, but handle it gracefully
+            return jsonify({
+                'success': False,
+                'error': f'Failed to extract node {node_id} from {large_entity_id}. Node may have already been extracted.'
+            }), 409
     except Exception as e:
         traceback.print_exc()
-        return jsonify({'success': False, 'error': f'Internal server error: {str(e)}'}), 500
+        return jsonify({
+            'success': False,
+            'error': f'Internal server error: {str(e)}',
+            'error_type': type(e).__name__
+        }), 500
 
 
 @app.route('/api/graph/node/<node_id>', methods=['DELETE'])
 def delete_graph_node(node_id):
diff --git a/core/scanner.py b/core/scanner.py
index 5ebc1a4..236b7ca 100644
--- a/core/scanner.py
+++ b/core/scanner.py
@@ -860,7 +860,7 @@ class Scanner:
 
     def extract_node_from_large_entity(self, large_entity_id: str, node_id: str) -> bool:
         """
-        Removes a node from a large entity, allowing it to be processed normally.
+        FIXED: Extract a node from a large entity with proper backend updates and edge re-routing.
         """
         if not self.graph.graph.has_node(node_id):
             return False
@@ -868,31 +868,147 @@ class Scanner:
         node_data = self.graph.graph.nodes[node_id]
         metadata = node_data.get('metadata', {})
 
-        if metadata.get('large_entity_id') == large_entity_id:
-            # Remove the large entity tag
-            del metadata['large_entity_id']
-            self.graph.add_node(node_id, NodeType(node_data['type']), metadata=metadata)
+        if metadata.get('large_entity_id') != large_entity_id:
+            return False
+
+        # FIXED: Update the large entity's attributes to remove the extracted node
+        if self.graph.graph.has_node(large_entity_id):
+            le_node_data = self.graph.graph.nodes[large_entity_id]
+            le_attributes = le_node_data.get('attributes', [])
 
-        # Re-enqueue the node for full processing
-        is_ip = _is_valid_ip(node_id)
-        eligible_providers = self._get_eligible_providers(node_id, is_ip, False)
-        for provider in eligible_providers:
-            provider_name = provider.get_name()
-            priority = self._get_priority(provider_name)
-            # Use current depth of the large entity if available, else 0
-            depth = 0
-            if self.graph.graph.has_node(large_entity_id):
-                le_attrs = self.graph.graph.nodes[large_entity_id].get('attributes', [])
-                depth_attr = next((a for a in le_attrs if a['name'] == 'discovery_depth'), None)
-                if depth_attr:
-                    depth = depth_attr['value']
+            # Update the 'nodes' attribute to remove extracted node
+            nodes_attr = next((attr for attr in le_attributes if attr['name'] == 'nodes'), None)
+            if nodes_attr and isinstance(nodes_attr['value'], list):
+                if node_id in nodes_attr['value']:
+                    nodes_attr['value'].remove(node_id)
+
+            # Update the 'count' attribute
+            count_attr = next((attr for attr in le_attributes if attr['name'] == 'count'), None)
+            if count_attr and isinstance(count_attr['value'], (int, float)):
+                count_attr['value'] = max(0, count_attr['value'] - 1)
+
+            # Update the large entity node
+            self.graph.add_node(
+                large_entity_id,
+                NodeType.LARGE_ENTITY,
+                attributes=le_attributes,
+                description=le_node_data.get('description', ''),
+                metadata=le_node_data.get('metadata', {})
+            )
+
+        # Remove the large entity tag from extracted node
+        updated_metadata = metadata.copy()
+        del updated_metadata['large_entity_id']
+
+        # Add extraction history for forensic integrity
+        extraction_record = {
+            'extracted_at': datetime.now(timezone.utc).isoformat(),
+            'extracted_from': large_entity_id,
+            'extraction_method': 'manual'
+        }
+
+        if 'extraction_history' not in updated_metadata:
+            updated_metadata['extraction_history'] = []
+        updated_metadata['extraction_history'].append(extraction_record)
+
+        # Update the extracted node
+        self.graph.add_node(node_id, NodeType(node_data['type']),
metadata=updated_metadata) + + # FIXED: Re-route edges that were pointing to the large entity + self._reroute_large_entity_edges(large_entity_id, node_id) + + # Re-enqueue the node for full processing + is_ip = _is_valid_ip(node_id) + eligible_providers = self._get_eligible_providers(node_id, is_ip, False) + for provider in eligible_providers: + provider_name = provider.get_name() + priority = self._get_priority(provider_name) + + # Use current depth of the large entity if available, else 0 + depth = 0 + if self.graph.graph.has_node(large_entity_id): + le_attrs = self.graph.graph.nodes[large_entity_id].get('attributes', []) + depth_attr = next((a for a in le_attrs if a['name'] == 'discovery_depth'), None) + if depth_attr: + depth = depth_attr['value'] - self.task_queue.put((time.time(), priority, (provider_name, node_id, depth))) - self.total_tasks_ever_enqueued += 1 + self.task_queue.put((time.time(), priority, (provider_name, node_id, depth))) + self.total_tasks_ever_enqueued += 1 + + # Force session state update for immediate frontend sync + self._update_session_state() + + return True + + def _reroute_large_entity_edges(self, large_entity_id: str, extracted_node_id: str) -> None: + """ + FIXED: Re-route edges from large entity to extracted node where appropriate. + """ + if not self.graph.graph.has_node(large_entity_id) or not self.graph.graph.has_node(extracted_node_id): + return + + edges_to_reroute = [] + + # Find edges pointing TO the large entity that should point to the extracted node + for source, target, edge_data in self.graph.graph.in_edges(large_entity_id, data=True): + # Check if this edge was originally meant for the extracted node + raw_data = edge_data.get('raw_data', {}) - return True + # If the raw data suggests this edge was for the extracted node, re-route it + if (raw_data.get('original_target') == extracted_node_id or + self._should_reroute_edge(edge_data, extracted_node_id)): + edges_to_reroute.append(('in', source, target, edge_data)) + + # Find edges pointing FROM the large entity that should point from the extracted node + for source, target, edge_data in self.graph.graph.out_edges(large_entity_id, data=True): + raw_data = edge_data.get('raw_data', {}) - return False + if (raw_data.get('original_source') == extracted_node_id or + self._should_reroute_edge(edge_data, extracted_node_id)): + edges_to_reroute.append(('out', source, target, edge_data)) + + # Re-route the edges + for direction, source, target, edge_data in edges_to_reroute: + # Remove old edge + self.graph.graph.remove_edge(source, target) + + # Add new edge with extracted node + if direction == 'in': + new_target = extracted_node_id + new_source = source + else: # direction == 'out' + new_source = extracted_node_id + new_target = target + + # Add the re-routed edge + self.graph.add_edge( + source_id=new_source, + target_id=new_target, + relationship_type=edge_data.get('relationship_type', 'unknown'), + confidence_score=edge_data.get('confidence_score', 0.5), + source_provider=edge_data.get('source_provider', 'rerouted'), + raw_data=dict(edge_data.get('raw_data', {}), **{'rerouted_from_large_entity': large_entity_id}) + ) + + def _should_reroute_edge(self, edge_data: dict, extracted_node_id: str) -> bool: + """ + Determine if an edge should be re-routed to an extracted node. + This is a heuristic-based approach since we don't store original targets. 
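+        For example, an inbound 'dns_a_record' edge that was collapsed onto the
+        large entity describes a single member host, so it is re-routed to the
+        extracted node; grouping edges that describe the entity as a whole stay.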
+ """ + relationship_type = edge_data.get('relationship_type', '') + + # For now, re-route DNS and certificate-based relationships + # These are likely to be node-specific rather than entity-wide + reroutable_types = [ + 'dns_a_record', 'dns_aaaa_record', 'dns_cname_record', + 'dns_mx_record', 'dns_ptr_record', + 'crtsh_san_certificate', 'crtsh_cert_issuer' + ] + + return any(rtype in relationship_type for rtype in reroutable_types) def _process_provider_result_unified(self, target: str, provider: BaseProvider, provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]: diff --git a/static/js/graph.js b/static/js/graph.js index 89efeae..d2b8ae6 100644 --- a/static/js/graph.js +++ b/static/js/graph.js @@ -353,9 +353,6 @@ class GraphManager { }); } - /** - * @param {Object} graphData - Graph data from backend - */ updateGraph(graphData) { if (!graphData || !graphData.nodes || !graphData.edges) { console.warn('Invalid graph data received'); @@ -382,16 +379,18 @@ class GraphManager { const nodeMap = new Map(graphData.nodes.map(node => [node.id, node])); - // Filter out hidden nodes before processing for rendering - const filteredNodes = graphData.nodes.filter(node => - !(node.metadata && node.metadata.large_entity_id) - ); - + // FIXED: Process all nodes first, then apply hiding logic correctly const processedNodes = graphData.nodes.map(node => { const processed = this.processNode(node); + + // FIXED: Only hide if node is still a large entity member if (node.metadata && node.metadata.large_entity_id) { processed.hidden = true; + } else { + // FIXED: Ensure extracted nodes are visible + processed.hidden = false; } + return processed; }); @@ -401,6 +400,7 @@ class GraphManager { let fromId = edge.from; let toId = edge.to; + // FIXED: Only re-route if nodes are STILL in large entities if (fromNode && fromNode.metadata && fromNode.metadata.large_entity_id) { fromId = fromNode.metadata.large_entity_id; } @@ -423,6 +423,7 @@ class GraphManager { const newNodes = processedNodes.filter(node => !existingNodeIds.includes(node.id)); const newEdges = processedEdges.filter(edge => !existingEdgeIds.includes(edge.id)); + // FIXED: Update all nodes to ensure extracted nodes become visible this.nodes.update(processedNodes); this.edges.update(processedEdges); diff --git a/static/js/main.js b/static/js/main.js index 1d4b63a..0721d53 100644 --- a/static/js/main.js +++ b/static/js/main.js @@ -2023,6 +2023,16 @@ class DNSReconApp { async extractNode(largeEntityId, nodeId) { try { + console.log(`Extracting node ${nodeId} from large entity ${largeEntityId}`); + + // Show immediate feedback + const button = document.querySelector(`[data-node-id="${nodeId}"][data-large-entity-id="${largeEntityId}"]`); + if (button) { + const originalContent = button.innerHTML; + button.innerHTML = '[...]'; + button.disabled = true; + } + const response = await this.apiCall('/api/graph/large-entity/extract', 'POST', { large_entity_id: largeEntityId, node_id: nodeId, @@ -2031,41 +2041,46 @@ class DNSReconApp { if (response.success) { this.showSuccess(response.message); - // If the scanner was idle, it's now running. Start polling to see the new node appear. - if (this.scanStatus === 'idle') { - this.startPolling(1000); - } else { - // If already scanning, force a quick graph update to see the change sooner. 
-                    setTimeout(() => this.updateGraph(), 500);
-                }
-
-                // Immediately update the modal view
-                if (this.graphManager) {
-                    const largeEntityNode = this.graphManager.nodes.get(largeEntityId);
-                    if (largeEntityNode && largeEntityNode.attributes) {
-
-                        // Find and update the 'nodes' attribute
-                        const nodesAttribute = largeEntityNode.attributes.find(attr => attr.name === 'nodes');
-                        if (nodesAttribute && Array.isArray(nodesAttribute.value)) {
-                            nodesAttribute.value = nodesAttribute.value.filter(id => id !== nodeId);
+                // FIXED: Don't update local modal data - let backend be source of truth
+                // Force immediate graph update to get fresh backend data
+                console.log('Extraction successful, updating graph with fresh backend data');
+                await this.updateGraph();
+
+                // FIXED: Re-fetch graph data instead of manipulating local state
+                setTimeout(async () => {
+                    try {
+                        const graphResponse = await this.apiCall('/api/graph');
+                        if (graphResponse.success) {
+                            this.graphManager.updateGraph(graphResponse.graph);
+
+                            // Update modal with fresh data if still open
+                            if (this.elements.nodeModal && this.elements.nodeModal.style.display === 'block') {
+                                if (this.graphManager.nodes) {
+                                    const updatedLargeEntity = this.graphManager.nodes.get(largeEntityId);
+                                    if (updatedLargeEntity) {
+                                        this.showNodeModal(updatedLargeEntity);
+                                    }
+                                }
+                            }
                         }
-
-                    // Find and update the 'count' attribute
-                    const countAttribute = largeEntityNode.attributes.find(attr => attr.name === 'count');
-                    if (countAttribute) {
-                        countAttribute.value = (countAttribute.value || 0) - 1;
-                    }
-
-                    // Re-render the modal with the updated data
-                    this.showNodeModal(largeEntityNode);
+                    } catch (error) {
+                        console.error('Error refreshing graph after extraction:', error);
                     }
-                }
+                }, 100);
+
             } else {
                 throw new Error(response.error || 'Extraction failed on the server.');
             }
         } catch (error) {
             console.error('Failed to extract node:', error);
             this.showError(`Extraction failed: ${error.message}`);
+
+            // Restore button state on error
+            const button = document.querySelector(`[data-node-id="${nodeId}"][data-large-entity-id="${largeEntityId}"]`);
+            if (button) {
+                button.innerHTML = button.dataset.originalContent || '[+]';
+                button.disabled = false;
+            }
         }
     }
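---
Usage sketch (not part of the patch): a minimal client-side check of the new
endpoint and its status codes. The base URL, session handling, and node IDs
are assumptions for illustration; only the route, the request payload, and the
200/400/404/409 responses come from the patch above.

    import requests

    BASE = 'http://127.0.0.1:5000'  # assumed dev server address

    session = requests.Session()  # assumed to carry the scanner's session cookie

    resp = session.post(
        f'{BASE}/api/graph/large-entity/extract',
        json={
            'large_entity_id': 'large_entity_example',  # hypothetical IDs
            'node_id': 'www.example.com',
        },
    )
    body = resp.json()

    if resp.status_code == 200 and body.get('success'):
        print(f"extracted {body['extracted_node']} from {body['large_entity']}")
    elif resp.status_code == 409:
        print('node may have already been extracted')  # new conflict path
    else:
        print(f"extraction failed ({resp.status_code}): {body.get('error')}")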