work on large entity extraction

overcuriousity 2025-09-20 20:56:31 +02:00
parent 602739246f
commit b2c5d2331c
4 changed files with 241 additions and 61 deletions

app.py (59 changes)
View File

@@ -187,7 +187,9 @@ def get_graph_data():
 @app.route('/api/graph/large-entity/extract', methods=['POST'])
 def extract_from_large_entity():
-    """Extract a node from a large entity."""
+    """
+    FIXED: Extract a node from a large entity with proper error handling.
+    """
     try:
         data = request.get_json()
         large_entity_id = data.get('large_entity_id')
@@ -200,17 +202,66 @@ def extract_from_large_entity():
         if not scanner:
             return jsonify({'success': False, 'error': 'No active session found'}), 404
 
+        # FIXED: Check if node exists and provide better error messages
+        if not scanner.graph.graph.has_node(node_id):
+            return jsonify({
+                'success': False,
+                'error': f'Node {node_id} not found in graph'
+            }), 404
+
+        # FIXED: Check if node is actually part of the large entity
+        node_data = scanner.graph.graph.nodes[node_id]
+        metadata = node_data.get('metadata', {})
+        current_large_entity = metadata.get('large_entity_id')
+        if not current_large_entity:
+            return jsonify({
+                'success': False,
+                'error': f'Node {node_id} is not part of any large entity'
+            }), 400
+        if current_large_entity != large_entity_id:
+            return jsonify({
+                'success': False,
+                'error': f'Node {node_id} belongs to large entity {current_large_entity}, not {large_entity_id}'
+            }), 400
+
+        # FIXED: Check if large entity exists
+        if not scanner.graph.graph.has_node(large_entity_id):
+            return jsonify({
+                'success': False,
+                'error': f'Large entity {large_entity_id} not found'
+            }), 404
+
+        # Perform the extraction
         success = scanner.extract_node_from_large_entity(large_entity_id, node_id)
 
         if success:
+            # Force immediate session state update
             session_manager.update_session_scanner(user_session_id, scanner)
-            return jsonify({'success': True, 'message': f'Node {node_id} extracted successfully.'})
+            return jsonify({
+                'success': True,
+                'message': f'Node {node_id} extracted successfully from {large_entity_id}.',
+                'extracted_node': node_id,
+                'large_entity': large_entity_id
+            })
         else:
-            return jsonify({'success': False, 'error': f'Failed to extract node {node_id}.'}), 500
+            # This should not happen with the improved checks above, but handle it gracefully
+            return jsonify({
+                'success': False,
+                'error': f'Failed to extract node {node_id} from {large_entity_id}. Node may have already been extracted.'
+            }), 409
 
+    except json.JSONDecodeError:
+        return jsonify({'success': False, 'error': 'Invalid JSON in request body'}), 400
     except Exception as e:
         traceback.print_exc()
-        return jsonify({'success': False, 'error': f'Internal server error: {str(e)}'}), 500
+        return jsonify({
+            'success': False,
+            'error': f'Internal server error: {str(e)}',
+            'error_type': type(e).__name__
+        }), 500
 
 @app.route('/api/graph/node/<node_id>', methods=['DELETE'])
 def delete_graph_node(node_id):
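
The endpoint now distinguishes validation failures (400 and 404), extraction conflicts (409), and server errors (500), each with structured JSON. For illustration only (not part of this commit), a minimal Python client sketch of handling those codes; the base URL and IDs are invented:

import requests

BASE_URL = 'http://127.0.0.1:5000'  # assumed local dev server

# Hypothetical IDs for illustration; real values come from the graph data.
payload = {'large_entity_id': 'large_entity_example', 'node_id': 'sub.example.com'}

resp = requests.post(f'{BASE_URL}/api/graph/large-entity/extract', json=payload, timeout=10)
body = resp.json()

if resp.status_code == 200 and body.get('success'):
    print(f"extracted {body['extracted_node']} from {body['large_entity']}")
elif resp.status_code in (400, 404):
    # Validation failures: unknown node/entity or wrong membership.
    print(f"validation failed: {body.get('error')}")
elif resp.status_code == 409:
    # Conflict: the node may already have been extracted.
    print(f"conflict: {body.get('error')}")
else:
    print(f"server error: {body.get('error')} ({body.get('error_type')})")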

View File

@@ -860,7 +860,7 @@ class Scanner:
     def extract_node_from_large_entity(self, large_entity_id: str, node_id: str) -> bool:
         """
-        Removes a node from a large entity, allowing it to be processed normally.
+        FIXED: Extract a node from a large entity with proper backend updates and edge re-routing.
         """
         if not self.graph.graph.has_node(node_id):
             return False
@@ -868,31 +868,144 @@ class Scanner:
         node_data = self.graph.graph.nodes[node_id]
         metadata = node_data.get('metadata', {})
 
-        if metadata.get('large_entity_id') == large_entity_id:
-            # Remove the large entity tag
-            del metadata['large_entity_id']
-            self.graph.add_node(node_id, NodeType(node_data['type']), metadata=metadata)
-
-            # Re-enqueue the node for full processing
-            is_ip = _is_valid_ip(node_id)
-            eligible_providers = self._get_eligible_providers(node_id, is_ip, False)
-            for provider in eligible_providers:
-                provider_name = provider.get_name()
-                priority = self._get_priority(provider_name)
-                # Use current depth of the large entity if available, else 0
-                depth = 0
-                if self.graph.graph.has_node(large_entity_id):
-                    le_attrs = self.graph.graph.nodes[large_entity_id].get('attributes', [])
-                    depth_attr = next((a for a in le_attrs if a['name'] == 'discovery_depth'), None)
-                    if depth_attr:
-                        depth = depth_attr['value']
-                self.task_queue.put((time.time(), priority, (provider_name, node_id, depth)))
-                self.total_tasks_ever_enqueued += 1
-            return True
-        return False
+        if metadata.get('large_entity_id') != large_entity_id:
+            return False
+
+        # FIXED: Update the large entity's attributes to remove the extracted node
+        if self.graph.graph.has_node(large_entity_id):
+            le_node_data = self.graph.graph.nodes[large_entity_id]
+            le_attributes = le_node_data.get('attributes', [])
+
+            # Update the 'nodes' attribute to remove extracted node
+            nodes_attr = next((attr for attr in le_attributes if attr['name'] == 'nodes'), None)
+            if nodes_attr and isinstance(nodes_attr['value'], list):
+                if node_id in nodes_attr['value']:
+                    nodes_attr['value'].remove(node_id)
+
+            # Update the 'count' attribute
+            count_attr = next((attr for attr in le_attributes if attr['name'] == 'count'), None)
+            if count_attr and isinstance(count_attr['value'], (int, float)):
+                count_attr['value'] = max(0, count_attr['value'] - 1)
+
+            # Update the large entity node
+            self.graph.add_node(
+                large_entity_id,
+                NodeType.LARGE_ENTITY,
+                attributes=le_attributes,
+                description=le_node_data.get('description', ''),
+                metadata=le_node_data.get('metadata', {})
+            )
+
+        # Remove the large entity tag from extracted node
+        updated_metadata = metadata.copy()
+        del updated_metadata['large_entity_id']
+
+        # Add extraction history for forensic integrity
+        extraction_record = {
+            'extracted_at': datetime.now(timezone.utc).isoformat(),
+            'extracted_from': large_entity_id,
+            'extraction_method': 'manual'
+        }
+        if 'extraction_history' not in updated_metadata:
+            updated_metadata['extraction_history'] = []
+        updated_metadata['extraction_history'].append(extraction_record)
+
+        # Update the extracted node
+        self.graph.add_node(node_id, NodeType(node_data['type']), metadata=updated_metadata)
+
+        # FIXED: Re-route edges that were pointing to the large entity
+        self._reroute_large_entity_edges(large_entity_id, node_id)
+
+        # Re-enqueue the node for full processing
+        is_ip = _is_valid_ip(node_id)
+        eligible_providers = self._get_eligible_providers(node_id, is_ip, False)
+        for provider in eligible_providers:
+            provider_name = provider.get_name()
+            priority = self._get_priority(provider_name)
+            # Use current depth of the large entity if available, else 0
+            depth = 0
+            if self.graph.graph.has_node(large_entity_id):
+                le_attrs = self.graph.graph.nodes[large_entity_id].get('attributes', [])
+                depth_attr = next((a for a in le_attrs if a['name'] == 'discovery_depth'), None)
+                if depth_attr:
+                    depth = depth_attr['value']
+            self.task_queue.put((time.time(), priority, (provider_name, node_id, depth)))
+            self.total_tasks_ever_enqueued += 1
+
+        # Force session state update for immediate frontend sync
+        self._update_session_state()
+
+        return True
+
+    def _reroute_large_entity_edges(self, large_entity_id: str, extracted_node_id: str) -> None:
+        """
+        FIXED: Re-route edges from large entity to extracted node where appropriate.
+        """
+        if not self.graph.graph.has_node(large_entity_id) or not self.graph.graph.has_node(extracted_node_id):
+            return
+
+        edges_to_reroute = []
+
+        # Find edges pointing TO the large entity that should point to the extracted node
+        for source, target, edge_data in self.graph.graph.in_edges(large_entity_id, data=True):
+            # Check if this edge was originally meant for the extracted node
+            raw_data = edge_data.get('raw_data', {})
+            # If the raw data suggests this edge was for the extracted node, re-route it
+            if (raw_data.get('original_target') == extracted_node_id or
+                    self._should_reroute_edge(edge_data, extracted_node_id)):
+                edges_to_reroute.append(('in', source, target, edge_data))
+
+        # Find edges pointing FROM the large entity that should point from the extracted node
+        for source, target, edge_data in self.graph.graph.out_edges(large_entity_id, data=True):
+            raw_data = edge_data.get('raw_data', {})
+            if (raw_data.get('original_source') == extracted_node_id or
+                    self._should_reroute_edge(edge_data, extracted_node_id)):
+                edges_to_reroute.append(('out', source, target, edge_data))
+
+        # Re-route the edges
+        for direction, source, target, edge_data in edges_to_reroute:
+            # Remove old edge
+            self.graph.graph.remove_edge(source, target)
+
+            # Add new edge with extracted node
+            if direction == 'in':
+                new_target = extracted_node_id
+                new_source = source
+            else:  # direction == 'out'
+                new_source = extracted_node_id
+                new_target = target
+
+            # Add the re-routed edge
+            self.graph.add_edge(
+                source_id=new_source,
+                target_id=new_target,
+                relationship_type=edge_data.get('relationship_type', 'unknown'),
+                confidence_score=edge_data.get('confidence_score', 0.5),
+                source_provider=edge_data.get('source_provider', 'rerouted'),
+                raw_data=dict(edge_data.get('raw_data', {}), **{'rerouted_from_large_entity': large_entity_id})
+            )
+
+    def _should_reroute_edge(self, edge_data: dict, extracted_node_id: str) -> bool:
+        """
+        Determine if an edge should be re-routed to an extracted node.
+        This is a heuristic-based approach since we don't store original targets.
+        """
+        relationship_type = edge_data.get('relationship_type', '')
+
+        # For now, re-route DNS and certificate-based relationships
+        # These are likely to be node-specific rather than entity-wide
+        reroutable_types = [
+            'dns_a_record', 'dns_aaaa_record', 'dns_cname_record',
+            'dns_mx_record', 'dns_ptr_record',
+            'crtsh_san_certificate', 'crtsh_cert_issuer'
+        ]
+
+        return any(rtype in relationship_type for rtype in reroutable_types)
 
     def _process_provider_result_unified(self, target: str, provider: BaseProvider,
                                          provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
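
Because _should_reroute_edge looks only at the relationship type (its extracted_node_id parameter is currently unused), the heuristic can be checked in isolation. A standalone sketch, not from this commit, with invented sample edges:

def should_reroute_edge(edge_data: dict) -> bool:
    # Mirrors Scanner._should_reroute_edge: substring match on relationship type.
    relationship_type = edge_data.get('relationship_type', '')
    reroutable_types = [
        'dns_a_record', 'dns_aaaa_record', 'dns_cname_record',
        'dns_mx_record', 'dns_ptr_record',
        'crtsh_san_certificate', 'crtsh_cert_issuer',
    ]
    return any(rtype in relationship_type for rtype in reroutable_types)

# Node-specific DNS/certificate edges are re-routed; anything else stays on the entity.
assert should_reroute_edge({'relationship_type': 'dns_a_record'})
assert should_reroute_edge({'relationship_type': 'crtsh_san_certificate'})
assert not should_reroute_edge({'relationship_type': 'whois_registrar'})  # invented type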

View File

@@ -353,9 +353,6 @@ class GraphManager {
         });
     }
 
-    /**
-     * @param {Object} graphData - Graph data from backend
-     */
    updateGraph(graphData) {
         if (!graphData || !graphData.nodes || !graphData.edges) {
             console.warn('Invalid graph data received');
@@ -382,16 +379,18 @@ class GraphManager {
         const nodeMap = new Map(graphData.nodes.map(node => [node.id, node]));
 
-        // Filter out hidden nodes before processing for rendering
-        const filteredNodes = graphData.nodes.filter(node =>
-            !(node.metadata && node.metadata.large_entity_id)
-        );
-
+        // FIXED: Process all nodes first, then apply hiding logic correctly
         const processedNodes = graphData.nodes.map(node => {
             const processed = this.processNode(node);
+            // FIXED: Only hide if node is still a large entity member
             if (node.metadata && node.metadata.large_entity_id) {
                 processed.hidden = true;
+            } else {
+                // FIXED: Ensure extracted nodes are visible
+                processed.hidden = false;
             }
             return processed;
         });
@@ -401,6 +400,7 @@ class GraphManager {
         let fromId = edge.from;
         let toId = edge.to;
 
+        // FIXED: Only re-route if nodes are STILL in large entities
         if (fromNode && fromNode.metadata && fromNode.metadata.large_entity_id) {
             fromId = fromNode.metadata.large_entity_id;
         }
@@ -423,6 +423,7 @@ class GraphManager {
         const newNodes = processedNodes.filter(node => !existingNodeIds.includes(node.id));
         const newEdges = processedEdges.filter(edge => !existingEdgeIds.includes(edge.id));
 
+        // FIXED: Update all nodes to ensure extracted nodes become visible
         this.nodes.update(processedNodes);
         this.edges.update(processedEdges);

View File

@@ -2023,6 +2023,16 @@ class DNSReconApp {
     async extractNode(largeEntityId, nodeId) {
         try {
+            console.log(`Extracting node ${nodeId} from large entity ${largeEntityId}`);
+
+            // Show immediate feedback
+            const button = document.querySelector(`[data-node-id="${nodeId}"][data-large-entity-id="${largeEntityId}"]`);
+            if (button) {
+                const originalContent = button.innerHTML;
+                button.innerHTML = '[...]';
+                button.disabled = true;
+            }
+
             const response = await this.apiCall('/api/graph/large-entity/extract', 'POST', {
                 large_entity_id: largeEntityId,
                 node_id: nodeId,
@@ -2031,41 +2041,46 @@ class DNSReconApp {
             if (response.success) {
                 this.showSuccess(response.message);
 
-                // If the scanner was idle, it's now running. Start polling to see the new node appear.
-                if (this.scanStatus === 'idle') {
-                    this.startPolling(1000);
-                } else {
-                    // If already scanning, force a quick graph update to see the change sooner.
-                    setTimeout(() => this.updateGraph(), 500);
-                }
-
-                // Immediately update the modal view
-                if (this.graphManager) {
-                    const largeEntityNode = this.graphManager.nodes.get(largeEntityId);
-                    if (largeEntityNode && largeEntityNode.attributes) {
-
-                        // Find and update the 'nodes' attribute
-                        const nodesAttribute = largeEntityNode.attributes.find(attr => attr.name === 'nodes');
-                        if (nodesAttribute && Array.isArray(nodesAttribute.value)) {
-                            nodesAttribute.value = nodesAttribute.value.filter(id => id !== nodeId);
-                        }
-
-                        // Find and update the 'count' attribute
-                        const countAttribute = largeEntityNode.attributes.find(attr => attr.name === 'count');
-                        if (countAttribute) {
-                            countAttribute.value = (countAttribute.value || 0) - 1;
-                        }
-
-                        // Re-render the modal with the updated data
-                        this.showNodeModal(largeEntityNode);
-                    }
-                }
+                // FIXED: Don't update local modal data - let backend be source of truth
+                // Force immediate graph update to get fresh backend data
+                console.log('Extraction successful, updating graph with fresh backend data');
+                await this.updateGraph();
+
+                // FIXED: Re-fetch graph data instead of manipulating local state
+                setTimeout(async () => {
+                    try {
+                        const graphResponse = await this.apiCall('/api/graph');
+                        if (graphResponse.success) {
+                            this.graphManager.updateGraph(graphResponse.graph);
+
+                            // Update modal with fresh data if still open
+                            if (this.elements.nodeModal && this.elements.nodeModal.style.display === 'block') {
+                                if (this.graphManager.nodes) {
+                                    const updatedLargeEntity = this.graphManager.nodes.get(largeEntityId);
+                                    if (updatedLargeEntity) {
+                                        this.showNodeModal(updatedLargeEntity);
+                                    }
+                                }
+                            }
+                        }
+                    } catch (error) {
+                        console.error('Error refreshing graph after extraction:', error);
+                    }
+                }, 100);
             } else {
                 throw new Error(response.error || 'Extraction failed on the server.');
             }
         } catch (error) {
             console.error('Failed to extract node:', error);
             this.showError(`Extraction failed: ${error.message}`);
+
+            // Restore button state on error
+            const button = document.querySelector(`[data-node-id="${nodeId}"][data-large-entity-id="${largeEntityId}"]`);
+            if (button) {
+                button.innerHTML = '[+]';
+                button.disabled = false;
+            }
         }
     }
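
End to end, the intended flow is: POST the extraction, let the backend mutate the graph and re-enqueue the node, then re-fetch /api/graph rather than patching local state. A hedged Python sketch of that sequence, not from this commit; the base URL, the GET method, and the {'success': ..., 'graph': ...} response shape are assumptions drawn from the JS above:

import time
import requests

BASE_URL = 'http://127.0.0.1:5000'  # assumed local dev server

def extract_and_refresh(large_entity_id: str, node_id: str) -> dict:
    # Step 1: ask the backend to extract the node; it is the source of truth.
    resp = requests.post(
        f'{BASE_URL}/api/graph/large-entity/extract',
        json={'large_entity_id': large_entity_id, 'node_id': node_id},
        timeout=10,
    )
    resp.raise_for_status()

    # Step 2: brief pause, mirroring the frontend's 100 ms setTimeout.
    time.sleep(0.1)

    # Step 3: re-fetch the whole graph instead of patching local state.
    graph_resp = requests.get(f'{BASE_URL}/api/graph', timeout=10).json()
    return graph_resp.get('graph', {})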