From 2c483164779a9686ed667d255f53e9ff66fca64e Mon Sep 17 00:00:00 2001
From: overcuriousity
Date: Tue, 16 Sep 2025 00:01:24 +0200
Subject: [PATCH] extract node from large entity feature

---
 app.py                      | 27 ++++++++++++++
 core/graph_manager.py       | 24 +++++++++++++
 core/scanner.py             | 56 +++++++++++++++++++++++++++--
 providers/base_provider.py  |  2 --
 providers/crtsh_provider.py | 14 ++++++--
 static/js/graph.js          |  4 +--
 static/js/main.js           | 70 +++++++++++++++++++++++++++++++++----
 7 files changed, 182 insertions(+), 15 deletions(-)

diff --git a/app.py b/app.py
index aff5ee5..b2233ff 100644
--- a/app.py
+++ b/app.py
@@ -282,6 +282,33 @@ def get_graph_data():
         }
     }), 500
 
+@app.route('/api/graph/large-entity/extract', methods=['POST'])
+def extract_from_large_entity():
+    """Extract a node from a large entity, making it a standalone node."""
+    try:
+        data = request.get_json()
+        large_entity_id = data.get('large_entity_id')
+        node_id = data.get('node_id')
+
+        if not large_entity_id or not node_id:
+            return jsonify({'success': False, 'error': 'Missing required parameters'}), 400
+
+        user_session_id, scanner = get_user_scanner()
+        if not scanner:
+            return jsonify({'success': False, 'error': 'No active session found'}), 404
+
+        success = scanner.extract_node_from_large_entity(large_entity_id, node_id)
+
+        if success:
+            session_manager.update_session_scanner(user_session_id, scanner)
+            return jsonify({'success': True, 'message': f'Node {node_id} extracted successfully.'})
+        else:
+            return jsonify({'success': False, 'error': f'Failed to extract node {node_id}.'}), 500
+
+    except Exception as e:
+        print(f"ERROR: Exception in extract_from_large_entity endpoint: {e}")
+        traceback.print_exc()
+        return jsonify({'success': False, 'error': f'Internal server error: {str(e)}'}), 500
 
 @app.route('/api/graph/node/<node_id>', methods=['DELETE'])
 def delete_graph_node(node_id):
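
Note: a minimal way to exercise the new endpoint once the patch is applied. This is a sketch, not part of the change: the host, port, and both IDs are assumptions (a default Flask dev server and a hypothetical large-entity container), and the request must carry the session cookie of the scan that created the container.

    # Hedged example: call the new extract endpoint on a local dev server.
    import requests

    session = requests.Session()  # one Session object so the Flask session cookie persists
    resp = session.post(
        "http://127.0.0.1:5000/api/graph/large-entity/extract",
        json={
            "large_entity_id": "large_entity_crtsh_1",  # hypothetical container id
            "node_id": "sub.example.com",               # hypothetical member node
        },
    )
    print(resp.status_code, resp.json())  # expect {'success': True, ...} on success
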
+ print(f"Warning: Node {node_id_to_extract} not found in the 'nodes' list of {large_entity_id}.") + return True # Proceed as if successful + + self.last_modified = datetime.now(timezone.utc).isoformat() + return True def remove_node(self, node_id: str) -> bool: """Remove a node and its connected edges from the graph.""" diff --git a/core/scanner.py b/core/scanner.py index fd7e4e6..b59059e 100644 --- a/core/scanner.py +++ b/core/scanner.py @@ -162,12 +162,15 @@ class Scanner: self.stop_event = threading.Event() self.scan_thread = None self.executor = None - self.processing_lock = threading.Lock() # **NEW**: Recreate processing lock + self.processing_lock = threading.Lock() self.task_queue = PriorityQueue() self.rate_limiter = GlobalRateLimiter(redis.StrictRedis(db=0)) self.logger = get_forensic_logger() + + # This ensures the scanner has access to providers for actions like node extraction. + print("Re-initializing providers after loading session...") + self._initialize_providers() - # **NEW**: Reset processing tracking if not hasattr(self, 'currently_processing'): self.currently_processing = set() @@ -792,6 +795,7 @@ class Scanner: elif _is_valid_ip(targets[0]): node_type = 'ip' + # We still create the nodes so they exist in the graph, they are just not processed for edges yet. for target in targets: self.graph.add_node(target, NodeType.DOMAIN if node_type == 'domain' else NodeType.IP) @@ -802,6 +806,7 @@ class Scanner: 'source_provider': provider_name, 'discovery_depth': current_depth, 'threshold_exceeded': self.config.large_entity_threshold, + # <<< FIX: Removed 'raw_results'. It's inefficient and unnecessary. } description = f'Large entity created due to {len(targets)} results from {provider_name}' @@ -816,6 +821,53 @@ class Scanner: print(f"Created large entity {entity_id} for {len(targets)} {node_type}s from {provider_name}") return set(targets) + + def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool: + """ + Extracts a node from a large entity by re-adding it to the main processing queue. + This is a much cleaner approach than storing and replaying raw results. + """ + if not self.graph.graph.has_node(large_entity_id): + print(f"ERROR: Large entity {large_entity_id} not found.") + return False + + # 1. Modify the graph data structure first + # This removes the node from the container's internal list. + success = self.graph.extract_node_from_large_entity(large_entity_id, node_id_to_extract) + if not success: + print(f"ERROR: Node {node_id_to_extract} could not be removed from {large_entity_id}'s attributes.") + return False + + # 2. Re-queue the extracted node for full processing by all eligible providers. + # This is the same logic used for any newly discovered node. + print(f"Re-queueing extracted node {node_id_to_extract} for full reconnaissance...") + is_ip = _is_valid_ip(node_id_to_extract) + current_depth = self.graph.graph.nodes[large_entity_id].get('attributes', {}).get('discovery_depth', 0) + + eligible_providers = self._get_eligible_providers(node_id_to_extract, is_ip, False) + for provider in eligible_providers: + provider_name = provider.get_name() + # Add the task to the main queue with the correct depth. + self.task_queue.put((self._get_priority(provider_name), (provider_name, node_id_to_extract, current_depth))) + self.total_tasks_ever_enqueued += 1 + + # 3. If the scanner is not running, we need to kickstart it to process this one item. + if self.status != ScanStatus.RUNNING: + print("Scanner is idle. 
diff --git a/providers/base_provider.py b/providers/base_provider.py
index b0ce905..7941fb6 100644
--- a/providers/base_provider.py
+++ b/providers/base_provider.py
@@ -3,7 +3,6 @@
 import time
 import requests
 import threading
-import redis
 from abc import ABC, abstractmethod
 from typing import List, Dict, Any, Optional, Tuple
@@ -36,7 +35,6 @@ class BaseProvider(ABC):
             # Fallback to global config for backwards compatibility
             from config import config as global_config
             self.config = global_config
-            actual_rate_limit = rate_limit
             actual_timeout = timeout
 
         self.name = name
diff --git a/providers/crtsh_provider.py b/providers/crtsh_provider.py
index f70b09a..7b061a2 100644
--- a/providers/crtsh_provider.py
+++ b/providers/crtsh_provider.py
@@ -514,12 +514,20 @@ class CrtShProvider(BaseProvider):
         shared = []
 
         # Create a set of certificate IDs from the first list for quick lookup
-        cert1_ids = {cert.get('certificate_id') for cert in certs1 if cert.get('certificate_id')}
+        # <<< FIX: Added robust type checking to handle potentially malformed API data
+        cert1_ids = set()
+        for cert in certs1:
+            cert_id = cert.get('certificate_id')
+            # Ensure the ID is not None and is a hashable type before adding to the set
+            if cert_id and isinstance(cert_id, (int, str, float, bool, tuple)):
+                cert1_ids.add(cert_id)
 
         # Find certificates in the second list that match
         for cert in certs2:
-            if cert.get('certificate_id') in cert1_ids:
-                shared.append(cert)
+            cert_id = cert.get('certificate_id')
+            if cert_id and isinstance(cert_id, (int, str, float, bool, tuple)):
+                if cert_id in cert1_ids:
+                    shared.append(cert)
 
         return shared
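
Note: the crt.sh hardening above can be checked in isolation. A self-contained sketch with fabricated records: entries whose certificate_id is missing, None, or unhashable are skipped instead of raising TypeError when building or probing the set.

    HASHABLE = (int, str, float, bool, tuple)

    def find_shared(certs1, certs2):
        ids1 = set()
        for cert in certs1:
            cert_id = cert.get('certificate_id')
            if cert_id and isinstance(cert_id, HASHABLE):  # skip None/unhashable ids
                ids1.add(cert_id)
        shared = []
        for cert in certs2:
            cert_id = cert.get('certificate_id')
            if cert_id and isinstance(cert_id, HASHABLE) and cert_id in ids1:
                shared.append(cert)
        return shared

    certs_a = [{'certificate_id': 101}, {'certificate_id': ['bad']}, {}]  # fabricated data
    certs_b = [{'certificate_id': 101}, {'certificate_id': 202}]
    print(find_shared(certs_a, certs_b))  # -> [{'certificate_id': 101}]
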
diff --git a/static/js/graph.js b/static/js/graph.js
index 9a52f1e..07e4609 100644
--- a/static/js/graph.js
+++ b/static/js/graph.js
@@ -389,8 +389,8 @@ class GraphManager {
         });
 
         const filteredNodes = graphData.nodes.filter(node => {
-            // Only include nodes that are NOT members of large entities
-            return !this.largeEntityMembers.has(node.id);
+            // Only include nodes that are NOT members of large entities, but always include the container itself
+            return !this.largeEntityMembers.has(node.id) || node.type === 'large_entity';
         });
 
         console.log(`Filtered ${graphData.nodes.length - filteredNodes.length} large entity member nodes from visualization`);
diff --git a/static/js/main.js b/static/js/main.js
index 3a2ecdd..547b59d 100644
--- a/static/js/main.js
+++ b/static/js/main.js
@@ -189,7 +189,7 @@ class DNSReconApp {
             this.elements.resetApiKeys.addEventListener('click', () => this.resetApiKeys());
         }
 
-        // ** FIX: Listen for the custom event from the graph **
+        // Listen for the custom event from the graph
         document.addEventListener('nodeSelected', (e) => {
             this.showNodeModal(e.detail.node);
         });
@@ -1092,8 +1092,6 @@ class DNSReconApp {
         return html;
     }
 
-
-
     generateLargeEntityDetails(node) {
         const attributes = node.attributes || {};
         const nodes = attributes.nodes || [];
@@ -1123,16 +1121,23 @@ class DNSReconApp {
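
Note: a Python rendering of the graph.js filter change above, to make the intent explicit: members of a large entity are hidden, but the container node itself must survive the filter even if its id ends up in the member set. All ids and types here are hypothetical.

    members = {"large_entity_crtsh_1", "a.example.com"}  # hypothetical member ids

    def keep(node):
        # Mirror of: !largeEntityMembers.has(node.id) || node.type === 'large_entity'
        return node["id"] not in members or node["type"] == "large_entity"

    nodes = [
        {"id": "large_entity_crtsh_1", "type": "large_entity"},
        {"id": "a.example.com", "type": "domain"},
        {"id": "mail.example.com", "type": "domain"},
    ]
    print([n["id"] for n in nodes if keep(n)])
    # -> ['large_entity_crtsh_1', 'mail.example.com']
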