From 332805709d4ddcc1f551272b34f5a4a0265ab175 Mon Sep 17 00:00:00 2001 From: overcuriousity Date: Thu, 18 Sep 2025 23:44:24 +0200 Subject: [PATCH 1/5] remove --- app.py | 13 +-- core/graph_manager.py | 30 ------- core/scanner.py | 194 ++---------------------------------------- static/js/graph.js | 27 +----- 4 files changed, 14 insertions(+), 250 deletions(-) diff --git a/app.py b/app.py index 1dd1693..701f8e8 100644 --- a/app.py +++ b/app.py @@ -200,13 +200,14 @@ def extract_from_large_entity(): if not scanner: return jsonify({'success': False, 'error': 'No active session found'}), 404 - success = scanner.extract_node_from_large_entity(large_entity_id, node_id) + # TODO implement + #success = scanner.extract_node_from_large_entity(large_entity_id, node_id) - if success: - session_manager.update_session_scanner(user_session_id, scanner) - return jsonify({'success': True, 'message': f'Node {node_id} extracted successfully.'}) - else: - return jsonify({'success': False, 'error': f'Failed to extract node {node_id}.'}), 500 + #if success: + # session_manager.update_session_scanner(user_session_id, scanner) + # return jsonify({'success': True, 'message': f'Node {node_id} extracted successfully.'}) + #else: + # return jsonify({'success': False, 'error': f'Failed to extract node {node_id}.'}), 500 except Exception as e: traceback.print_exc() diff --git a/core/graph_manager.py b/core/graph_manager.py index 74fb46c..4abe919 100644 --- a/core/graph_manager.py +++ b/core/graph_manager.py @@ -114,36 +114,6 @@ class GraphManager: self.last_modified = datetime.now(timezone.utc).isoformat() return True - def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool: - """ - Removes a node from a large entity's internal lists and updates its count. - This prepares the large entity for the node's promotion to a regular node. - """ - if not self.graph.has_node(large_entity_id): - return False - - node_data = self.graph.nodes[large_entity_id] - attributes = node_data.get('attributes', []) - - # Find the 'nodes' attribute dictionary in the list - nodes_attr = next((attr for attr in attributes if attr.get('name') == 'nodes'), None) - - # Remove from the list of member nodes - if nodes_attr and 'value' in nodes_attr and isinstance(nodes_attr['value'], list) and node_id_to_extract in nodes_attr['value']: - nodes_attr['value'].remove(node_id_to_extract) - - # Find the 'count' attribute and update it - count_attr = next((attr for attr in attributes if attr.get('name') == 'count'), None) - if count_attr: - count_attr['value'] = len(nodes_attr['value']) - else: - # This can happen if the node was already extracted, which is not an error. 
- print(f"Warning: Node {node_id_to_extract} not found in the 'nodes' list of {large_entity_id}.") - return True # Proceed as if successful - - self.last_modified = datetime.now(timezone.utc).isoformat() - return True - def remove_node(self, node_id: str) -> bool: """Remove a node and its connected edges from the graph.""" if not self.graph.has_node(node_id): diff --git a/core/scanner.py b/core/scanner.py index bd6c8e9..43c9d9d 100644 --- a/core/scanner.py +++ b/core/scanner.py @@ -785,10 +785,7 @@ class Scanner: discovered, is_large_entity = self._process_provider_result_unified( target, provider, provider_result, depth ) - if is_large_entity: - large_entity_members.update(discovered) - else: - new_targets.update(discovered) + new_targets.update(discovered) except Exception as e: provider_successful = False @@ -833,7 +830,7 @@ class Scanner: """ provider_name = provider.get_name() discovered_targets = set() - large_entity_members = set() + #large_entity_members = set() if self._is_stop_requested(): return discovered_targets, False @@ -845,11 +842,11 @@ class Scanner: is_large_entity = eligible_relationship_count > self.config.large_entity_threshold - if is_large_entity: + #if is_large_entity: # Create the large entity node and get the set of its members - large_entity_members = self._create_large_entity_from_provider_result( - target, provider_name, provider_result, current_depth - ) + #large_entity_members = self._create_large_entity_from_result( + # target, provider_name, provider_result, current_depth + #) # Process ALL relationships to build the complete underlying data model for i, relationship in enumerate(provider_result.relationships): @@ -909,64 +906,6 @@ class Scanner: return discovered_targets, is_large_entity - def _create_large_entity_from_provider_result(self, source: str, provider_name: str, - provider_result: ProviderResult, current_depth: int) -> Set[str]: - """ - Create a large entity node and connect it to the source and any shared - non-member nodes like CAs or ISPs. 
- """ - entity_id = f"large_entity_{provider_name}_{hash(source) & 0x7FFFFFFF}" - - members = { - rel.target_node for rel in provider_result.relationships - if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node) - } - - if not members: - return set() - - first_member = next(iter(members)) - node_type = 'ip' if _is_valid_ip(first_member) else 'domain' - - attributes_dict = { - 'count': len(members), - 'nodes': list(members), - 'node_type': node_type, - 'source_provider': provider_name, - 'discovery_depth': current_depth, - 'threshold_exceeded': self.config.large_entity_threshold, - } - attributes_list = [ - { - "name": key, "value": value, "type": "large_entity_info", - "provider": provider_name, "confidence": 0.9, "metadata": {} - } for key, value in attributes_dict.items() - ] - - description = f'Large entity created due to {len(members)} relationships from {provider_name}' - - self.graph.add_node(entity_id, NodeType.LARGE_ENTITY, attributes=attributes_list, description=description) - - # Add a representative edge from the source to the large entity - if provider_result.relationships: - rep_rel = provider_result.relationships[0] - self.graph.add_edge(source, entity_id, rep_rel.relationship_type, 0.9, provider_name, - {'large_entity_info': f'Contains {len(members)} {node_type}s'}) - - # Create edges from the large entity to shared non-member nodes (e.g., CAs, ISPs) - processed_targets = set() - for rel in provider_result.relationships: - if rel.source_node in members and rel.target_node not in members: - if rel.target_node not in processed_targets: - self.graph.add_edge( - entity_id, rel.target_node, rel.relationship_type, rel.confidence, - rel.provider, rel.raw_data - ) - processed_targets.add(rel.target_node) - - self.logger.logger.warning(f"Large entity created: {entity_id} contains {len(members)} targets from {provider_name}") - - return members def stop_scan(self) -> bool: """Request immediate scan termination with proper cleanup.""" @@ -995,127 +934,6 @@ class Scanner: traceback.print_exc() return False - def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool: - """ - Extracts a node from a large entity, restores ALL of its original connections, - and re-queues it for scanning. - """ - if not self.graph.graph.has_node(large_entity_id): - return False - - # Extract the node from the large entity's internal list - success = self.graph.extract_node_from_large_entity(large_entity_id, node_id_to_extract) - if not success: - return False - - # Restore all incoming and outgoing edges for the extracted node - # These edges already exist in the graph data model; this ensures they are "activated" - # for the frontend. 
- for u, v, data in self.graph.graph.in_edges(node_id_to_extract, data=True): - self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'), - data.get('source_provider'), data.get('raw_data')) - - for u, v, data in self.graph.graph.out_edges(node_id_to_extract, data=True): - self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'), - data.get('source_provider'), data.get('raw_data')) - - # Re-queue the extracted node for further scanning if it is a domain or IP - is_ip = _is_valid_ip(node_id_to_extract) - is_domain = _is_valid_domain(node_id_to_extract) - - if is_domain or is_ip: - large_entity_attributes = self.graph.graph.nodes[large_entity_id].get('attributes', []) - discovery_depth_attr = next((attr for attr in large_entity_attributes if attr.get('name') == 'discovery_depth'), None) - current_depth = discovery_depth_attr['value'] if discovery_depth_attr else 0 - - eligible_providers = self._get_eligible_providers(node_id_to_extract, is_ip, False) - for provider in eligible_providers: - # Exclude DNS and correlation providers from re-processing - if provider.get_name() not in ['dns', 'correlation']: - provider_name = provider.get_name() - priority = self._get_priority(provider_name) - self.task_queue.put((time.time(), priority, (provider_name, node_id_to_extract, current_depth))) - self.total_tasks_ever_enqueued += 1 - - if self.status != ScanStatus.RUNNING: - self.status = ScanStatus.RUNNING - self._update_session_state() - - if not self.scan_thread or not self.scan_thread.is_alive(): - self.scan_thread = threading.Thread( - target=self._execute_scan, - args=(self.current_target, self.max_depth), - daemon=True - ) - self.scan_thread.start() - else: - self.logger.logger.info(f"Extracted non-scannable node {node_id_to_extract} of type {self.graph.graph.nodes[node_id_to_extract].get('type', 'unknown')}") - - return True - - def _determine_extracted_node_type(self, node_id: str, large_entity_id: str) -> NodeType: - """ - FIXED: Determine the correct node type for a node being extracted from a large entity. - Uses multiple strategies to ensure accurate type detection. 
-        """
-        from utils.helpers import _is_valid_ip, _is_valid_domain
-
-        # Strategy 1: Check if node already exists in graph with a type
-        if self.graph.has_node(node_id):
-            existing_type = self.graph.nodes[node_id].get('type')
-            if existing_type:
-                try:
-                    return NodeType(existing_type)
-                except ValueError:
-                    pass
-
-        # Strategy 2: Look for existing relationships to this node to infer type
-        for source, target, edge_data in self.graph.edges(data=True):
-            if target == node_id:
-                rel_type = edge_data.get('relationship_type', '')
-                provider = edge_data.get('source_provider', '')
-
-                # CA nodes from certificate issuer relationships
-                if provider == 'crtsh' and rel_type == 'crtsh_cert_issuer':
-                    return NodeType.CA
-
-                # ISP nodes from Shodan
-                if provider == 'shodan' and rel_type == 'shodan_isp':
-                    return NodeType.ISP
-
-                # Correlation objects
-                if rel_type.startswith('corr_'):
-                    return NodeType.CORRELATION_OBJECT
-
-            if source == node_id:
-                rel_type = edge_data.get('relationship_type', '')
-                provider = edge_data.get('source_provider', '')
-
-                # Source nodes in cert issuer relationships are CAs
-                if provider == 'crtsh' and rel_type == 'crtsh_cert_issuer':
-                    return NodeType.CA
-
-        # Strategy 3: Format-based detection (fallback)
-        if _is_valid_ip(node_id):
-            return NodeType.IP
-        elif _is_valid_domain(node_id):
-            return NodeType.DOMAIN
-
-        # Strategy 4: Check large entity context
-        if self.graph.has_node(large_entity_id):
-            large_entity_data = self.graph.nodes[large_entity_id]
-            attributes = large_entity_data.get('attributes', [])
-
-            node_type_attr = next((attr for attr in attributes if attr.get('name') == 'node_type'), None)
-            if node_type_attr:
-                entity_node_type = node_type_attr.get('value', 'domain')
-                if entity_node_type == 'ip':
-                    return NodeType.IP
-                else:
-                    return NodeType.DOMAIN
-
-        # Final fallback
-        return NodeType.DOMAIN
 
     def _update_session_state(self) -> None:
         """
         Update the scanner state in Redis for GUI updates.
diff --git a/static/js/graph.js b/static/js/graph.js
index 17a411a..7087eb9 100644
--- a/static/js/graph.js
+++ b/static/js/graph.js
@@ -383,27 +383,6 @@ class GraphManager {
             }
         }
 
-        this.largeEntityMembers.clear();
-        const largeEntityMap = new Map();
-
-        graphData.nodes.forEach(node => {
-            if (node.type === 'large_entity' && node.attributes) {
-                const nodesAttribute = this.findAttributeByName(node.attributes, 'nodes');
-                if (nodesAttribute && Array.isArray(nodesAttribute.value)) {
-                    nodesAttribute.value.forEach(nodeId => {
-                        largeEntityMap.set(nodeId, node.id);
-                        this.largeEntityMembers.add(nodeId);
-                    });
-                }
-            }
-        });
-
-        const filteredNodes = graphData.nodes.filter(node => {
-            return !this.largeEntityMembers.has(node.id) || node.type === 'large_entity';
-        });
-
-        console.log(`Filtered ${graphData.nodes.length - filteredNodes.length} large entity member nodes from visualization`);
-
         // Process nodes with proper certificate coloring
         const processedNodes = filteredNodes.map(node => {
             const processed = this.processNode(node);
@@ -427,8 +406,6 @@ class GraphManager {
         const mergedEdges = {};
 
         graphData.edges.forEach(edge => {
-            const fromNode = largeEntityMap.has(edge.from) ? largeEntityMap.get(edge.from) : edge.from;
-            const toNode = largeEntityMap.has(edge.to) ? largeEntityMap.get(edge.to) : edge.to;
             const mergeKey = `${fromNode}-${toNode}-${edge.label}`;
 
             if (!mergedEdges[mergeKey]) {
@@ -477,7 +454,6 @@ class GraphManager {
         }
 
         console.log(`Graph updated: ${processedNodes.length} nodes, ${processedEdges.length} edges (${newNodes.length} new nodes, ${newEdges.length} new edges)`);
-        console.log(`Large entity members hidden: ${this.largeEntityMembers.size}`);
 
     } catch (error) {
         console.error('Failed to update graph:', error);
@@ -1053,7 +1029,7 @@ class GraphManager {
         this.nodes.clear();
         this.edges.clear();
         this.history = [];
-        this.largeEntityMembers.clear(); // Clear large entity tracking
+        this.largeEntityMembers.clear();
         this.initialTargetIds.clear();
 
         // Show placeholder
@@ -1211,7 +1187,6 @@ class GraphManager {
         const basicStats = {
             nodeCount: this.nodes.length,
             edgeCount: this.edges.length,
-            largeEntityMembersHidden: this.largeEntityMembers.size
         };
 
         // Add forensic statistics
-- 
2.39.5


From 0a6d12de9a2c2c50e234d804104c2210c9696287 Mon Sep 17 00:00:00 2001
From: overcuriousity
Date: Fri, 19 Sep 2025 00:38:26 +0200
Subject: [PATCH 2/5] large entity recreation

---
 app.py             |  13 +++---
 core/scanner.py    | 106 ++++++++++++++++++++++++++++++++++++++++++---
 static/js/graph.js | 100 ++++++++++++++++++------------------------
 3 files changed, 147 insertions(+), 72 deletions(-)

diff --git a/app.py b/app.py
index 701f8e8..1dd1693 100644
--- a/app.py
+++ b/app.py
@@ -200,14 +200,13 @@ def extract_from_large_entity():
         if not scanner:
             return jsonify({'success': False, 'error': 'No active session found'}), 404
 
-        # TODO implement
-        #success = scanner.extract_node_from_large_entity(large_entity_id, node_id)
+        success = scanner.extract_node_from_large_entity(large_entity_id, node_id)
 
-        #if success:
-        #    session_manager.update_session_scanner(user_session_id, scanner)
-        #    return jsonify({'success': True, 'message': f'Node {node_id} extracted successfully.'})
-        #else:
-        #    return jsonify({'success': False, 'error': f'Failed to extract node {node_id}.'}), 500
+        if success:
+            session_manager.update_session_scanner(user_session_id, scanner)
+            return jsonify({'success': True, 'message': f'Node {node_id} extracted successfully.'})
+        else:
+            return jsonify({'success': False, 'error': f'Failed to extract node {node_id}.'}), 500
 
     except Exception as e:
         traceback.print_exc()
diff --git a/core/scanner.py b/core/scanner.py
index 43c9d9d..34e00d8 100644
--- a/core/scanner.py
+++ b/core/scanner.py
@@ -821,6 +821,88 @@ class Scanner:
             self._update_provider_state(target, provider_name, 'failed', 0, str(e), start_time)
             return None
 
+    def _create_large_entity_from_result(self, source_node: str, provider_name: str,
+                                         provider_result: ProviderResult, depth: int) -> Set[str]:
+        """
+        Creates a large entity node and tags all member nodes.
+        """
+        members = {rel.target_node for rel in provider_result.relationships
+                   if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)}
+
+        if not members:
+            return set()
+
+        large_entity_id = f"le_{provider_name}_{source_node}"
+
+        # Add the large entity node to the graph
+        self.graph.add_node(
+            node_id=large_entity_id,
+            node_type=NodeType.LARGE_ENTITY,
+            attributes=[
+                {"name": "count", "value": len(members), "type": "statistic"},
+                {"name": "source_provider", "value": provider_name, "type": "metadata"},
+                {"name": "discovery_depth", "value": depth, "type": "metadata"},
+                {"name": "nodes", "value": list(members), "type": "metadata"}
+            ],
+            description=f"A collection of {len(members)} nodes discovered from {source_node} via {provider_name}."
+ ) + + # Create a single edge from the source to the large entity + self.graph.add_edge( + source_node, large_entity_id, + relationship_type=f"{provider_name}_collection", + confidence_score=0.95, + source_provider=provider_name, + raw_data={'description': 'Represents a large collection of nodes.'} + ) + + # Tag each member node with the large entity ID + for member_id in members: + node_type = NodeType.IP if _is_valid_ip(member_id) else NodeType.DOMAIN + self.graph.add_node( + node_id=member_id, + node_type=node_type, + metadata={'large_entity_id': large_entity_id} + ) + + return members + + def extract_node_from_large_entity(self, large_entity_id: str, node_id: str) -> bool: + """ + Removes a node from a large entity, allowing it to be processed normally. + """ + if not self.graph.graph.has_node(node_id): + return False + + node_data = self.graph.graph.nodes[node_id] + metadata = node_data.get('metadata', {}) + + if metadata.get('large_entity_id') == large_entity_id: + # Remove the large entity tag + del metadata['large_entity_id'] + self.graph.add_node(node_id, NodeType(node_data['type']), metadata=metadata) + + # Re-enqueue the node for full processing + is_ip = _is_valid_ip(node_id) + eligible_providers = self._get_eligible_providers(node_id, is_ip, False) + for provider in eligible_providers: + provider_name = provider.get_name() + priority = self._get_priority(provider_name) + # Use current depth of the large entity if available, else 0 + depth = 0 + if self.graph.graph.has_node(large_entity_id): + le_attrs = self.graph.graph.nodes[large_entity_id].get('attributes', []) + depth_attr = next((a for a in le_attrs if a['name'] == 'discovery_depth'), None) + if depth_attr: + depth = depth_attr['value'] + + self.task_queue.put((time.time(), priority, (provider_name, node_id, depth))) + self.total_tasks_ever_enqueued += 1 + + return True + + return False + def _process_provider_result_unified(self, target: str, provider: BaseProvider, provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]: """ @@ -830,7 +912,7 @@ class Scanner: """ provider_name = provider.get_name() discovered_targets = set() - #large_entity_members = set() + large_entity_members = set() if self._is_stop_requested(): return discovered_targets, False @@ -842,11 +924,11 @@ class Scanner: is_large_entity = eligible_relationship_count > self.config.large_entity_threshold - #if is_large_entity: + if is_large_entity: # Create the large entity node and get the set of its members - #large_entity_members = self._create_large_entity_from_result( - # target, provider_name, provider_result, current_depth - #) + large_entity_members = self._create_large_entity_from_result( + target, provider_name, provider_result, current_depth + ) # Process ALL relationships to build the complete underlying data model for i, relationship in enumerate(provider_result.relationships): @@ -885,7 +967,8 @@ class Scanner: # Add all discovered domains/IPs to be considered for further processing if (_is_valid_domain(target_node) or _is_valid_ip(target_node)) and not max_depth_reached: - discovered_targets.add(target_node) + if target_node not in large_entity_members: + discovered_targets.add(target_node) # Process all attributes and add them to the corresponding nodes attributes_by_node = defaultdict(list) @@ -1004,8 +1087,19 @@ class Scanner: eligible = [] target_key = 'ips' if is_ip else 'domains' + # Check if the target is part of a large entity + is_in_large_entity = False + if self.graph.graph.has_node(target): + metadata = 
self.graph.graph.nodes[target].get('metadata', {}) + if 'large_entity_id' in metadata: + is_in_large_entity = True + for provider in self.providers: try: + # If in large entity, only allow dns and correlation providers + if is_in_large_entity and provider.get_name() not in ['dns', 'correlation']: + continue + # Check if provider supports this target type if not provider.get_eligibility().get(target_key, False): continue diff --git a/static/js/graph.js b/static/js/graph.js index 7087eb9..9fb9188 100644 --- a/static/js/graph.js +++ b/static/js/graph.js @@ -1,3 +1,4 @@ +// dnsrecon-reduced/static/js/graph.js /** * Graph visualization module for DNSRecon * Handles network graph rendering using vis.js with proper large entity node hiding @@ -362,77 +363,60 @@ class GraphManager { } try { - // Initialize if not already done if (!this.isInitialized) { this.initialize(); } this.initialTargetIds = new Set(graphData.initial_targets || []); - // Check if we have actual data to display const hasData = graphData.nodes.length > 0 || graphData.edges.length > 0; - // Handle placeholder visibility const placeholder = this.container.querySelector('.graph-placeholder'); if (placeholder) { - if (hasData) { - placeholder.style.display = 'none'; - } else { - placeholder.style.display = 'flex'; - // Early return if no data to process - return; - } + placeholder.style.display = hasData ? 'none' : 'flex'; + } + if (!hasData) { + this.nodes.clear(); + this.edges.clear(); + return; } - // Process nodes with proper certificate coloring - const processedNodes = filteredNodes.map(node => { + const nodeMap = new Map(graphData.nodes.map(node => [node.id, node])); + + // Filter out hidden nodes before processing for rendering + const filteredNodes = graphData.nodes.filter(node => + !(node.metadata && node.metadata.large_entity_id) + ); + + const processedNodes = graphData.nodes.map(node => { const processed = this.processNode(node); - - // Apply certificate-based coloring here in frontend - if (node.type === 'domain' && Array.isArray(node.attributes)) { - const certInfo = this.analyzeCertificateInfo(node.attributes); - - if (certInfo.hasExpiredOnly) { - // Red for domains with only expired/invalid certificates - processed.color = { background: '#ff6b6b', border: '#cc5555' }; - } else if (!certInfo.hasCertificates) { - // Grey for domains with no certificates - processed.color = { background: '#c7c7c7', border: '#999999' }; - } - // Valid certificates use default green (handled by processNode) - } - - return processed; - }); - - const mergedEdges = {}; - graphData.edges.forEach(edge => { - const mergeKey = `${fromNode}-${toNode}-${edge.label}`; - - if (!mergedEdges[mergeKey]) { - mergedEdges[mergeKey] = { - ...edge, - from: fromNode, - to: toNode, - count: 0, - confidence_score: 0 - }; - } - - mergedEdges[mergeKey].count++; - if (edge.confidence_score > mergedEdges[mergeKey].confidence_score) { - mergedEdges[mergeKey].confidence_score = edge.confidence_score; - } - }); - - const processedEdges = Object.values(mergedEdges).map(edge => { - const processed = this.processEdge(edge); - if (edge.count > 1) { - processed.label = `${edge.label} (${edge.count})`; + if (node.metadata && node.metadata.large_entity_id) { + processed.hidden = true; } return processed; }); + + const processedEdges = graphData.edges.map(edge => { + let fromNode = nodeMap.get(edge.from); + let toNode = nodeMap.get(edge.to); + let fromId = edge.from; + let toId = edge.to; + + if (fromNode && fromNode.metadata && fromNode.metadata.large_entity_id) { + 
fromId = fromNode.metadata.large_entity_id; + } + if (toNode && toNode.metadata && toNode.metadata.large_entity_id) { + toId = toNode.metadata.large_entity_id; + } + + // Avoid self-referencing edges from re-routing + if (fromId === toId) { + return null; + } + + const reRoutedEdge = { ...edge, from: fromId, to: toId }; + return this.processEdge(reRoutedEdge); + }).filter(Boolean); // Remove nulls from self-referencing edges - // Update datasets with animation const existingNodeIds = this.nodes.getIds(); const existingEdgeIds = this.edges.getIds(); @@ -449,11 +433,9 @@ class GraphManager { setTimeout(() => this.highlightNewElements(newNodes, newEdges), 100); } - if (processedNodes.length <= 10 || existingNodeIds.length === 0) { + if (this.nodes.length <= 10 || existingNodeIds.length === 0) { setTimeout(() => this.fitView(), 800); } - - console.log(`Graph updated: ${processedNodes.length} nodes, ${processedEdges.length} edges (${newNodes.length} new nodes, ${newEdges.length} new edges)`); } catch (error) { console.error('Failed to update graph:', error); @@ -582,7 +564,7 @@ class GraphManager { processEdge(edge) { const confidence = edge.confidence_score || 0; const processedEdge = { - id: `${edge.from}-${edge.to}`, + id: `${edge.from}-${edge.to}-${edge.label}`, from: edge.from, to: edge.to, label: this.formatEdgeLabel(edge.label, confidence), -- 2.39.5 From eabb532557fcb82c0d779f9f77e386df5a70bb29 Mon Sep 17 00:00:00 2001 From: overcuriousity Date: Fri, 19 Sep 2025 01:10:07 +0200 Subject: [PATCH 3/5] almost fixed --- core/scanner.py | 103 +++++++++++++++++++++++---------------------- static/js/graph.js | 16 ++++--- static/js/main.js | 2 - 3 files changed, 63 insertions(+), 58 deletions(-) diff --git a/core/scanner.py b/core/scanner.py index 34e00d8..728f602 100644 --- a/core/scanner.py +++ b/core/scanner.py @@ -761,37 +761,37 @@ class Scanner: def _process_provider_task(self, provider: BaseProvider, target: str, depth: int) -> Tuple[Set[str], Set[str], bool]: """ Manages the entire process for a given target and provider. - FIXED: Don't enqueue correlation tasks during normal processing. + This version is generalized to handle all relationships dynamically. """ if self._is_stop_requested(): return set(), set(), False - + is_ip = _is_valid_ip(target) target_type = NodeType.IP if is_ip else NodeType.DOMAIN - + self.graph.add_node(target, target_type) self._initialize_provider_states(target) - + new_targets = set() - large_entity_members = set() provider_successful = True - + try: provider_result = self._execute_provider_query(provider, target, is_ip) - + if provider_result is None: provider_successful = False elif not self._is_stop_requested(): + # Pass all relationships to be processed discovered, is_large_entity = self._process_provider_result_unified( target, provider, provider_result, depth ) new_targets.update(discovered) - + except Exception as e: provider_successful = False self._log_provider_error(target, provider.get_name(), str(e)) - - return new_targets, large_entity_members, provider_successful + + return new_targets, set(), provider_successful def _execute_provider_query(self, provider: BaseProvider, target: str, is_ip: bool) -> Optional[ProviderResult]: """ @@ -822,19 +822,18 @@ class Scanner: return None def _create_large_entity_from_result(self, source_node: str, provider_name: str, - provider_result: ProviderResult, depth: int) -> Set[str]: + provider_result: ProviderResult, depth: int) -> Tuple[str, Set[str]]: """ - Creates a large entity node and tags all member nodes. 
+        Creates a large entity node, tags all member nodes, and returns its ID and members.
         """
         members = {rel.target_node for rel in provider_result.relationships
                    if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)}
 
         if not members:
-            return set()
+            return "", set()
 
         large_entity_id = f"le_{provider_name}_{source_node}"
 
-        # Add the large entity node to the graph
         self.graph.add_node(
             node_id=large_entity_id,
             node_type=NodeType.LARGE_ENTITY,
@@ -847,16 +846,6 @@ class Scanner:
             description=f"A collection of {len(members)} nodes discovered from {source_node} via {provider_name}."
         )
 
-        # Create a single edge from the source to the large entity
-        self.graph.add_edge(
-            source_node, large_entity_id,
-            relationship_type=f"{provider_name}_collection",
-            confidence_score=0.95,
-            source_provider=provider_name,
-            raw_data={'description': 'Represents a large collection of nodes.'}
-        )
-
-        # Tag each member node with the large entity ID
         for member_id in members:
             node_type = NodeType.IP if _is_valid_ip(member_id) else NodeType.DOMAIN
             self.graph.add_node(
@@ -865,7 +854,7 @@ class Scanner:
                 metadata={'large_entity_id': large_entity_id}
             )
 
-        return members
+        return large_entity_id, members
 
     def extract_node_from_large_entity(self, large_entity_id: str, node_id: str) -> bool:
         """
@@ -907,70 +896,83 @@ class Scanner:
                                          provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
         """
         Process a unified ProviderResult object to update the graph.
-        Handles large entity creation while ensuring all underlying nodes and edges are
-        added to the graph data model for a complete dataset.
+        This version dynamically re-routes edges to a large entity container.
         """
         provider_name = provider.get_name()
         discovered_targets = set()
+        large_entity_id = ""
         large_entity_members = set()
 
         if self._is_stop_requested():
             return discovered_targets, False
 
-        # Check if a large entity should be created based on the count of domain/IP relationships
-        eligible_relationship_count = sum(
+        eligible_rel_count = sum(
             1 for rel in provider_result.relationships
             if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)
         )
-
-        is_large_entity = eligible_relationship_count > self.config.large_entity_threshold
+        is_large_entity = eligible_rel_count > self.config.large_entity_threshold
 
         if is_large_entity:
-            # Create the large entity node and get the set of its members
-            large_entity_members = self._create_large_entity_from_result(
+            large_entity_id, large_entity_members = self._create_large_entity_from_result(
                 target, provider_name, provider_result, current_depth
             )
 
-        # Process ALL relationships to build the complete underlying data model
        for i, relationship in enumerate(provider_result.relationships):
             if i % 5 == 0 and self._is_stop_requested():
                 break
 
-            source_node = relationship.source_node
-            target_node = relationship.target_node
+            source_node_id = relationship.source_node
+            target_node_id = relationship.target_node
 
-            # Determine node types
-            source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
+            # Determine visual source and target, substituting with large entity ID if necessary
+            visual_source = large_entity_id if source_node_id in large_entity_members else source_node_id
+            visual_target = large_entity_id if target_node_id in large_entity_members else target_node_id
+
+            # Prevent self-loops on the large entity node
+            if visual_source == visual_target:
+                continue
+
+            # Determine node types for the actual nodes
+            source_type = NodeType.IP if _is_valid_ip(source_node_id) else NodeType.DOMAIN
             if provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
                 target_type = NodeType.ISP
             elif provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
                 target_type = NodeType.CA
             elif provider_name == 'correlation':
                 target_type = NodeType.CORRELATION_OBJECT
-            elif _is_valid_ip(target_node):
+            elif _is_valid_ip(target_node_id):
                 target_type = NodeType.IP
             else:
                 target_type = NodeType.DOMAIN
 
             max_depth_reached = current_depth >= self.max_depth
 
-            # Add all nodes and edges to the graph's data model.
-            # The frontend will handle the visual re-routing for large entity members.
-            self.graph.add_node(source_node, source_type)
-            self.graph.add_node(target_node, target_type, metadata={'max_depth_reached': max_depth_reached})
+            # Add actual nodes to the graph (they might be hidden by the UI)
+            self.graph.add_node(source_node_id, source_type)
+            self.graph.add_node(target_node_id, target_type, metadata={'max_depth_reached': max_depth_reached})
+
+            # Add the visual edge to the graph
             self.graph.add_edge(
-                source_node, target_node,
+                visual_source, visual_target,
                 relationship.relationship_type,
                 relationship.confidence,
                 provider_name,
                 relationship.raw_data
             )
+
+            if (_is_valid_domain(target_node_id) or _is_valid_ip(target_node_id)) and not max_depth_reached:
+                if target_node_id not in large_entity_members:
+                    discovered_targets.add(target_node_id)
 
-            # Add all discovered domains/IPs to be considered for further processing
-            if (_is_valid_domain(target_node) or _is_valid_ip(target_node)) and not max_depth_reached:
-                if target_node not in large_entity_members:
-                    discovered_targets.add(target_node)
-
-        # Process all attributes and add them to the corresponding nodes
+        if large_entity_members:
+            self.logger.logger.info(f"Enqueuing DNS and Correlation for {len(large_entity_members)} members of {large_entity_id}")
+            for member in large_entity_members:
+                for provider_name_to_run in ['dns', 'correlation']:
+                    p_instance = next((p for p in self.providers if p.get_name() == provider_name_to_run), None)
+                    if p_instance and p_instance.get_eligibility().get('domains' if _is_valid_domain(member) else 'ips'):
+                        priority = self._get_priority(provider_name_to_run)
+                        self.task_queue.put((time.time(), priority, (provider_name_to_run, member, current_depth)))
+                        self.total_tasks_ever_enqueued += 1
+
         attributes_by_node = defaultdict(list)
         for attribute in provider_result.attributes:
             attr_dict = {
@@ -989,7 +991,6 @@ class Scanner:
 
         return discovered_targets, is_large_entity
 
-
     def stop_scan(self) -> bool:
         """Request immediate scan termination with proper cleanup."""
         try:
diff --git a/static/js/graph.js b/static/js/graph.js
index 9fb9188..3fb5216 100644
--- a/static/js/graph.js
+++ b/static/js/graph.js
@@ -1565,14 +1565,20 @@ class GraphManager {
     }
 
     /**
-     * Unhide all hidden nodes
+     * Unhide all hidden nodes, excluding those within a large entity.
      */
     unhideAll() {
-        const allNodes = this.nodes.get({
-            filter: (node) => node.hidden === true
+        const allHiddenNodes = this.nodes.get({
+            filter: (node) => {
+                // Condition: Node is hidden AND it is NOT part of a large entity.
+ return node.hidden === true && !(node.metadata && node.metadata.large_entity_id); + } }); - const updates = allNodes.map(node => ({ id: node.id, hidden: false })); - this.nodes.update(updates); + + if (allHiddenNodes.length > 0) { + const updates = allHiddenNodes.map(node => ({ id: node.id, hidden: false })); + this.nodes.update(updates); + } } } diff --git a/static/js/main.js b/static/js/main.js index c318aae..c073f7b 100644 --- a/static/js/main.js +++ b/static/js/main.js @@ -1997,8 +1997,6 @@ class DNSReconApp { if (response.success) { this.showSuccess(response.message); - this.hideModal(); - // If the scanner was idle, it's now running. Start polling to see the new node appear. if (this.scanStatus === 'idle') { this.startPolling(1000); -- 2.39.5 From 7472e6f416bbf8ca8b69f76b86697804e619f018 Mon Sep 17 00:00:00 2001 From: overcuriousity Date: Fri, 19 Sep 2025 12:35:28 +0200 Subject: [PATCH 4/5] fixes to hint for incomplete data --- providers/crtsh_provider.py | 41 +++++++++++++++++++++++++++++++-- static/js/graph.js | 29 ++++++++++++++++++++--- static/js/main.js | 46 ++++++++++++++++++++++++++++++++----- 3 files changed, 105 insertions(+), 11 deletions(-) diff --git a/providers/crtsh_provider.py b/providers/crtsh_provider.py index 731cfd2..e4bda95 100644 --- a/providers/crtsh_provider.py +++ b/providers/crtsh_provider.py @@ -3,7 +3,7 @@ import json import re from pathlib import Path -from typing import List, Dict, Any, Set +from typing import List, Dict, Any, Set, Optional from urllib.parse import quote from datetime import datetime, timezone import requests @@ -285,6 +285,17 @@ class CrtShProvider(BaseProvider): if self._stop_event and self._stop_event.is_set(): self.logger.logger.info(f"CrtSh processing cancelled before processing for domain: {query_domain}") return result + + incompleteness_warning = self._check_for_incomplete_data(query_domain, certificates) + if incompleteness_warning: + result.add_attribute( + target_node=query_domain, + name="crtsh_data_warning", + value=incompleteness_warning, + attr_type='metadata', + provider=self.name, + confidence=1.0 + ) all_discovered_domains = set() processed_issuers = set() @@ -577,4 +588,30 @@ class CrtShProvider(BaseProvider): elif query_domain.endswith(f'.{cert_domain}'): return 'parent_domain' else: - return 'related_domain' \ No newline at end of file + return 'related_domain' + + def _check_for_incomplete_data(self, domain: str, certificates: List[Dict[str, Any]]) -> Optional[str]: + """ + Analyzes the certificate list to heuristically detect if the data from crt.sh is incomplete. + """ + cert_count = len(certificates) + + # Heuristic 1: Check if the number of certs hits a known hard limit. + if cert_count >= 10000: + return f"Result likely truncated; received {cert_count} certificates, which may be the maximum limit." + + # Heuristic 2: Check if all returned certificates are old. + if cert_count > 1000: # Only apply this for a reasonable number of certs + latest_expiry = None + for cert in certificates: + try: + not_after = self._parse_certificate_date(cert.get('not_after')) + if latest_expiry is None or not_after > latest_expiry: + latest_expiry = not_after + except (ValueError, TypeError): + continue + + if latest_expiry and (datetime.now(timezone.utc) - latest_expiry).days > 365: + return f"Incomplete data suspected: The latest certificate expired more than a year ago ({latest_expiry.strftime('%Y-%m-%d')})." 
+ + return None \ No newline at end of file diff --git a/static/js/graph.js b/static/js/graph.js index 3fb5216..89efeae 100644 --- a/static/js/graph.js +++ b/static/js/graph.js @@ -1565,19 +1565,42 @@ class GraphManager { } /** - * Unhide all hidden nodes, excluding those within a large entity. + * FIXED: Unhide all hidden nodes, excluding large entity members and disconnected nodes. + * This prevents orphaned large entity members from appearing as free-floating nodes. */ unhideAll() { const allHiddenNodes = this.nodes.get({ filter: (node) => { - // Condition: Node is hidden AND it is NOT part of a large entity. - return node.hidden === true && !(node.metadata && node.metadata.large_entity_id); + // Skip nodes that are part of a large entity + if (node.metadata && node.metadata.large_entity_id) { + return false; + } + + // Skip nodes that are not hidden + if (node.hidden !== true) { + return false; + } + + // Skip nodes that have no edges (would appear disconnected) + const nodeId = node.id; + const hasIncomingEdges = this.edges.get().some(edge => edge.to === nodeId && !edge.hidden); + const hasOutgoingEdges = this.edges.get().some(edge => edge.from === nodeId && !edge.hidden); + + if (!hasIncomingEdges && !hasOutgoingEdges) { + console.log(`Skipping disconnected node ${nodeId} from unhide`); + return false; + } + + return true; } }); if (allHiddenNodes.length > 0) { + console.log(`Unhiding ${allHiddenNodes.length} nodes with valid connections`); const updates = allHiddenNodes.map(node => ({ id: node.id, hidden: false })); this.nodes.update(updates); + } else { + console.log('No eligible nodes to unhide'); } } diff --git a/static/js/main.js b/static/js/main.js index c073f7b..cc0c148 100644 --- a/static/js/main.js +++ b/static/js/main.js @@ -1397,28 +1397,62 @@ class DNSReconApp { } /** - * UPDATED: Generate details for standard nodes with organized attribute grouping + * UPDATED: Generate details for standard nodes with organized attribute grouping and data warnings */ generateStandardNodeDetails(node) { let html = ''; - + + // Check for and display a crt.sh data warning if it exists + const crtshWarningAttr = this.findAttributeByName(node.attributes, 'crtsh_data_warning'); + if (crtshWarningAttr) { + html += ` + + `; + } + // Relationships sections html += this.generateRelationshipsSection(node); - + // UPDATED: Enhanced attributes section with intelligent grouping (no formatting) if (node.attributes && Array.isArray(node.attributes) && node.attributes.length > 0) { html += this.generateOrganizedAttributesSection(node.attributes, node.type); } - + // Description section html += this.generateDescriptionSection(node); - + // Metadata section (collapsed by default) html += this.generateMetadataSection(node); - + return html; } + /** + * Helper method to find an attribute by name in the standardized attributes list + * @param {Array} attributes - List of StandardAttribute objects + * @param {string} name - Attribute name to find + * @returns {Object|null} The attribute object if found, null otherwise + */ + findAttributeByName(attributes, name) { + if (!Array.isArray(attributes)) { + return null; + } + return attributes.find(attr => attr.name === name) || null; + } + generateOrganizedAttributesSection(attributes, nodeType) { if (!Array.isArray(attributes) || attributes.length === 0) { return ''; -- 2.39.5 From 8d402ab4b1155874c753a49a233a752f08675763 Mon Sep 17 00:00:00 2001 From: overcuriousity Date: Fri, 19 Sep 2025 14:28:37 +0200 Subject: [PATCH 5/5] postgres --- 
 providers/crtsh_provider.py | 84 +++++++++++++++++++++++++++++++++++--
 requirements.txt            |  3 +-
 2 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/providers/crtsh_provider.py b/providers/crtsh_provider.py
index e4bda95..c946c74 100644
--- a/providers/crtsh_provider.py
+++ b/providers/crtsh_provider.py
@@ -2,15 +2,37 @@
 
 import json
 import re
+import psycopg2
 from pathlib import Path
 from typing import List, Dict, Any, Set, Optional
 from urllib.parse import quote
 from datetime import datetime, timezone
 import requests
+from psycopg2 import pool
 
 from .base_provider import BaseProvider
 from core.provider_result import ProviderResult
 from utils.helpers import _is_valid_domain
+from core.logger import get_forensic_logger
+
+# --- Global Instance for PostgreSQL Connection Pool ---
+# This pool will be created once per worker process and is not part of the
+# CrtShProvider instance, thus avoiding pickling errors.
+db_pool = None
+try:
+    db_pool = psycopg2.pool.SimpleConnectionPool(
+        1, 5,
+        host='crt.sh',
+        port=5432,
+        user='guest',
+        dbname='certwatch',
+        sslmode='prefer',
+        connect_timeout=60
+    )
+    # Use a generic logger here as this is at the module level
+    get_forensic_logger().logger.info("crt.sh: Global PostgreSQL connection pool created successfully.")
+except Exception as e:
+    get_forensic_logger().logger.warning(f"crt.sh: Failed to create global DB connection pool: {e}. Will fall back to HTTP API.")
 
 
 class CrtShProvider(BaseProvider):
@@ -37,7 +59,7 @@ class CrtShProvider(BaseProvider):
 
         # Compile regex for date filtering for efficiency
         self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
-    
+
     def get_name(self) -> str:
         """Return the provider name."""
         return "crtsh"
@@ -121,7 +143,7 @@ class CrtShProvider(BaseProvider):
 
             else: # "stale" or "not_found"
                 # Query the API for the latest certificates
-                new_raw_certs = self._query_crtsh_api(domain)
+                new_raw_certs = self._query_crtsh(domain)
 
                 if self._stop_event and self._stop_event.is_set():
                     return ProviderResult()
@@ -152,8 +174,8 @@ class CrtShProvider(BaseProvider):
                 # Save the new result and the raw data to the cache
                 self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain)
 
-        except requests.exceptions.RequestException as e:
-            self.logger.logger.error(f"API query failed for {domain}: {e}")
+        except (requests.exceptions.RequestException, psycopg2.Error) as e:
+            self.logger.logger.error(f"Upstream query failed for {domain}: {e}")
             if cache_status != "not_found":
                 result = self._load_from_cache(cache_file)
                 self.logger.logger.warning(f"Using stale cache for {domain} due to API failure.")
@@ -255,6 +277,58 @@ class CrtShProvider(BaseProvider):
                 json.dump(cache_data, f, separators=(',', ':'), default=str)
         except Exception as e:
             self.logger.logger.warning(f"Failed to save cache file for {domain}: {e}")
+
+    def _query_crtsh(self, domain: str) -> List[Dict[str, Any]]:
+        """Query crt.sh, trying the database first and falling back to the API."""
+        global db_pool
+        if db_pool:
+            try:
+                self.logger.logger.info(f"crt.sh: Attempting DB query for {domain}")
+                return self._query_crtsh_db(domain)
+            except psycopg2.Error as e:
+                self.logger.logger.warning(f"crt.sh: DB query failed for {domain}: {e}. Falling back to HTTP API.")
+                return self._query_crtsh_api(domain)
+        else:
+            self.logger.logger.info(f"crt.sh: No DB connection pool. Using HTTP API for {domain}")
+            return self._query_crtsh_api(domain)
+
+    def _query_crtsh_db(self, domain: str) -> List[Dict[str, Any]]:
+        """Query crt.sh database for raw certificate data."""
+        global db_pool
+        conn = db_pool.getconn()
+        try:
+            with conn.cursor() as cursor:
+                query = """
+                    SELECT
+                        c.id,
+                        x509_serialnumber(c.certificate) as serial_number,
+                        x509_notbefore(c.certificate) as not_before,
+                        x509_notafter(c.certificate) as not_after,
+                        c.issuer_ca_id,
+                        ca.name as issuer_name,
+                        x509_commonname(c.certificate) as common_name,
+                        identities(c.certificate)::text as name_value
+                    FROM certificate c
+                    LEFT JOIN ca ON c.issuer_ca_id = ca.id
+                    WHERE identities(c.certificate) @@ plainto_tsquery(%s)
+                    ORDER BY c.id DESC
+                    LIMIT 5000;
+                """
+                cursor.execute(query, (domain,))
+
+                results = []
+                columns = [desc[0] for desc in cursor.description]
+                for row in cursor.fetchall():
+                    row_dict = dict(zip(columns, row))
+                    if row_dict.get('not_before'):
+                        row_dict['not_before'] = row_dict['not_before'].isoformat()
+                    if row_dict.get('not_after'):
+                        row_dict['not_after'] = row_dict['not_after'].isoformat()
+                    results.append(row_dict)
+                self.logger.logger.info(f"crt.sh: DB query for {domain} returned {len(results)} records.")
+                return results
+        finally:
+            db_pool.putconn(conn)
 
     def _query_crtsh_api(self, domain: str) -> List[Dict[str, Any]]:
         """Query crt.sh API for raw certificate data."""
@@ -468,6 +542,8 @@ class CrtShProvider(BaseProvider):
             raise ValueError("Empty date string")
 
         try:
+            if isinstance(date_string, datetime):
+                return date_string.replace(tzinfo=timezone.utc)
             if date_string.endswith('Z'):
                 return datetime.fromisoformat(date_string[:-1]).replace(tzinfo=timezone.utc)
             elif '+' in date_string or date_string.endswith('UTC'):
diff --git a/requirements.txt b/requirements.txt
index d46c0bc..4ec5adb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,5 @@ urllib3
 dnspython
 gunicorn
 redis
-python-dotenv
\ No newline at end of file
+python-dotenv
+psycopg2-binary
\ No newline at end of file
-- 
2.39.5
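
Note on the PATCH 5/5 DB path: the pool above targets crt.sh's public PostgreSQL
interface (host crt.sh, port 5432, user guest, dbname certwatch). A minimal
standalone sketch for checking that path outside the provider follows; it reuses
the same identities()/plainto_tsquery query shape as _query_crtsh_db, but the
single ad-hoc connection, the helper name fetch_cert_ids, and the small LIMIT are
illustrative assumptions, not part of the series.

    # verify_crtsh_db.py -- connectivity sketch (assumption: guest access to
    # crt.sh:5432/certwatch is reachable from your network)
    import psycopg2

    def fetch_cert_ids(domain: str, limit: int = 10):
        # One throwaway connection instead of the patch's SimpleConnectionPool.
        conn = psycopg2.connect(
            host='crt.sh', port=5432, user='guest',
            dbname='certwatch', sslmode='prefer', connect_timeout=10
        )
        try:
            with conn.cursor() as cur:
                # Same full-text match the provider uses in _query_crtsh_db.
                cur.execute(
                    "SELECT c.id FROM certificate c "
                    "WHERE identities(c.certificate) @@ plainto_tsquery(%s) "
                    "ORDER BY c.id DESC LIMIT %s;",
                    (domain, limit)
                )
                return [row[0] for row in cur.fetchall()]
        finally:
            conn.close()

    if __name__ == '__main__':
        print(fetch_cert_ids('example.com'))

If this raises a psycopg2 error, the patched provider would take the same route
as _query_crtsh's exception branch and fall back to the HTTP API.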