From bcd79ae2f5f25af3d2c5f9e338727e74e8041209 Mon Sep 17 00:00:00 2001
From: overcuriousity
Date: Sat, 20 Sep 2025 18:19:10 +0200
Subject: [PATCH] some fixes for UX, correlation engine

---
 core/scanner.py                   |   9 +-
 providers/correlation_provider.py |   1 +
 providers/crtsh_provider.py       | 184 ++++++++----------------------
 static/css/main.css               |   1 +
 static/js/main.js                 |  27 ++++-
 5 files changed, 76 insertions(+), 146 deletions(-)

diff --git a/core/scanner.py b/core/scanner.py
index 728f602..41b8a96 100644
--- a/core/scanner.py
+++ b/core/scanner.py
@@ -605,11 +605,6 @@ class Scanner:
                         processing_key = (provider_name, target_item)
                         self.currently_processing.discard(processing_key)
 
-            # PHASE 2: Run correlations on all discovered nodes
-            if not self._is_stop_requested():
-                print(f"\n=== PHASE 2: Running correlation analysis ===")
-                self._run_correlation_phase(max_depth, processed_tasks)
-
         except Exception as e:
             traceback.print_exc()
             self.status = ScanStatus.FAILED
@@ -633,6 +628,10 @@ class Scanner:
             else:
                 self.status = ScanStatus.COMPLETED
 
+            if self.status in [ScanStatus.COMPLETED, ScanStatus.STOPPED]:
+                print(f"\n=== PHASE 2: Running correlation analysis ===")
+                self._run_correlation_phase(max_depth, processed_tasks)
+
             self.status_logger_stop_event.set()
             if self.status_logger_thread and self.status_logger_thread.is_alive():
                 self.status_logger_thread.join(timeout=2.0)
diff --git a/providers/correlation_provider.py b/providers/correlation_provider.py
index 35bb3d4..9ae6eeb 100644
--- a/providers/correlation_provider.py
+++ b/providers/correlation_provider.py
@@ -27,6 +27,7 @@ class CorrelationProvider(BaseProvider):
         'cert_validity_period_days',
         'cert_issuer_name',
         'cert_entry_timestamp',
+        'cert_serial_number',  # not useful for correlation
         'cert_not_before',
         'cert_not_after',
         'dns_ttl',
diff --git a/providers/crtsh_provider.py b/providers/crtsh_provider.py
index c946c74..d2692a9 100644
--- a/providers/crtsh_provider.py
+++ b/providers/crtsh_provider.py
@@ -2,43 +2,22 @@
 import json
 import re
-import psycopg2
 from pathlib import Path
 from typing import List, Dict, Any, Set, Optional
 from urllib.parse import quote
 from datetime import datetime, timezone
 import requests
-from psycopg2 import pool
 
 from .base_provider import BaseProvider
 from core.provider_result import ProviderResult
 from utils.helpers import _is_valid_domain
 from core.logger import get_forensic_logger
 
 
-# --- Global Instance for PostgreSQL Connection Pool ---
-# This pool will be created once per worker process and is not part of the
-# CrtShProvider instance, thus avoiding pickling errors.
-db_pool = None
-try:
-    db_pool = psycopg2.pool.SimpleConnectionPool(
-        1, 5,
-        host='crt.sh',
-        port=5432,
-        user='guest',
-        dbname='certwatch',
-        sslmode='prefer',
-        connect_timeout=60
-    )
-    # Use a generic logger here as this is at the module level
-    get_forensic_logger().logger.info("crt.sh: Global PostgreSQL connection pool created successfully.")
-except Exception as e:
-    get_forensic_logger().logger.warning(f"crt.sh: Failed to create global DB connection pool: {e}. Will fall back to HTTP API.")
-
-
 class CrtShProvider(BaseProvider):
     """
     Provider for querying crt.sh certificate transparency database.
     FIXED: Now properly creates domain and CA nodes instead of large entities.
+    REMOVED: All PostgreSQL logic; the provider now relies exclusively on the HTTP API for stability.
     Returns standardized ProviderResult objects with caching support.
""" @@ -53,7 +32,7 @@ class CrtShProvider(BaseProvider): self.base_url = "https://crt.sh/" self._stop_event = None - # Initialize cache directory (separate from BaseProvider's HTTP cache) + # Initialize cache directory self.domain_cache_dir = Path('cache') / 'crtsh' self.domain_cache_dir.mkdir(parents=True, exist_ok=True) @@ -116,8 +95,7 @@ class CrtShProvider(BaseProvider): def query_domain(self, domain: str) -> ProviderResult: """ - FIXED: Query crt.sh for certificates containing the domain. - Now properly creates domain and CA nodes instead of large entities. + Query crt.sh for certificates containing the domain via HTTP API. Args: domain: Domain to investigate @@ -140,60 +118,56 @@ class CrtShProvider(BaseProvider): if cache_status == "fresh": result = self._load_from_cache(cache_file) self.logger.logger.info(f"Using fresh cached crt.sh data for {domain}") + return result - else: # "stale" or "not_found" - # Query the API for the latest certificates - new_raw_certs = self._query_crtsh(domain) + # For "stale" or "not_found", we must query the API. + new_raw_certs = self._query_crtsh_api(domain) + + if self._stop_event and self._stop_event.is_set(): + return ProviderResult() + + # Combine with old data if cache is stale + if cache_status == "stale": + old_raw_certs = self._load_raw_data_from_cache(cache_file) + combined_certs = old_raw_certs + new_raw_certs - if self._stop_event and self._stop_event.is_set(): - return ProviderResult() - - # Combine with old data if cache is stale - if cache_status == "stale": - old_raw_certs = self._load_raw_data_from_cache(cache_file) - combined_certs = old_raw_certs + new_raw_certs - - # Deduplicate the combined list - seen_ids = set() - unique_certs = [] - for cert in combined_certs: - cert_id = cert.get('id') - if cert_id not in seen_ids: - unique_certs.append(cert) - seen_ids.add(cert_id) - - raw_certificates_to_process = unique_certs - self.logger.logger.info(f"Refreshed and merged cache for {domain}. Total unique certs: {len(raw_certificates_to_process)}") - else: # "not_found" - raw_certificates_to_process = new_raw_certs + # Deduplicate the combined list + seen_ids = set() + unique_certs = [] + for cert in combined_certs: + cert_id = cert.get('id') + if cert_id not in seen_ids: + unique_certs.append(cert) + seen_ids.add(cert_id) - # FIXED: Process certificates to create proper domain and CA nodes - result = self._process_certificates_to_result_fixed(domain, raw_certificates_to_process) - self.logger.logger.info(f"Created fresh result for {domain} ({result.get_relationship_count()} relationships)") + raw_certificates_to_process = unique_certs + self.logger.logger.info(f"Refreshed and merged cache for {domain}. 
Total unique certs: {len(raw_certificates_to_process)}") + else: # "not_found" + raw_certificates_to_process = new_raw_certs + + result = self._process_certificates_to_result_fixed(domain, raw_certificates_to_process) + self.logger.logger.info(f"Created fresh result for {domain} ({result.get_relationship_count()} relationships)") - # Save the new result and the raw data to the cache - self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain) + # Save the new result and the raw data to the cache + self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain) - except (requests.exceptions.RequestException, psycopg2.Error) as e: + except requests.exceptions.RequestException as e: self.logger.logger.error(f"Upstream query failed for {domain}: {e}") - if cache_status != "not_found": - result = self._load_from_cache(cache_file) - self.logger.logger.warning(f"Using stale cache for {domain} due to API failure.") - else: - raise e # Re-raise if there's no cache to fall back on + # **BUG FIX:** Always re-raise the exception. This signals a failure to the + # scanner, allowing its retry logic to handle the transient error. + raise e return result def query_ip(self, ip: str) -> ProviderResult: """ - Query crt.sh for certificates containing the IP address. - Note: crt.sh doesn't typically index by IP, so this returns empty results. + crt.sh does not support IP-based certificate queries effectively via its API. Args: ip: IP address to investigate Returns: - Empty ProviderResult (crt.sh doesn't support IP-based certificate queries effectively) + Empty ProviderResult """ return ProviderResult() @@ -277,59 +251,7 @@ class CrtShProvider(BaseProvider): json.dump(cache_data, f, separators=(',', ':'), default=str) except Exception as e: self.logger.logger.warning(f"Failed to save cache file for {domain}: {e}") - - def _query_crtsh(self, domain: str) -> List[Dict[str, Any]]: - """Query crt.sh, trying the database first and falling back to the API.""" - global db_pool - if db_pool: - try: - self.logger.logger.info(f"crt.sh: Attempting DB query for {domain}") - return self._query_crtsh_db(domain) - except psycopg2.Error as e: - self.logger.logger.warning(f"crt.sh: DB query failed for {domain}: {e}. Falling back to HTTP API.") - return self._query_crtsh_api(domain) - else: - self.logger.logger.info(f"crt.sh: No DB connection pool. 
Using HTTP API for {domain}") - return self._query_crtsh_api(domain) - - def _query_crtsh_db(self, domain: str) -> List[Dict[str, Any]]: - """Query crt.sh database for raw certificate data.""" - global db_pool - conn = db_pool.getconn() - try: - with conn.cursor() as cursor: - query = """ - SELECT - c.id, - x509_serialnumber(c.certificate) as serial_number, - x509_notbefore(c.certificate) as not_before, - x509_notafter(c.certificate) as not_after, - c.issuer_ca_id, - ca.name as issuer_name, - x509_commonname(c.certificate) as common_name, - identities(c.certificate)::text as name_value - FROM certificate c - LEFT JOIN ca ON c.issuer_ca_id = ca.id - WHERE identities(c.certificate) @@ plainto_tsquery(%s) - ORDER BY c.id DESC - LIMIT 5000; - """ - cursor.execute(query, (domain,)) - - results = [] - columns = [desc[0] for desc in cursor.description] - for row in cursor.fetchall(): - row_dict = dict(zip(columns, row)) - if row_dict.get('not_before'): - row_dict['not_before'] = row_dict['not_before'].isoformat() - if row_dict.get('not_after'): - row_dict['not_after'] = row_dict['not_after'].isoformat() - results.append(row_dict) - self.logger.logger.info(f"crt.sh: DB query for {domain} returned {len(results)} records.") - return results - finally: - db_pool.putconn(conn) - + def _query_crtsh_api(self, domain: str) -> List[Dict[str, Any]]: """Query crt.sh API for raw certificate data.""" url = f"{self.base_url}?q={quote(domain)}&output=json" @@ -351,8 +273,7 @@ class CrtShProvider(BaseProvider): def _process_certificates_to_result_fixed(self, query_domain: str, certificates: List[Dict[str, Any]]) -> ProviderResult: """ - FIXED: Process certificates to create proper domain and CA nodes. - Now creates individual domain nodes instead of large entities. + Process certificates to create proper domain and CA nodes. """ result = ProviderResult() @@ -375,6 +296,7 @@ class CrtShProvider(BaseProvider): processed_issuers = set() for i, cert_data in enumerate(certificates): + # Check for stop event inside the loop to make it responsive. if i % 10 == 0 and self._stop_event and self._stop_event.is_set(): self.logger.logger.info(f"CrtSh processing cancelled at certificate {i} for domain: {query_domain}") break @@ -383,10 +305,9 @@ class CrtShProvider(BaseProvider): cert_domains = self._extract_domains_from_certificate(cert_data) all_discovered_domains.update(cert_domains) - # FIXED: Create CA nodes for certificate issuers (not as domain metadata) + # Create CA nodes for certificate issuers issuer_name = self._parse_issuer_organization(cert_data.get('issuer_name', '')) if issuer_name and issuer_name not in processed_issuers: - # Create relationship from query domain to CA result.add_relationship( source_node=query_domain, target_node=issuer_name, @@ -403,7 +324,6 @@ class CrtShProvider(BaseProvider): if not _is_valid_domain(cert_domain): continue - # Add certificate attributes to the domain for key, value in cert_metadata.items(): if value is not None: result.add_attribute( @@ -415,13 +335,13 @@ class CrtShProvider(BaseProvider): confidence=0.9, metadata={'certificate_id': cert_data.get('id')} ) - + + # Check for stop event before creating final relationships. 
         if self._stop_event and self._stop_event.is_set():
             self.logger.logger.info(f"CrtSh query cancelled before relationship creation for domain: {query_domain}")
             return result
 
-        # FIXED: Create selective relationships to avoid large entities
-        # Only create relationships to domains that are closely related
+        # Create selective relationships to avoid large entities
         for discovered_domain in all_discovered_domains:
             if discovered_domain == query_domain:
                 continue
@@ -429,8 +349,6 @@ class CrtShProvider(BaseProvider):
             if not _is_valid_domain(discovered_domain):
                 continue
-            # FIXED: Only create relationships for domains that share a meaningful connection
-            # This prevents creating too many relationships that trigger large entity creation
             if self._should_create_relationship(query_domain, discovered_domain):
                 confidence = self._calculate_domain_relationship_confidence(
                     query_domain, discovered_domain, [], all_discovered_domains
                 )
@@ -459,18 +377,14 @@ class CrtShProvider(BaseProvider):
     def _should_create_relationship(self, source_domain: str, target_domain: str) -> bool:
         """
-        FIXED: Determine if a relationship should be created between two domains.
-        This helps avoid creating too many relationships that trigger large entity creation.
+        Determine if a relationship should be created between two domains.
         """
 
-        # Always create relationships for subdomains
         if target_domain.endswith(f'.{source_domain}') or source_domain.endswith(f'.{target_domain}'):
             return True
 
-        # Create relationships for domains that share a common parent (up to 2 levels)
         source_parts = source_domain.split('.')
         target_parts = target_domain.split('.')
 
-        # Check if they share the same root domain (last 2 parts)
         if len(source_parts) >= 2 and len(target_parts) >= 2:
             source_root = '.'.join(source_parts[-2:])
             target_root = '.'.join(target_parts[-2:])
@@ -504,7 +418,6 @@ class CrtShProvider(BaseProvider):
             metadata['is_currently_valid'] = self._is_cert_valid(cert_data)
             metadata['expires_soon'] = (not_after - datetime.now(timezone.utc)).days <= 30
 
-        # Keep raw date format or convert to standard format
         metadata['not_before'] = not_before.isoformat()
         metadata['not_after'] = not_after.isoformat()
 
@@ -586,14 +499,12 @@ class CrtShProvider(BaseProvider):
         """Extract all domains from certificate data."""
         domains = set()
 
-        # Extract from common name
         common_name = cert_data.get('common_name', '')
         if common_name:
             cleaned_cn = self._clean_domain_name(common_name)
             if cleaned_cn:
                 domains.update(cleaned_cn)
 
-        # Extract from name_value field (contains SANs)
         name_value = cert_data.get('name_value', '')
         if name_value:
             for line in name_value.split('\n'):
@@ -640,7 +551,6 @@ class CrtShProvider(BaseProvider):
         """Calculate confidence score for domain relationship based on various factors."""
         base_confidence = 0.9
 
-        # Adjust confidence based on domain relationship context
         relationship_context = self._determine_relationship_context(domain2, domain1)
 
         if relationship_context == 'exact_match':
@@ -672,12 +582,10 @@ class CrtShProvider(BaseProvider):
         """
         cert_count = len(certificates)
 
-        # Heuristic 1: Check if the number of certs hits a known hard limit.
         if cert_count >= 10000:
             return f"Result likely truncated; received {cert_count} certificates, which may be the maximum limit."
 
-        # Heuristic 2: Check if all returned certificates are old.
-        if cert_count > 1000: # Only apply this for a reasonable number of certs
+        if cert_count > 1000:
             latest_expiry = None
             for cert in certificates:
                 try:
diff --git a/static/css/main.css b/static/css/main.css
index 1d68ffd..7327d87 100644
--- a/static/css/main.css
+++ b/static/css/main.css
@@ -474,6 +474,7 @@ input[type="text"]:focus, select:focus {
     flex-wrap: wrap;
     gap: 0.75rem;
     align-items: center;
+    max-height: 3rem;
 }
 
 .legend-item {
diff --git a/static/js/main.js b/static/js/main.js
index cc0c148..1d4b63a 100644
--- a/static/js/main.js
+++ b/static/js/main.js
@@ -329,7 +329,7 @@ class DNSReconApp {
             console.log(`Scan started for ${target} with depth ${maxDepth}`);
 
             // Start polling immediately with faster interval for responsiveness
-            this.startPolling(1000);
+            this.startPolling();
 
             // Force an immediate status update
             console.log('Forcing immediate status update...');
@@ -738,7 +738,7 @@ class DNSReconApp {
                 this.setUIState('scanning', task_queue_size);
                 this.showSuccess('Scan is running');
                 // Increase polling frequency for active scans
-                this.startPolling(1000); // Poll every 1 second for running scans
+                this.startPolling(5000); // Poll every 5 seconds for running scans
                 this.updateConnectionStatus('active');
                 break;
 
@@ -2030,7 +2030,7 @@ class DNSReconApp {
             if (response.success) {
                 this.showSuccess(response.message);
-
+                // If the scanner was idle, it's now running. Start polling to see the new node appear.
                 if (this.scanStatus === 'idle') {
                     this.startPolling(1000);
@@ -2039,6 +2039,27 @@ class DNSReconApp {
                     setTimeout(() => this.updateGraph(), 500);
                 }
 
+                // Immediately update the modal view
+                if (this.graphManager) {
+                    const largeEntityNode = this.graphManager.nodes.get(largeEntityId);
+                    if (largeEntityNode && largeEntityNode.attributes) {
+
+                        // Find and update the 'nodes' attribute
+                        const nodesAttribute = largeEntityNode.attributes.find(attr => attr.name === 'nodes');
+                        if (nodesAttribute && Array.isArray(nodesAttribute.value)) {
+                            nodesAttribute.value = nodesAttribute.value.filter(id => id !== nodeId);
+                        }
+
+                        // Find and update the 'count' attribute
+                        const countAttribute = largeEntityNode.attributes.find(attr => attr.name === 'count');
+                        if (countAttribute) {
+                            countAttribute.value = (countAttribute.value || 0) - 1;
+                        }
+
+                        // Re-render the modal with the updated data
+                        this.showNodeModal(largeEntityNode);
+                    }
+                }
             } else {
                 throw new Error(response.error || 'Extraction failed on the server.');
             }