some fixes for UX, correlation engine

This commit is contained in:
overcuriousity 2025-09-20 18:19:10 +02:00
parent 3ee23c9d05
commit bcd79ae2f5
5 changed files with 76 additions and 146 deletions

View File

@ -605,11 +605,6 @@ class Scanner:
processing_key = (provider_name, target_item) processing_key = (provider_name, target_item)
self.currently_processing.discard(processing_key) self.currently_processing.discard(processing_key)
# PHASE 2: Run correlations on all discovered nodes
if not self._is_stop_requested():
print(f"\n=== PHASE 2: Running correlation analysis ===")
self._run_correlation_phase(max_depth, processed_tasks)
except Exception as e: except Exception as e:
traceback.print_exc() traceback.print_exc()
self.status = ScanStatus.FAILED self.status = ScanStatus.FAILED
@ -633,6 +628,10 @@ class Scanner:
else: else:
self.status = ScanStatus.COMPLETED self.status = ScanStatus.COMPLETED
if self.status in [ScanStatus.COMPLETED, ScanStatus.STOPPED]:
print(f"\n=== PHASE 2: Running correlation analysis ===")
self._run_correlation_phase(max_depth, processed_tasks)
self.status_logger_stop_event.set() self.status_logger_stop_event.set()
if self.status_logger_thread and self.status_logger_thread.is_alive(): if self.status_logger_thread and self.status_logger_thread.is_alive():
self.status_logger_thread.join(timeout=2.0) self.status_logger_thread.join(timeout=2.0)

View File

@ -27,6 +27,7 @@ class CorrelationProvider(BaseProvider):
'cert_validity_period_days', 'cert_validity_period_days',
'cert_issuer_name', 'cert_issuer_name',
'cert_entry_timestamp', 'cert_entry_timestamp',
'cert_serial_number', # excluded: serial numbers are unique per certificate, so they never produce meaningful correlations
'cert_not_before', 'cert_not_before',
'cert_not_after', 'cert_not_after',
'dns_ttl', 'dns_ttl',

View File

@ -2,43 +2,22 @@
import json import json
import re import re
import psycopg2
from pathlib import Path from pathlib import Path
from typing import List, Dict, Any, Set, Optional from typing import List, Dict, Any, Set, Optional
from urllib.parse import quote from urllib.parse import quote
from datetime import datetime, timezone from datetime import datetime, timezone
import requests import requests
from psycopg2 import pool
from .base_provider import BaseProvider from .base_provider import BaseProvider
from core.provider_result import ProviderResult from core.provider_result import ProviderResult
from utils.helpers import _is_valid_domain from utils.helpers import _is_valid_domain
from core.logger import get_forensic_logger from core.logger import get_forensic_logger
# --- Global Instance for PostgreSQL Connection Pool ---
# This pool will be created once per worker process and is not part of the
# CrtShProvider instance, thus avoiding pickling errors.
db_pool = None
try:
db_pool = psycopg2.pool.SimpleConnectionPool(
1, 5,
host='crt.sh',
port=5432,
user='guest',
dbname='certwatch',
sslmode='prefer',
connect_timeout=60
)
# Use a generic logger here as this is at the module level
get_forensic_logger().logger.info("crt.sh: Global PostgreSQL connection pool created successfully.")
except Exception as e:
get_forensic_logger().logger.warning(f"crt.sh: Failed to create global DB connection pool: {e}. Will fall back to HTTP API.")
class CrtShProvider(BaseProvider): class CrtShProvider(BaseProvider):
""" """
Provider for querying crt.sh certificate transparency database. Provider for querying crt.sh certificate transparency database.
FIXED: Now properly creates domain and CA nodes instead of large entities. FIXED: Now properly creates domain and CA nodes instead of large entities.
REMOVED: All PostgreSQL logic to rely exclusively on the HTTP API for stability.
Returns standardized ProviderResult objects with caching support. Returns standardized ProviderResult objects with caching support.
""" """
@ -53,7 +32,7 @@ class CrtShProvider(BaseProvider):
self.base_url = "https://crt.sh/" self.base_url = "https://crt.sh/"
self._stop_event = None self._stop_event = None
# Initialize cache directory (separate from BaseProvider's HTTP cache) # Initialize cache directory
self.domain_cache_dir = Path('cache') / 'crtsh' self.domain_cache_dir = Path('cache') / 'crtsh'
self.domain_cache_dir.mkdir(parents=True, exist_ok=True) self.domain_cache_dir.mkdir(parents=True, exist_ok=True)
@ -116,8 +95,7 @@ class CrtShProvider(BaseProvider):
def query_domain(self, domain: str) -> ProviderResult: def query_domain(self, domain: str) -> ProviderResult:
""" """
FIXED: Query crt.sh for certificates containing the domain. Query crt.sh for certificates containing the domain via HTTP API.
Now properly creates domain and CA nodes instead of large entities.
Args: Args:
domain: Domain to investigate domain: Domain to investigate
@ -140,10 +118,10 @@ class CrtShProvider(BaseProvider):
if cache_status == "fresh": if cache_status == "fresh":
result = self._load_from_cache(cache_file) result = self._load_from_cache(cache_file)
self.logger.logger.info(f"Using fresh cached crt.sh data for {domain}") self.logger.logger.info(f"Using fresh cached crt.sh data for {domain}")
return result
else: # "stale" or "not_found" # For "stale" or "not_found", we must query the API.
# Query the API for the latest certificates new_raw_certs = self._query_crtsh_api(domain)
new_raw_certs = self._query_crtsh(domain)
if self._stop_event and self._stop_event.is_set(): if self._stop_event and self._stop_event.is_set():
return ProviderResult() return ProviderResult()
@ -167,33 +145,29 @@ class CrtShProvider(BaseProvider):
else: # "not_found" else: # "not_found"
raw_certificates_to_process = new_raw_certs raw_certificates_to_process = new_raw_certs
# FIXED: Process certificates to create proper domain and CA nodes
result = self._process_certificates_to_result_fixed(domain, raw_certificates_to_process) result = self._process_certificates_to_result_fixed(domain, raw_certificates_to_process)
self.logger.logger.info(f"Created fresh result for {domain} ({result.get_relationship_count()} relationships)") self.logger.logger.info(f"Created fresh result for {domain} ({result.get_relationship_count()} relationships)")
# Save the new result and the raw data to the cache # Save the new result and the raw data to the cache
self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain) self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain)
except (requests.exceptions.RequestException, psycopg2.Error) as e: except requests.exceptions.RequestException as e:
self.logger.logger.error(f"Upstream query failed for {domain}: {e}") self.logger.logger.error(f"Upstream query failed for {domain}: {e}")
if cache_status != "not_found": # **BUG FIX:** Always re-raise the exception. This signals a failure to the
result = self._load_from_cache(cache_file) # scanner, allowing its retry logic to handle the transient error.
self.logger.logger.warning(f"Using stale cache for {domain} due to API failure.") raise e
else:
raise e # Re-raise if there's no cache to fall back on
return result return result
def query_ip(self, ip: str) -> ProviderResult: def query_ip(self, ip: str) -> ProviderResult:
""" """
Query crt.sh for certificates containing the IP address. crt.sh does not support IP-based certificate queries effectively via its API.
Note: crt.sh doesn't typically index by IP, so this returns empty results.
Args: Args:
ip: IP address to investigate ip: IP address to investigate
Returns: Returns:
Empty ProviderResult (crt.sh doesn't support IP-based certificate queries effectively) Empty ProviderResult
""" """
return ProviderResult() return ProviderResult()
@ -278,58 +252,6 @@ class CrtShProvider(BaseProvider):
except Exception as e: except Exception as e:
self.logger.logger.warning(f"Failed to save cache file for {domain}: {e}") self.logger.logger.warning(f"Failed to save cache file for {domain}: {e}")
def _query_crtsh(self, domain: str) -> List[Dict[str, Any]]:
"""Query crt.sh, trying the database first and falling back to the API."""
global db_pool
if db_pool:
try:
self.logger.logger.info(f"crt.sh: Attempting DB query for {domain}")
return self._query_crtsh_db(domain)
except psycopg2.Error as e:
self.logger.logger.warning(f"crt.sh: DB query failed for {domain}: {e}. Falling back to HTTP API.")
return self._query_crtsh_api(domain)
else:
self.logger.logger.info(f"crt.sh: No DB connection pool. Using HTTP API for {domain}")
return self._query_crtsh_api(domain)
def _query_crtsh_db(self, domain: str) -> List[Dict[str, Any]]:
"""Query crt.sh database for raw certificate data."""
global db_pool
conn = db_pool.getconn()
try:
with conn.cursor() as cursor:
query = """
SELECT
c.id,
x509_serialnumber(c.certificate) as serial_number,
x509_notbefore(c.certificate) as not_before,
x509_notafter(c.certificate) as not_after,
c.issuer_ca_id,
ca.name as issuer_name,
x509_commonname(c.certificate) as common_name,
identities(c.certificate)::text as name_value
FROM certificate c
LEFT JOIN ca ON c.issuer_ca_id = ca.id
WHERE identities(c.certificate) @@ plainto_tsquery(%s)
ORDER BY c.id DESC
LIMIT 5000;
"""
cursor.execute(query, (domain,))
results = []
columns = [desc[0] for desc in cursor.description]
for row in cursor.fetchall():
row_dict = dict(zip(columns, row))
if row_dict.get('not_before'):
row_dict['not_before'] = row_dict['not_before'].isoformat()
if row_dict.get('not_after'):
row_dict['not_after'] = row_dict['not_after'].isoformat()
results.append(row_dict)
self.logger.logger.info(f"crt.sh: DB query for {domain} returned {len(results)} records.")
return results
finally:
db_pool.putconn(conn)
def _query_crtsh_api(self, domain: str) -> List[Dict[str, Any]]: def _query_crtsh_api(self, domain: str) -> List[Dict[str, Any]]:
"""Query crt.sh API for raw certificate data.""" """Query crt.sh API for raw certificate data."""
url = f"{self.base_url}?q={quote(domain)}&output=json" url = f"{self.base_url}?q={quote(domain)}&output=json"
@ -351,8 +273,7 @@ class CrtShProvider(BaseProvider):
def _process_certificates_to_result_fixed(self, query_domain: str, certificates: List[Dict[str, Any]]) -> ProviderResult: def _process_certificates_to_result_fixed(self, query_domain: str, certificates: List[Dict[str, Any]]) -> ProviderResult:
""" """
FIXED: Process certificates to create proper domain and CA nodes. Process certificates to create proper domain and CA nodes.
Now creates individual domain nodes instead of large entities.
""" """
result = ProviderResult() result = ProviderResult()
@ -375,6 +296,7 @@ class CrtShProvider(BaseProvider):
processed_issuers = set() processed_issuers = set()
for i, cert_data in enumerate(certificates): for i, cert_data in enumerate(certificates):
# Check for stop event inside the loop to make it responsive.
if i % 10 == 0 and self._stop_event and self._stop_event.is_set(): if i % 10 == 0 and self._stop_event and self._stop_event.is_set():
self.logger.logger.info(f"CrtSh processing cancelled at certificate {i} for domain: {query_domain}") self.logger.logger.info(f"CrtSh processing cancelled at certificate {i} for domain: {query_domain}")
break break
@ -383,10 +305,9 @@ class CrtShProvider(BaseProvider):
cert_domains = self._extract_domains_from_certificate(cert_data) cert_domains = self._extract_domains_from_certificate(cert_data)
all_discovered_domains.update(cert_domains) all_discovered_domains.update(cert_domains)
# FIXED: Create CA nodes for certificate issuers (not as domain metadata) # Create CA nodes for certificate issuers
issuer_name = self._parse_issuer_organization(cert_data.get('issuer_name', '')) issuer_name = self._parse_issuer_organization(cert_data.get('issuer_name', ''))
if issuer_name and issuer_name not in processed_issuers: if issuer_name and issuer_name not in processed_issuers:
# Create relationship from query domain to CA
result.add_relationship( result.add_relationship(
source_node=query_domain, source_node=query_domain,
target_node=issuer_name, target_node=issuer_name,
@ -403,7 +324,6 @@ class CrtShProvider(BaseProvider):
if not _is_valid_domain(cert_domain): if not _is_valid_domain(cert_domain):
continue continue
# Add certificate attributes to the domain
for key, value in cert_metadata.items(): for key, value in cert_metadata.items():
if value is not None: if value is not None:
result.add_attribute( result.add_attribute(
@ -416,12 +336,12 @@ class CrtShProvider(BaseProvider):
metadata={'certificate_id': cert_data.get('id')} metadata={'certificate_id': cert_data.get('id')}
) )
# Check for stop event before creating final relationships.
if self._stop_event and self._stop_event.is_set(): if self._stop_event and self._stop_event.is_set():
self.logger.logger.info(f"CrtSh query cancelled before relationship creation for domain: {query_domain}") self.logger.logger.info(f"CrtSh query cancelled before relationship creation for domain: {query_domain}")
return result return result
# FIXED: Create selective relationships to avoid large entities # Create selective relationships to avoid large entities
# Only create relationships to domains that are closely related
for discovered_domain in all_discovered_domains: for discovered_domain in all_discovered_domains:
if discovered_domain == query_domain: if discovered_domain == query_domain:
continue continue
@ -429,8 +349,6 @@ class CrtShProvider(BaseProvider):
if not _is_valid_domain(discovered_domain): if not _is_valid_domain(discovered_domain):
continue continue
# FIXED: Only create relationships for domains that share a meaningful connection
# This prevents creating too many relationships that trigger large entity creation
if self._should_create_relationship(query_domain, discovered_domain): if self._should_create_relationship(query_domain, discovered_domain):
confidence = self._calculate_domain_relationship_confidence( confidence = self._calculate_domain_relationship_confidence(
query_domain, discovered_domain, [], all_discovered_domains query_domain, discovered_domain, [], all_discovered_domains
@ -459,18 +377,14 @@ class CrtShProvider(BaseProvider):
def _should_create_relationship(self, source_domain: str, target_domain: str) -> bool: def _should_create_relationship(self, source_domain: str, target_domain: str) -> bool:
""" """
FIXED: Determine if a relationship should be created between two domains. Determine if a relationship should be created between two domains.
This helps avoid creating too many relationships that trigger large entity creation.
""" """
# Always create relationships for subdomains
if target_domain.endswith(f'.{source_domain}') or source_domain.endswith(f'.{target_domain}'): if target_domain.endswith(f'.{source_domain}') or source_domain.endswith(f'.{target_domain}'):
return True return True
# Create relationships for domains that share a common parent (up to 2 levels)
source_parts = source_domain.split('.') source_parts = source_domain.split('.')
target_parts = target_domain.split('.') target_parts = target_domain.split('.')
# Check if they share the same root domain (last 2 parts)
if len(source_parts) >= 2 and len(target_parts) >= 2: if len(source_parts) >= 2 and len(target_parts) >= 2:
source_root = '.'.join(source_parts[-2:]) source_root = '.'.join(source_parts[-2:])
target_root = '.'.join(target_parts[-2:]) target_root = '.'.join(target_parts[-2:])
@ -504,7 +418,6 @@ class CrtShProvider(BaseProvider):
metadata['is_currently_valid'] = self._is_cert_valid(cert_data) metadata['is_currently_valid'] = self._is_cert_valid(cert_data)
metadata['expires_soon'] = (not_after - datetime.now(timezone.utc)).days <= 30 metadata['expires_soon'] = (not_after - datetime.now(timezone.utc)).days <= 30
# Keep raw date format or convert to standard format
metadata['not_before'] = not_before.isoformat() metadata['not_before'] = not_before.isoformat()
metadata['not_after'] = not_after.isoformat() metadata['not_after'] = not_after.isoformat()
@ -586,14 +499,12 @@ class CrtShProvider(BaseProvider):
"""Extract all domains from certificate data.""" """Extract all domains from certificate data."""
domains = set() domains = set()
# Extract from common name
common_name = cert_data.get('common_name', '') common_name = cert_data.get('common_name', '')
if common_name: if common_name:
cleaned_cn = self._clean_domain_name(common_name) cleaned_cn = self._clean_domain_name(common_name)
if cleaned_cn: if cleaned_cn:
domains.update(cleaned_cn) domains.update(cleaned_cn)
# Extract from name_value field (contains SANs)
name_value = cert_data.get('name_value', '') name_value = cert_data.get('name_value', '')
if name_value: if name_value:
for line in name_value.split('\n'): for line in name_value.split('\n'):
@ -640,7 +551,6 @@ class CrtShProvider(BaseProvider):
"""Calculate confidence score for domain relationship based on various factors.""" """Calculate confidence score for domain relationship based on various factors."""
base_confidence = 0.9 base_confidence = 0.9
# Adjust confidence based on domain relationship context
relationship_context = self._determine_relationship_context(domain2, domain1) relationship_context = self._determine_relationship_context(domain2, domain1)
if relationship_context == 'exact_match': if relationship_context == 'exact_match':
@ -672,12 +582,10 @@ class CrtShProvider(BaseProvider):
""" """
cert_count = len(certificates) cert_count = len(certificates)
# Heuristic 1: Check if the number of certs hits a known hard limit.
if cert_count >= 10000: if cert_count >= 10000:
return f"Result likely truncated; received {cert_count} certificates, which may be the maximum limit." return f"Result likely truncated; received {cert_count} certificates, which may be the maximum limit."
# Heuristic 2: Check if all returned certificates are old. if cert_count > 1000:
if cert_count > 1000: # Only apply this for a reasonable number of certs
latest_expiry = None latest_expiry = None
for cert in certificates: for cert in certificates:
try: try:

View File

@ -474,6 +474,7 @@ input[type="text"]:focus, select:focus {
flex-wrap: wrap; flex-wrap: wrap;
gap: 0.75rem; gap: 0.75rem;
align-items: center; align-items: center;
max-height: 3rem;
} }
.legend-item { .legend-item {

View File

@ -329,7 +329,7 @@ class DNSReconApp {
console.log(`Scan started for ${target} with depth ${maxDepth}`); console.log(`Scan started for ${target} with depth ${maxDepth}`);
// Start polling immediately with faster interval for responsiveness // Start polling immediately with faster interval for responsiveness
this.startPolling(1000); this.startPolling();
// Force an immediate status update // Force an immediate status update
console.log('Forcing immediate status update...'); console.log('Forcing immediate status update...');
@ -738,7 +738,7 @@ class DNSReconApp {
this.setUIState('scanning', task_queue_size); this.setUIState('scanning', task_queue_size);
this.showSuccess('Scan is running'); this.showSuccess('Scan is running');
// Increase polling frequency for active scans // Increase polling frequency for active scans
this.startPolling(1000); // Poll every 1 second for running scans this.startPolling(5000); // Poll every 5 seconds for running scans
this.updateConnectionStatus('active'); this.updateConnectionStatus('active');
break; break;
@ -2039,6 +2039,27 @@ class DNSReconApp {
setTimeout(() => this.updateGraph(), 500); setTimeout(() => this.updateGraph(), 500);
} }
// Immediately update the modal view
if (this.graphManager) {
const largeEntityNode = this.graphManager.nodes.get(largeEntityId);
if (largeEntityNode && largeEntityNode.attributes) {
// Find and update the 'nodes' attribute
const nodesAttribute = largeEntityNode.attributes.find(attr => attr.name === 'nodes');
if (nodesAttribute && Array.isArray(nodesAttribute.value)) {
nodesAttribute.value = nodesAttribute.value.filter(id => id !== nodeId);
}
// Find and update the 'count' attribute
const countAttribute = largeEntityNode.attributes.find(attr => attr.name === 'count');
if (countAttribute) {
countAttribute.value = (countAttribute.value || 0) - 1;
}
// Re-render the modal with the updated data
this.showNodeModal(largeEntityNode);
}
}
} else { } else {
throw new Error(response.error || 'Extraction failed on the server.'); throw new Error(response.error || 'Extraction failed on the server.');
} }