fixes for UX and the correlation engine: run the correlation phase only after a scan completes or is stopped, drop the crt.sh PostgreSQL path in favor of the HTTP API, slow status polling for running scans, and update large-entity modals in place after extraction

overcuriousity 2025-09-20 18:19:10 +02:00
parent 3ee23c9d05
commit bcd79ae2f5
5 changed files with 76 additions and 146 deletions

View File

@@ -605,11 +605,6 @@ class Scanner:
processing_key = (provider_name, target_item)
self.currently_processing.discard(processing_key)
# PHASE 2: Run correlations on all discovered nodes
if not self._is_stop_requested():
print(f"\n=== PHASE 2: Running correlation analysis ===")
self._run_correlation_phase(max_depth, processed_tasks)
except Exception as e:
traceback.print_exc()
self.status = ScanStatus.FAILED
@@ -633,6 +628,10 @@ class Scanner:
else:
self.status = ScanStatus.COMPLETED
if self.status in [ScanStatus.COMPLETED, ScanStatus.STOPPED]:
print(f"\n=== PHASE 2: Running correlation analysis ===")
self._run_correlation_phase(max_depth, processed_tasks)
self.status_logger_stop_event.set()
if self.status_logger_thread and self.status_logger_thread.is_alive():
self.status_logger_thread.join(timeout=2.0)
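Taken together, the two Scanner hunks move the correlation pass out of the provider-processing try block and gate it on the final scan status, so it runs exactly once after a scan ends cleanly and never after a failure. A minimal standalone sketch of that control flow, with the worker loop and _run_correlation_phase reduced to stubs and ScanStatus.RUNNING assumed as the initial state (only COMPLETED, STOPPED, and FAILED appear in the hunks):

from enum import Enum
import traceback

class ScanStatus(Enum):
    RUNNING = "running"
    COMPLETED = "completed"
    STOPPED = "stopped"
    FAILED = "failed"

class MiniScanner:
    def __init__(self):
        self.status = ScanStatus.RUNNING
        self.stop_requested = False

    def _run_correlation_phase(self):
        print("=== PHASE 2: Running correlation analysis ===")

    def run(self):
        try:
            # PHASE 1: provider queries only; correlation no longer happens in here.
            for target in ("example.com",):
                if self.stop_requested:
                    break
                # ... dispatch providers and collect results for this target ...
        except Exception:
            traceback.print_exc()
            self.status = ScanStatus.FAILED
        else:
            self.status = ScanStatus.STOPPED if self.stop_requested else ScanStatus.COMPLETED

        # PHASE 2 runs only once the scan has settled, and is skipped on failure.
        if self.status in (ScanStatus.COMPLETED, ScanStatus.STOPPED):
            self._run_correlation_phase()

MiniScanner().run()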

View File

@@ -27,6 +27,7 @@ class CorrelationProvider(BaseProvider):
'cert_validity_period_days',
'cert_issuer_name',
'cert_entry_timestamp',
'cert_serial_number', # unique per certificate, not useful for correlation
'cert_not_before',
'cert_not_after',
'dns_ttl',
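This hunk only shows the list itself, which the serial-number note suggests is the set of attribute keys the correlation engine ignores; how CorrelationProvider consumes it is not part of the diff. A hypothetical sketch of the kind of filter such a set usually feeds, where EXCLUDED_KEYS, build_correlation_index, and the attribute dict layout are all invented for illustration:

from collections import defaultdict
from typing import Any, Dict, List, Tuple

# Invented stand-in for the provider's list; real keys include the timestamps,
# validity fields, serial number, and TTL shown in the hunk above.
EXCLUDED_KEYS = {
    'cert_entry_timestamp',
    'cert_serial_number',   # unique per certificate, so it can never link two nodes
    'cert_not_before',
    'cert_not_after',
    'dns_ttl',
}

def build_correlation_index(attributes: List[Dict[str, Any]]) -> Dict[Tuple[str, Any], List[str]]:
    """Group node IDs by shared (attribute, value) pairs, skipping excluded keys."""
    index = defaultdict(list)
    for attr in attributes:
        if attr['name'] in EXCLUDED_KEYS:
            continue
        index[(attr['name'], attr['value'])].append(attr['node_id'])
    # Only values observed on two or more nodes are correlation candidates.
    return {key: nodes for key, nodes in index.items() if len(nodes) > 1}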

View File

@@ -2,43 +2,22 @@
import json
import re
import psycopg2
from pathlib import Path
from typing import List, Dict, Any, Set, Optional
from urllib.parse import quote
from datetime import datetime, timezone
import requests
from psycopg2 import pool
from .base_provider import BaseProvider
from core.provider_result import ProviderResult
from utils.helpers import _is_valid_domain
from core.logger import get_forensic_logger
# --- Global Instance for PostgreSQL Connection Pool ---
# This pool will be created once per worker process and is not part of the
# CrtShProvider instance, thus avoiding pickling errors.
db_pool = None
try:
db_pool = psycopg2.pool.SimpleConnectionPool(
1, 5,
host='crt.sh',
port=5432,
user='guest',
dbname='certwatch',
sslmode='prefer',
connect_timeout=60
)
# Use a generic logger here as this is at the module level
get_forensic_logger().logger.info("crt.sh: Global PostgreSQL connection pool created successfully.")
except Exception as e:
get_forensic_logger().logger.warning(f"crt.sh: Failed to create global DB connection pool: {e}. Will fall back to HTTP API.")
class CrtShProvider(BaseProvider):
"""
Provider for querying crt.sh certificate transparency database.
FIXED: Now properly creates domain and CA nodes instead of large entities.
REMOVED: All PostgreSQL logic to rely exclusively on the HTTP API for stability.
Returns standardized ProviderResult objects with caching support.
"""
@@ -53,7 +32,7 @@ class CrtShProvider(BaseProvider):
self.base_url = "https://crt.sh/"
self._stop_event = None
# Initialize cache directory (separate from BaseProvider's HTTP cache)
# Initialize cache directory
self.domain_cache_dir = Path('cache') / 'crtsh'
self.domain_cache_dir.mkdir(parents=True, exist_ok=True)
@@ -116,8 +95,7 @@ class CrtShProvider(BaseProvider):
def query_domain(self, domain: str) -> ProviderResult:
"""
FIXED: Query crt.sh for certificates containing the domain.
Now properly creates domain and CA nodes instead of large entities.
Query crt.sh for certificates containing the domain via HTTP API.
Args:
domain: Domain to investigate
@@ -140,10 +118,10 @@ class CrtShProvider(BaseProvider):
if cache_status == "fresh":
result = self._load_from_cache(cache_file)
self.logger.logger.info(f"Using fresh cached crt.sh data for {domain}")
return result
else: # "stale" or "not_found"
# Query the API for the latest certificates
new_raw_certs = self._query_crtsh(domain)
# For "stale" or "not_found", we must query the API.
new_raw_certs = self._query_crtsh_api(domain)
if self._stop_event and self._stop_event.is_set():
return ProviderResult()
@@ -167,33 +145,29 @@ class CrtShProvider(BaseProvider):
else: # "not_found"
raw_certificates_to_process = new_raw_certs
# FIXED: Process certificates to create proper domain and CA nodes
result = self._process_certificates_to_result_fixed(domain, raw_certificates_to_process)
self.logger.logger.info(f"Created fresh result for {domain} ({result.get_relationship_count()} relationships)")
# Save the new result and the raw data to the cache
self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain)
except (requests.exceptions.RequestException, psycopg2.Error) as e:
except requests.exceptions.RequestException as e:
self.logger.logger.error(f"Upstream query failed for {domain}: {e}")
if cache_status != "not_found":
result = self._load_from_cache(cache_file)
self.logger.logger.warning(f"Using stale cache for {domain} due to API failure.")
else:
raise e # Re-raise if there's no cache to fall back on
# **BUG FIX:** Always re-raise the exception. This signals a failure to the
# scanner, allowing its retry logic to handle the transient error.
raise e
return result
def query_ip(self, ip: str) -> ProviderResult:
"""
Query crt.sh for certificates containing the IP address.
Note: crt.sh doesn't typically index by IP, so this returns empty results.
crt.sh does not support IP-based certificate queries effectively via its API.
Args:
ip: IP address to investigate
Returns:
Empty ProviderResult (crt.sh doesn't support IP-based certificate queries effectively)
Empty ProviderResult
"""
return ProviderResult()
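The error-handling change above makes query_domain re-raise upstream failures unconditionally, so the scanner's retry logic (not shown in this commit) sees the transient error instead of a silently degraded result. A self-contained sketch of that split under assumed names, where query_with_retry and the backoff parameters are illustrative rather than taken from the codebase:

import time
import requests

def query_crtsh_api(domain: str) -> list:
    """Provider side: let transient HTTP errors propagate instead of swallowing them."""
    resp = requests.get("https://crt.sh/", params={"q": domain, "output": "json"}, timeout=30)
    resp.raise_for_status()  # raises a requests.exceptions.RequestException subclass on 4xx/5xx
    return resp.json()

def query_with_retry(domain: str, attempts: int = 3, backoff: float = 2.0) -> list:
    """Scanner side: retry the transient failures the provider re-raised."""
    for attempt in range(1, attempts + 1):
        try:
            return query_crtsh_api(domain)
        except requests.exceptions.RequestException:
            if attempt == attempts:
                raise  # retries exhausted; surface the failure
            time.sleep(backoff * attempt)
    return []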
@@ -278,58 +252,6 @@ class CrtShProvider(BaseProvider):
except Exception as e:
self.logger.logger.warning(f"Failed to save cache file for {domain}: {e}")
def _query_crtsh(self, domain: str) -> List[Dict[str, Any]]:
"""Query crt.sh, trying the database first and falling back to the API."""
global db_pool
if db_pool:
try:
self.logger.logger.info(f"crt.sh: Attempting DB query for {domain}")
return self._query_crtsh_db(domain)
except psycopg2.Error as e:
self.logger.logger.warning(f"crt.sh: DB query failed for {domain}: {e}. Falling back to HTTP API.")
return self._query_crtsh_api(domain)
else:
self.logger.logger.info(f"crt.sh: No DB connection pool. Using HTTP API for {domain}")
return self._query_crtsh_api(domain)
def _query_crtsh_db(self, domain: str) -> List[Dict[str, Any]]:
"""Query crt.sh database for raw certificate data."""
global db_pool
conn = db_pool.getconn()
try:
with conn.cursor() as cursor:
query = """
SELECT
c.id,
x509_serialnumber(c.certificate) as serial_number,
x509_notbefore(c.certificate) as not_before,
x509_notafter(c.certificate) as not_after,
c.issuer_ca_id,
ca.name as issuer_name,
x509_commonname(c.certificate) as common_name,
identities(c.certificate)::text as name_value
FROM certificate c
LEFT JOIN ca ON c.issuer_ca_id = ca.id
WHERE identities(c.certificate) @@ plainto_tsquery(%s)
ORDER BY c.id DESC
LIMIT 5000;
"""
cursor.execute(query, (domain,))
results = []
columns = [desc[0] for desc in cursor.description]
for row in cursor.fetchall():
row_dict = dict(zip(columns, row))
if row_dict.get('not_before'):
row_dict['not_before'] = row_dict['not_before'].isoformat()
if row_dict.get('not_after'):
row_dict['not_after'] = row_dict['not_after'].isoformat()
results.append(row_dict)
self.logger.logger.info(f"crt.sh: DB query for {domain} returned {len(results)} records.")
return results
finally:
db_pool.putconn(conn)
def _query_crtsh_api(self, domain: str) -> List[Dict[str, Any]]:
"""Query crt.sh API for raw certificate data."""
url = f"{self.base_url}?q={quote(domain)}&output=json"
@@ -351,8 +273,7 @@ class CrtShProvider(BaseProvider):
def _process_certificates_to_result_fixed(self, query_domain: str, certificates: List[Dict[str, Any]]) -> ProviderResult:
"""
FIXED: Process certificates to create proper domain and CA nodes.
Now creates individual domain nodes instead of large entities.
Process certificates to create proper domain and CA nodes.
"""
result = ProviderResult()
@@ -375,6 +296,7 @@ class CrtShProvider(BaseProvider):
processed_issuers = set()
for i, cert_data in enumerate(certificates):
# Check for stop event inside the loop to make it responsive.
if i % 10 == 0 and self._stop_event and self._stop_event.is_set():
self.logger.logger.info(f"CrtSh processing cancelled at certificate {i} for domain: {query_domain}")
break
@@ -383,10 +305,9 @@ class CrtShProvider(BaseProvider):
cert_domains = self._extract_domains_from_certificate(cert_data)
all_discovered_domains.update(cert_domains)
# FIXED: Create CA nodes for certificate issuers (not as domain metadata)
# Create CA nodes for certificate issuers
issuer_name = self._parse_issuer_organization(cert_data.get('issuer_name', ''))
if issuer_name and issuer_name not in processed_issuers:
# Create relationship from query domain to CA
result.add_relationship(
source_node=query_domain,
target_node=issuer_name,
@@ -403,7 +324,6 @@ class CrtShProvider(BaseProvider):
if not _is_valid_domain(cert_domain):
continue
# Add certificate attributes to the domain
for key, value in cert_metadata.items():
if value is not None:
result.add_attribute(
@@ -416,12 +336,12 @@ class CrtShProvider(BaseProvider):
metadata={'certificate_id': cert_data.get('id')}
)
# Check for stop event before creating final relationships.
if self._stop_event and self._stop_event.is_set():
self.logger.logger.info(f"CrtSh query cancelled before relationship creation for domain: {query_domain}")
return result
# FIXED: Create selective relationships to avoid large entities
# Only create relationships to domains that are closely related
# Create selective relationships to avoid large entities
for discovered_domain in all_discovered_domains:
if discovered_domain == query_domain:
continue
@@ -429,8 +349,6 @@ class CrtShProvider(BaseProvider):
if not _is_valid_domain(discovered_domain):
continue
# FIXED: Only create relationships for domains that share a meaningful connection
# This prevents creating too many relationships that trigger large entity creation
if self._should_create_relationship(query_domain, discovered_domain):
confidence = self._calculate_domain_relationship_confidence(
query_domain, discovered_domain, [], all_discovered_domains
@@ -459,18 +377,14 @@ class CrtShProvider(BaseProvider):
def _should_create_relationship(self, source_domain: str, target_domain: str) -> bool:
"""
FIXED: Determine if a relationship should be created between two domains.
This helps avoid creating too many relationships that trigger large entity creation.
Determine if a relationship should be created between two domains.
"""
# Always create relationships for subdomains
if target_domain.endswith(f'.{source_domain}') or source_domain.endswith(f'.{target_domain}'):
return True
# Create relationships for domains that share a common parent (up to 2 levels)
source_parts = source_domain.split('.')
target_parts = target_domain.split('.')
# Check if they share the same root domain (last 2 parts)
if len(source_parts) >= 2 and len(target_parts) >= 2:
source_root = '.'.join(source_parts[-2:])
target_root = '.'.join(target_parts[-2:])
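The hunk above reduces relationship creation to two checks: always link direct subdomain pairs, and otherwise only link domains that share the same two-label root. A standalone sketch of that heuristic (a simplified copy for illustration, not the provider method itself, and ignoring multi-label public suffixes such as co.uk):

def should_create_relationship(source_domain: str, target_domain: str) -> bool:
    # Always link subdomain relationships in either direction.
    if (target_domain.endswith(f'.{source_domain}')
            or source_domain.endswith(f'.{target_domain}')):
        return True
    # Otherwise require the same two-label root (e.g. example.com).
    source_parts = source_domain.split('.')
    target_parts = target_domain.split('.')
    if len(source_parts) >= 2 and len(target_parts) >= 2:
        return '.'.join(source_parts[-2:]) == '.'.join(target_parts[-2:])
    return False

print(should_create_relationship('example.com', 'api.example.com'))      # True: subdomain
print(should_create_relationship('mail.example.com', 'www.example.com')) # True: shared root
print(should_create_relationship('example.com', 'example.org'))          # False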
@@ -504,7 +418,6 @@ class CrtShProvider(BaseProvider):
metadata['is_currently_valid'] = self._is_cert_valid(cert_data)
metadata['expires_soon'] = (not_after - datetime.now(timezone.utc)).days <= 30
# Keep raw date format or convert to standard format
metadata['not_before'] = not_before.isoformat()
metadata['not_after'] = not_after.isoformat()
@@ -586,14 +499,12 @@ class CrtShProvider(BaseProvider):
"""Extract all domains from certificate data."""
domains = set()
# Extract from common name
common_name = cert_data.get('common_name', '')
if common_name:
cleaned_cn = self._clean_domain_name(common_name)
if cleaned_cn:
domains.update(cleaned_cn)
# Extract from name_value field (contains SANs)
name_value = cert_data.get('name_value', '')
if name_value:
for line in name_value.split('\n'):
@@ -640,7 +551,6 @@ class CrtShProvider(BaseProvider):
"""Calculate confidence score for domain relationship based on various factors."""
base_confidence = 0.9
# Adjust confidence based on domain relationship context
relationship_context = self._determine_relationship_context(domain2, domain1)
if relationship_context == 'exact_match':
@@ -672,12 +582,10 @@ class CrtShProvider(BaseProvider):
"""
cert_count = len(certificates)
# Heuristic 1: Check if the number of certs hits a known hard limit.
if cert_count >= 10000:
return f"Result likely truncated; received {cert_count} certificates, which may be the maximum limit."
# Heuristic 2: Check if all returned certificates are old.
if cert_count > 1000: # Only apply this for a reasonable number of certs
if cert_count > 1000:
latest_expiry = None
for cert in certificates:
try:

View File

@@ -474,6 +474,7 @@ input[type="text"]:focus, select:focus {
flex-wrap: wrap;
gap: 0.75rem;
align-items: center;
max-height: 3rem;
}
.legend-item {

View File

@@ -329,7 +329,7 @@ class DNSReconApp {
console.log(`Scan started for ${target} with depth ${maxDepth}`);
// Start polling immediately with faster interval for responsiveness
this.startPolling(1000);
this.startPolling();
// Force an immediate status update
console.log('Forcing immediate status update...');
@@ -738,7 +738,7 @@ class DNSReconApp {
this.setUIState('scanning', task_queue_size);
this.showSuccess('Scan is running');
// Increase polling frequency for active scans
this.startPolling(1000); // Poll every 1 second for running scans
this.startPolling(5000); // Poll every 5 seconds for running scans
this.updateConnectionStatus('active');
break;
@@ -2039,6 +2039,27 @@ class DNSReconApp {
setTimeout(() => this.updateGraph(), 500);
}
// Immediately update the modal view
if (this.graphManager) {
const largeEntityNode = this.graphManager.nodes.get(largeEntityId);
if (largeEntityNode && largeEntityNode.attributes) {
// Find and update the 'nodes' attribute
const nodesAttribute = largeEntityNode.attributes.find(attr => attr.name === 'nodes');
if (nodesAttribute && Array.isArray(nodesAttribute.value)) {
nodesAttribute.value = nodesAttribute.value.filter(id => id !== nodeId);
}
// Find and update the 'count' attribute
const countAttribute = largeEntityNode.attributes.find(attr => attr.name === 'count');
if (countAttribute) {
countAttribute.value = (countAttribute.value || 0) - 1;
}
// Re-render the modal with the updated data
this.showNodeModal(largeEntityNode);
}
}
} else {
throw new Error(response.error || 'Extraction failed on the server.');
}