|
|
|
|
@@ -2,43 +2,22 @@
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
import re
|
|
|
|
|
import psycopg2
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from typing import List, Dict, Any, Set, Optional
|
|
|
|
|
from urllib.parse import quote
|
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
|
import requests
|
|
|
|
|
from psycopg2 import pool
|
|
|
|
|
|
|
|
|
|
from .base_provider import BaseProvider
|
|
|
|
|
from core.provider_result import ProviderResult
|
|
|
|
|
from utils.helpers import _is_valid_domain
|
|
|
|
|
from core.logger import get_forensic_logger
|
|
|
|
|
|
|
|
|
|
# --- Global Instance for PostgreSQL Connection Pool ---
|
|
|
|
|
# This pool will be created once per worker process and is not part of the
|
|
|
|
|
# CrtShProvider instance, thus avoiding pickling errors.
|
|
|
|
|
db_pool = None
|
|
|
|
|
try:
|
|
|
|
|
db_pool = psycopg2.pool.SimpleConnectionPool(
|
|
|
|
|
1, 5,
|
|
|
|
|
host='crt.sh',
|
|
|
|
|
port=5432,
|
|
|
|
|
user='guest',
|
|
|
|
|
dbname='certwatch',
|
|
|
|
|
sslmode='prefer',
|
|
|
|
|
connect_timeout=60
|
|
|
|
|
)
|
|
|
|
|
# Use a generic logger here as this is at the module level
|
|
|
|
|
get_forensic_logger().logger.info("crt.sh: Global PostgreSQL connection pool created successfully.")
|
|
|
|
|
except Exception as e:
|
|
|
|
|
get_forensic_logger().logger.warning(f"crt.sh: Failed to create global DB connection pool: {e}. Will fall back to HTTP API.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class CrtShProvider(BaseProvider):
|
|
|
|
|
"""
|
|
|
|
|
Provider for querying crt.sh certificate transparency database.
|
|
|
|
|
FIXED: Now properly creates domain and CA nodes instead of large entities.
|
|
|
|
|
REMOVED: All PostgreSQL logic to rely exclusively on the HTTP API for stability.
|
|
|
|
|
Returns standardized ProviderResult objects with caching support.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
@@ -53,7 +32,7 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
self.base_url = "https://crt.sh/"
|
|
|
|
|
self._stop_event = None
|
|
|
|
|
|
|
|
|
|
# Initialize cache directory (separate from BaseProvider's HTTP cache)
|
|
|
|
|
# Initialize cache directory
|
|
|
|
|
self.domain_cache_dir = Path('cache') / 'crtsh'
|
|
|
|
|
self.domain_cache_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
@@ -116,8 +95,7 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
|
|
|
|
|
def query_domain(self, domain: str) -> ProviderResult:
|
|
|
|
|
"""
|
|
|
|
|
FIXED: Query crt.sh for certificates containing the domain.
|
|
|
|
|
Now properly creates domain and CA nodes instead of large entities.
|
|
|
|
|
Query crt.sh for certificates containing the domain via HTTP API.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
domain: Domain to investigate
|
|
|
|
|
@@ -140,60 +118,56 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
if cache_status == "fresh":
|
|
|
|
|
result = self._load_from_cache(cache_file)
|
|
|
|
|
self.logger.logger.info(f"Using fresh cached crt.sh data for {domain}")
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
else: # "stale" or "not_found"
|
|
|
|
|
# Query the API for the latest certificates
|
|
|
|
|
new_raw_certs = self._query_crtsh(domain)
|
|
|
|
|
# For "stale" or "not_found", we must query the API.
|
|
|
|
|
new_raw_certs = self._query_crtsh_api(domain)
|
|
|
|
|
|
|
|
|
|
if self._stop_event and self._stop_event.is_set():
|
|
|
|
|
return ProviderResult()
|
|
|
|
|
|
|
|
|
|
# Combine with old data if cache is stale
|
|
|
|
|
if cache_status == "stale":
|
|
|
|
|
old_raw_certs = self._load_raw_data_from_cache(cache_file)
|
|
|
|
|
combined_certs = old_raw_certs + new_raw_certs
|
|
|
|
|
|
|
|
|
|
if self._stop_event and self._stop_event.is_set():
|
|
|
|
|
return ProviderResult()
|
|
|
|
|
|
|
|
|
|
# Combine with old data if cache is stale
|
|
|
|
|
if cache_status == "stale":
|
|
|
|
|
old_raw_certs = self._load_raw_data_from_cache(cache_file)
|
|
|
|
|
combined_certs = old_raw_certs + new_raw_certs
|
|
|
|
|
|
|
|
|
|
# Deduplicate the combined list
|
|
|
|
|
seen_ids = set()
|
|
|
|
|
unique_certs = []
|
|
|
|
|
for cert in combined_certs:
|
|
|
|
|
cert_id = cert.get('id')
|
|
|
|
|
if cert_id not in seen_ids:
|
|
|
|
|
unique_certs.append(cert)
|
|
|
|
|
seen_ids.add(cert_id)
|
|
|
|
|
|
|
|
|
|
raw_certificates_to_process = unique_certs
|
|
|
|
|
self.logger.logger.info(f"Refreshed and merged cache for {domain}. Total unique certs: {len(raw_certificates_to_process)}")
|
|
|
|
|
else: # "not_found"
|
|
|
|
|
raw_certificates_to_process = new_raw_certs
|
|
|
|
|
# Deduplicate the combined list
|
|
|
|
|
seen_ids = set()
|
|
|
|
|
unique_certs = []
|
|
|
|
|
for cert in combined_certs:
|
|
|
|
|
cert_id = cert.get('id')
|
|
|
|
|
if cert_id not in seen_ids:
|
|
|
|
|
unique_certs.append(cert)
|
|
|
|
|
seen_ids.add(cert_id)
|
|
|
|
|
|
|
|
|
|
# FIXED: Process certificates to create proper domain and CA nodes
|
|
|
|
|
result = self._process_certificates_to_result_fixed(domain, raw_certificates_to_process)
|
|
|
|
|
self.logger.logger.info(f"Created fresh result for {domain} ({result.get_relationship_count()} relationships)")
|
|
|
|
|
raw_certificates_to_process = unique_certs
|
|
|
|
|
self.logger.logger.info(f"Refreshed and merged cache for {domain}. Total unique certs: {len(raw_certificates_to_process)}")
|
|
|
|
|
else: # "not_found"
|
|
|
|
|
raw_certificates_to_process = new_raw_certs
|
|
|
|
|
|
|
|
|
|
result = self._process_certificates_to_result_fixed(domain, raw_certificates_to_process)
|
|
|
|
|
self.logger.logger.info(f"Created fresh result for {domain} ({result.get_relationship_count()} relationships)")
|
|
|
|
|
|
|
|
|
|
# Save the new result and the raw data to the cache
|
|
|
|
|
self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain)
|
|
|
|
|
# Save the new result and the raw data to the cache
|
|
|
|
|
self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain)
|
|
|
|
|
|
|
|
|
|
except (requests.exceptions.RequestException, psycopg2.Error) as e:
|
|
|
|
|
except requests.exceptions.RequestException as e:
|
|
|
|
|
self.logger.logger.error(f"Upstream query failed for {domain}: {e}")
|
|
|
|
|
if cache_status != "not_found":
|
|
|
|
|
result = self._load_from_cache(cache_file)
|
|
|
|
|
self.logger.logger.warning(f"Using stale cache for {domain} due to API failure.")
|
|
|
|
|
else:
|
|
|
|
|
raise e # Re-raise if there's no cache to fall back on
|
|
|
|
|
# **BUG FIX:** Always re-raise the exception. This signals a failure to the
|
|
|
|
|
# scanner, allowing its retry logic to handle the transient error.
|
|
|
|
|
raise e
|
|
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
def query_ip(self, ip: str) -> ProviderResult:
|
|
|
|
|
"""
|
|
|
|
|
Query crt.sh for certificates containing the IP address.
|
|
|
|
|
Note: crt.sh doesn't typically index by IP, so this returns empty results.
|
|
|
|
|
crt.sh does not support IP-based certificate queries effectively via its API.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
ip: IP address to investigate
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
Empty ProviderResult (crt.sh doesn't support IP-based certificate queries effectively)
|
|
|
|
|
Empty ProviderResult
|
|
|
|
|
"""
|
|
|
|
|
return ProviderResult()
|
|
|
|
|
|
|
|
|
|
@@ -277,59 +251,7 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
json.dump(cache_data, f, separators=(',', ':'), default=str)
|
|
|
|
|
except Exception as e:
|
|
|
|
|
self.logger.logger.warning(f"Failed to save cache file for {domain}: {e}")
|
|
|
|
|
|
|
|
|
|
def _query_crtsh(self, domain: str) -> List[Dict[str, Any]]:
|
|
|
|
|
"""Query crt.sh, trying the database first and falling back to the API."""
|
|
|
|
|
global db_pool
|
|
|
|
|
if db_pool:
|
|
|
|
|
try:
|
|
|
|
|
self.logger.logger.info(f"crt.sh: Attempting DB query for {domain}")
|
|
|
|
|
return self._query_crtsh_db(domain)
|
|
|
|
|
except psycopg2.Error as e:
|
|
|
|
|
self.logger.logger.warning(f"crt.sh: DB query failed for {domain}: {e}. Falling back to HTTP API.")
|
|
|
|
|
return self._query_crtsh_api(domain)
|
|
|
|
|
else:
|
|
|
|
|
self.logger.logger.info(f"crt.sh: No DB connection pool. Using HTTP API for {domain}")
|
|
|
|
|
return self._query_crtsh_api(domain)
|
|
|
|
|
|
|
|
|
|
def _query_crtsh_db(self, domain: str) -> List[Dict[str, Any]]:
|
|
|
|
|
"""Query crt.sh database for raw certificate data."""
|
|
|
|
|
global db_pool
|
|
|
|
|
conn = db_pool.getconn()
|
|
|
|
|
try:
|
|
|
|
|
with conn.cursor() as cursor:
|
|
|
|
|
query = """
|
|
|
|
|
SELECT
|
|
|
|
|
c.id,
|
|
|
|
|
x509_serialnumber(c.certificate) as serial_number,
|
|
|
|
|
x509_notbefore(c.certificate) as not_before,
|
|
|
|
|
x509_notafter(c.certificate) as not_after,
|
|
|
|
|
c.issuer_ca_id,
|
|
|
|
|
ca.name as issuer_name,
|
|
|
|
|
x509_commonname(c.certificate) as common_name,
|
|
|
|
|
identities(c.certificate)::text as name_value
|
|
|
|
|
FROM certificate c
|
|
|
|
|
LEFT JOIN ca ON c.issuer_ca_id = ca.id
|
|
|
|
|
WHERE identities(c.certificate) @@ plainto_tsquery(%s)
|
|
|
|
|
ORDER BY c.id DESC
|
|
|
|
|
LIMIT 5000;
|
|
|
|
|
"""
|
|
|
|
|
cursor.execute(query, (domain,))
|
|
|
|
|
|
|
|
|
|
results = []
|
|
|
|
|
columns = [desc[0] for desc in cursor.description]
|
|
|
|
|
for row in cursor.fetchall():
|
|
|
|
|
row_dict = dict(zip(columns, row))
|
|
|
|
|
if row_dict.get('not_before'):
|
|
|
|
|
row_dict['not_before'] = row_dict['not_before'].isoformat()
|
|
|
|
|
if row_dict.get('not_after'):
|
|
|
|
|
row_dict['not_after'] = row_dict['not_after'].isoformat()
|
|
|
|
|
results.append(row_dict)
|
|
|
|
|
self.logger.logger.info(f"crt.sh: DB query for {domain} returned {len(results)} records.")
|
|
|
|
|
return results
|
|
|
|
|
finally:
|
|
|
|
|
db_pool.putconn(conn)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _query_crtsh_api(self, domain: str) -> List[Dict[str, Any]]:
|
|
|
|
|
"""Query crt.sh API for raw certificate data."""
|
|
|
|
|
url = f"{self.base_url}?q={quote(domain)}&output=json"
|
|
|
|
|
@@ -351,8 +273,7 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
|
|
|
|
|
def _process_certificates_to_result_fixed(self, query_domain: str, certificates: List[Dict[str, Any]]) -> ProviderResult:
|
|
|
|
|
"""
|
|
|
|
|
FIXED: Process certificates to create proper domain and CA nodes.
|
|
|
|
|
Now creates individual domain nodes instead of large entities.
|
|
|
|
|
Process certificates to create proper domain and CA nodes.
|
|
|
|
|
"""
|
|
|
|
|
result = ProviderResult()
|
|
|
|
|
|
|
|
|
|
@@ -375,6 +296,7 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
processed_issuers = set()
|
|
|
|
|
|
|
|
|
|
for i, cert_data in enumerate(certificates):
|
|
|
|
|
# Check for stop event inside the loop to make it responsive.
|
|
|
|
|
if i % 10 == 0 and self._stop_event and self._stop_event.is_set():
|
|
|
|
|
self.logger.logger.info(f"CrtSh processing cancelled at certificate {i} for domain: {query_domain}")
|
|
|
|
|
break
|
|
|
|
|
@@ -383,10 +305,9 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
cert_domains = self._extract_domains_from_certificate(cert_data)
|
|
|
|
|
all_discovered_domains.update(cert_domains)
|
|
|
|
|
|
|
|
|
|
# FIXED: Create CA nodes for certificate issuers (not as domain metadata)
|
|
|
|
|
# Create CA nodes for certificate issuers
|
|
|
|
|
issuer_name = self._parse_issuer_organization(cert_data.get('issuer_name', ''))
|
|
|
|
|
if issuer_name and issuer_name not in processed_issuers:
|
|
|
|
|
# Create relationship from query domain to CA
|
|
|
|
|
result.add_relationship(
|
|
|
|
|
source_node=query_domain,
|
|
|
|
|
target_node=issuer_name,
|
|
|
|
|
@@ -403,7 +324,6 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
if not _is_valid_domain(cert_domain):
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# Add certificate attributes to the domain
|
|
|
|
|
for key, value in cert_metadata.items():
|
|
|
|
|
if value is not None:
|
|
|
|
|
result.add_attribute(
|
|
|
|
|
@@ -415,13 +335,13 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
confidence=0.9,
|
|
|
|
|
metadata={'certificate_id': cert_data.get('id')}
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Check for stop event before creating final relationships.
|
|
|
|
|
if self._stop_event and self._stop_event.is_set():
|
|
|
|
|
self.logger.logger.info(f"CrtSh query cancelled before relationship creation for domain: {query_domain}")
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
# FIXED: Create selective relationships to avoid large entities
|
|
|
|
|
# Only create relationships to domains that are closely related
|
|
|
|
|
# Create selective relationships to avoid large entities
|
|
|
|
|
for discovered_domain in all_discovered_domains:
|
|
|
|
|
if discovered_domain == query_domain:
|
|
|
|
|
continue
|
|
|
|
|
@@ -429,8 +349,6 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
if not _is_valid_domain(discovered_domain):
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# FIXED: Only create relationships for domains that share a meaningful connection
|
|
|
|
|
# This prevents creating too many relationships that trigger large entity creation
|
|
|
|
|
if self._should_create_relationship(query_domain, discovered_domain):
|
|
|
|
|
confidence = self._calculate_domain_relationship_confidence(
|
|
|
|
|
query_domain, discovered_domain, [], all_discovered_domains
|
|
|
|
|
@@ -459,18 +377,14 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
|
|
|
|
|
def _should_create_relationship(self, source_domain: str, target_domain: str) -> bool:
|
|
|
|
|
"""
|
|
|
|
|
FIXED: Determine if a relationship should be created between two domains.
|
|
|
|
|
This helps avoid creating too many relationships that trigger large entity creation.
|
|
|
|
|
Determine if a relationship should be created between two domains.
|
|
|
|
|
"""
|
|
|
|
|
# Always create relationships for subdomains
|
|
|
|
|
if target_domain.endswith(f'.{source_domain}') or source_domain.endswith(f'.{target_domain}'):
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
# Create relationships for domains that share a common parent (up to 2 levels)
|
|
|
|
|
source_parts = source_domain.split('.')
|
|
|
|
|
target_parts = target_domain.split('.')
|
|
|
|
|
|
|
|
|
|
# Check if they share the same root domain (last 2 parts)
|
|
|
|
|
if len(source_parts) >= 2 and len(target_parts) >= 2:
|
|
|
|
|
source_root = '.'.join(source_parts[-2:])
|
|
|
|
|
target_root = '.'.join(target_parts[-2:])
|
|
|
|
|
@@ -504,7 +418,6 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
metadata['is_currently_valid'] = self._is_cert_valid(cert_data)
|
|
|
|
|
metadata['expires_soon'] = (not_after - datetime.now(timezone.utc)).days <= 30
|
|
|
|
|
|
|
|
|
|
# Keep raw date format or convert to standard format
|
|
|
|
|
metadata['not_before'] = not_before.isoformat()
|
|
|
|
|
metadata['not_after'] = not_after.isoformat()
|
|
|
|
|
|
|
|
|
|
@@ -586,14 +499,12 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
"""Extract all domains from certificate data."""
|
|
|
|
|
domains = set()
|
|
|
|
|
|
|
|
|
|
# Extract from common name
|
|
|
|
|
common_name = cert_data.get('common_name', '')
|
|
|
|
|
if common_name:
|
|
|
|
|
cleaned_cn = self._clean_domain_name(common_name)
|
|
|
|
|
if cleaned_cn:
|
|
|
|
|
domains.update(cleaned_cn)
|
|
|
|
|
|
|
|
|
|
# Extract from name_value field (contains SANs)
|
|
|
|
|
name_value = cert_data.get('name_value', '')
|
|
|
|
|
if name_value:
|
|
|
|
|
for line in name_value.split('\n'):
|
|
|
|
|
@@ -640,7 +551,6 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
"""Calculate confidence score for domain relationship based on various factors."""
|
|
|
|
|
base_confidence = 0.9
|
|
|
|
|
|
|
|
|
|
# Adjust confidence based on domain relationship context
|
|
|
|
|
relationship_context = self._determine_relationship_context(domain2, domain1)
|
|
|
|
|
|
|
|
|
|
if relationship_context == 'exact_match':
|
|
|
|
|
@@ -672,12 +582,10 @@ class CrtShProvider(BaseProvider):
|
|
|
|
|
"""
|
|
|
|
|
cert_count = len(certificates)
|
|
|
|
|
|
|
|
|
|
# Heuristic 1: Check if the number of certs hits a known hard limit.
|
|
|
|
|
if cert_count >= 10000:
|
|
|
|
|
return f"Result likely truncated; received {cert_count} certificates, which may be the maximum limit."
|
|
|
|
|
|
|
|
|
|
# Heuristic 2: Check if all returned certificates are old.
|
|
|
|
|
if cert_count > 1000: # Only apply this for a reasonable number of certs
|
|
|
|
|
if cert_count > 1000:
|
|
|
|
|
latest_expiry = None
|
|
|
|
|
for cert in certificates:
|
|
|
|
|
try:
|
|
|
|
|
|