# dnsrecon/providers/crtsh_provider.py
# 2025-09-16 15:36:29 +02:00
#
# 745 lines
# 30 KiB
# Python

# dnsrecon/providers/crtsh_provider.py
import json
import re
import os
from pathlib import Path
from typing import List, Dict, Any, Tuple, Set
from urllib.parse import quote
from datetime import datetime, timezone
import requests
from .base_provider import BaseProvider
from utils.helpers import _is_valid_domain
class CrtShProvider(BaseProvider):
"""
Provider for querying the crt.sh certificate transparency database.
Uses session-specific configuration and an on-disk cache that accumulates certificates across refreshes.
"""
def __init__(self, name=None, session_config=None):
    """
    Set up the crt.sh provider and make sure its cache directory exists.

    Args:
        name: Currently ignored; the provider always registers itself with
            the base class under the fixed name "crtsh".
        session_config: Session-specific configuration handed to the base
            class constructor.
    """
    super().__init__(
        name="crtsh",
        rate_limit=60,
        timeout=15,
        session_config=session_config,
    )

    self.base_url = "https://crt.sh/"
    # Cancellation handle; the query methods check it whenever it is set.
    self._stop_event = None

    # Cached responses live under cache/crtsh, relative to the working directory.
    cache_root = Path('cache')
    self.cache_dir = cache_root / 'crtsh'
    self.cache_dir.mkdir(parents=True, exist_ok=True)
def get_name(self) -> str:
    """Return the internal identifier of this provider ("crtsh")."""
    return "crtsh"
def get_display_name(self) -> str:
    """Return the human-readable provider name shown in the UI ("crt.sh")."""
    return "crt.sh"
def requires_api_key(self) -> bool:
    """Report whether an API key is needed; this provider always answers False."""
    return False
def get_eligibility(self) -> Dict[str, bool]:
    """Report which target kinds this provider can query: domains yes, IPs no."""
    return dict(domains=True, ips=False)
def is_available(self) -> bool:
    """
    Report whether the provider can be used.

    Always returns True. No network request is made here on purpose, so
    that checking availability cannot block application startup.
    """
    return True
def _get_cache_file_path(self, domain: str) -> Path:
"""Generate cache file path for a domain."""
# Sanitize domain for filename safety
safe_domain = domain.replace('.', '_').replace('/', '_').replace('\\', '_')
return self.cache_dir / f"{safe_domain}.json"
def _get_cache_status(self, cache_file_path: Path) -> str:
"""
Check cache status for a domain.
Returns: 'not_found', 'fresh', or 'stale'
"""
if not cache_file_path.exists():
return "not_found"
try:
with open(cache_file_path, 'r') as f:
cache_data = json.load(f)
last_query_str = cache_data.get("last_upstream_query")
if not last_query_str:
return "stale" # Invalid cache format
last_query = datetime.fromisoformat(last_query_str.replace('Z', '+00:00'))
hours_since_query = (datetime.now(timezone.utc) - last_query).total_seconds() / 3600
cache_timeout = self.config.cache_timeout_hours
if hours_since_query < cache_timeout:
return "fresh"
else:
return "stale"
except (json.JSONDecodeError, ValueError, KeyError) as e:
self.logger.logger.warning(f"Invalid cache file format for {cache_file_path}: {e}")
return "stale"
def _load_cached_certificates(self, cache_file_path: Path) -> List[Dict[str, Any]]:
"""Load certificates from cache file."""
try:
with open(cache_file_path, 'r') as f:
cache_data = json.load(f)
return cache_data.get('certificates', [])
except (json.JSONDecodeError, FileNotFoundError, KeyError) as e:
self.logger.logger.error(f"Failed to load cached certificates from {cache_file_path}: {e}")
return []
def _query_crtsh_api(self, domain: str) -> List[Dict[str, Any]]:
    """
    Fetch the raw certificate records for a domain from crt.sh.

    Args:
        domain: Domain name to search for.

    Returns:
        The certificate records decoded from the JSON response, or an empty
        list when the response holds no certificates.

    Raises:
        requests.exceptions.RequestException: If no response was obtained,
            the status code is not 200, or the body is not a JSON list.
            Failures are raised instead of swallowed so the caller can
            retry or fall back to cached data.
    """
    url = f"{self.base_url}?q={quote(domain)}&output=json"
    response = self.make_request(url, target_indicator=domain)

    # Read the status defensively instead of testing the response for
    # truthiness: a requests.Response is falsy for 4xx/5xx codes, which
    # previously made the message claim "None" instead of the real status.
    status = getattr(response, 'status_code', None)
    if status != 200:
        # This could be a temporary error - raise so the core can retry
        raise requests.exceptions.RequestException(f"crt.sh API returned status {status}")

    try:
        certificates = response.json()
    except ValueError as e:
        # A 200 response with a non-JSON body must surface as a request
        # failure, the only error type the caller handles.
        raise requests.exceptions.RequestException(
            f"crt.sh API returned an invalid JSON body for {domain}: {e}"
        ) from e

    if not certificates:
        return []
    if not isinstance(certificates, list):
        raise requests.exceptions.RequestException(
            f"crt.sh API returned an unexpected payload type for {domain}: {type(certificates).__name__}"
        )
    return certificates
def _parse_issuer_organization(self, issuer_dn: str) -> str:
"""
Parse the issuer Distinguished Name to extract just the organization name.
Args:
issuer_dn: Full issuer DN string (e.g., "C=US, O=Let's Encrypt, CN=R11")
Returns:
Organization name (e.g., "Let's Encrypt") or original string if parsing fails
"""
if not issuer_dn:
return issuer_dn
try:
# Split by comma and look for O= component
components = [comp.strip() for comp in issuer_dn.split(',')]
for component in components:
if component.startswith('O='):
# Extract the value after O=
org_name = component[2:].strip()
# Remove quotes if present
if org_name.startswith('"') and org_name.endswith('"'):
org_name = org_name[1:-1]
return org_name
# If no O= component found, return the original string
return issuer_dn
except Exception as e:
self.logger.logger.debug(f"Failed to parse issuer DN '{issuer_dn}': {e}")
return issuer_dn
def _parse_certificate_date(self, date_string: str) -> datetime:
"""
Parse certificate date from crt.sh format.
Args:
date_string: Date string from crt.sh API
Returns:
Parsed datetime object in UTC
"""
if not date_string:
raise ValueError("Empty date string")
try:
# Handle various possible formats from crt.sh
if date_string.endswith('Z'):
return datetime.fromisoformat(date_string[:-1]).replace(tzinfo=timezone.utc)
elif '+' in date_string or date_string.endswith('UTC'):
# Handle timezone-aware strings
date_string = date_string.replace('UTC', '').strip()
if '+' in date_string:
date_string = date_string.split('+')[0]
return datetime.fromisoformat(date_string).replace(tzinfo=timezone.utc)
else:
# Assume UTC if no timezone specified
return datetime.fromisoformat(date_string).replace(tzinfo=timezone.utc)
except Exception as e:
# Fallback: try parsing without timezone info and assume UTC
try:
return datetime.strptime(date_string[:19], "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
except Exception:
raise ValueError(f"Unable to parse date: {date_string}") from e
def _is_cert_valid(self, cert_data: Dict[str, Any]) -> bool:
"""
Check if a certificate is currently valid based on its expiry date.
Args:
cert_data: Certificate data from crt.sh
Returns:
True if certificate is currently valid (not expired)
"""
try:
not_after_str = cert_data.get('not_after')
if not not_after_str:
return False
not_after_date = self._parse_certificate_date(not_after_str)
not_before_str = cert_data.get('not_before')
now = datetime.now(timezone.utc)
# Check if certificate is within valid date range
is_not_expired = not_after_date > now
if not_before_str:
not_before_date = self._parse_certificate_date(not_before_str)
is_not_before_valid = not_before_date <= now
return is_not_expired and is_not_before_valid
return is_not_expired
except Exception as e:
self.logger.logger.debug(f"Certificate validity check failed: {e}")
return False
def _extract_certificate_metadata(self, cert_data: Dict[str, Any]) -> Dict[str, Any]:
"""
Extract comprehensive metadata from certificate data.
Args:
cert_data: Raw certificate data from crt.sh
Returns:
Comprehensive certificate metadata dictionary
"""
# Parse the issuer name to get just the organization
raw_issuer_name = cert_data.get('issuer_name', '')
parsed_issuer_name = self._parse_issuer_organization(raw_issuer_name)
metadata = {
'certificate_id': cert_data.get('id'),
'serial_number': cert_data.get('serial_number'),
'issuer_name': parsed_issuer_name, # Use parsed organization name
#'issuer_name_full': raw_issuer_name, # deliberately left out, because its not useful in most cases
'issuer_ca_id': cert_data.get('issuer_ca_id'),
'common_name': cert_data.get('common_name'),
'not_before': cert_data.get('not_before'),
'not_after': cert_data.get('not_after'),
'entry_timestamp': cert_data.get('entry_timestamp'),
'source': 'crt.sh'
}
try:
if metadata['not_before'] and metadata['not_after']:
not_before = self._parse_certificate_date(metadata['not_before'])
not_after = self._parse_certificate_date(metadata['not_after'])
metadata['validity_period_days'] = (not_after - not_before).days
metadata['is_currently_valid'] = self._is_cert_valid(cert_data)
metadata['expires_soon'] = (not_after - datetime.now(timezone.utc)).days <= 30
# Add human-readable dates
metadata['not_before'] = not_before.strftime('%Y-%m-%d %H:%M:%S UTC')
metadata['not_after'] = not_after.strftime('%Y-%m-%d %H:%M:%S UTC')
except Exception as e:
self.logger.logger.debug(f"Error computing certificate metadata: {e}")
metadata['is_currently_valid'] = False
metadata['expires_soon'] = False
return metadata
def query_domain(self, domain: str) -> List[Tuple[str, str, str, float, Dict[str, Any]]]:
    """
    Look up certificates for a domain and turn them into relationships.

    A fresh cache entry is used as-is. Otherwise crt.sh is queried and the
    result is merged into a stale cache entry or written to a new one. If
    the query fails, existing cached data is used instead.

    Args:
        domain: Domain name to investigate.

    Returns:
        List of relationship tuples; empty for an invalid domain, a
        cancelled query, or when no certificates are available.

    Raises:
        requests.exceptions.RequestException: If the API query fails and no
            cache file exists to fall back on, so the caller can retry.
    """
    def cancelled() -> bool:
        return bool(self._stop_event and self._stop_event.is_set())

    if not _is_valid_domain(domain):
        return []
    if cancelled():
        return []

    cache_file = self._get_cache_file_path(domain)
    cache_status = self._get_cache_status(cache_file)
    certificates = []

    try:
        if cache_status == "fresh":
            certificates = self._load_cached_certificates(cache_file)
            self.logger.logger.info(f"Using cached processed data for {domain} ({len(certificates)} certificates)")
        else:
            # Cache is "stale" or "not_found": ask upstream.
            raw_records = self._query_crtsh_api(domain)
            if cancelled():
                return []

            # Convert the raw records into the application's metadata format.
            fresh_records = list(map(self._extract_certificate_metadata, raw_records))

            if cache_status == "stale":
                # Merge the new records into what is already cached.
                certificates = self._append_to_cache(cache_file, fresh_records)
                self.logger.logger.info(f"Refreshed and appended cache for {domain}")
            else:
                # No cache yet: create one, even if there are no records.
                self._create_cache_file(cache_file, domain, fresh_records)
                certificates = fresh_records
                self.logger.logger.info(f"Cached fresh data for {domain} ({len(certificates)} certificates)")
    except requests.exceptions.RequestException as e:
        self.logger.logger.error(f"API query failed for {domain}: {e}")
        if cache_status == "not_found":
            raise  # Nothing cached to fall back on
        certificates = self._load_cached_certificates(cache_file)
        self.logger.logger.warning(f"Using stale cache for {domain} due to API failure.")

    if certificates:
        return self._process_certificates_to_relationships(domain, certificates)
    return []
def _create_cache_file(self, cache_file_path: Path, domain: str, processed_certificates: List[Dict[str, Any]]) -> None:
"""Create new cache file with processed certificates."""
try:
cache_data = {
"domain": domain,
"last_upstream_query": datetime.now(timezone.utc).isoformat(),
"certificates": processed_certificates # Store processed data
}
cache_file_path.parent.mkdir(parents=True, exist_ok=True)
with open(cache_file_path, 'w') as f:
json.dump(cache_data, f, separators=(',', ':'))
except Exception as e:
self.logger.logger.warning(f"Failed to create cache file for {domain}: {e}")
def _append_to_cache(self, cache_file_path: Path, new_processed_certificates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Append new processed certificates to existing cache and return all certificates."""
try:
with open(cache_file_path, 'r') as f:
cache_data = json.load(f)
existing_ids = {cert.get('certificate_id') for cert in cache_data.get('certificates', [])}
for cert in new_processed_certificates:
if cert.get('certificate_id') not in existing_ids:
cache_data['certificates'].append(cert)
cache_data['last_upstream_query'] = datetime.now(timezone.utc).isoformat()
with open(cache_file_path, 'w') as f:
json.dump(cache_data, f, separators=(',', ':'))
return cache_data['certificates']
except Exception as e:
self.logger.logger.warning(f"Failed to append to cache: {e}")
return new_processed_certificates
def _process_certificates_to_relationships(self, domain: str, certificates: List[Dict[str, Any]]) -> List[Tuple[str, str, str, float, Dict[str, Any]]]:
    """
    Turn certificate records into relationships from the query domain.

    Every valid domain named in the certificates, other than the query
    domain itself, yields one 'san_certificate' relationship whose raw data
    lists the certificates shared with the query domain plus a certificate
    summary for both domains. Discovered domains are processed in sorted
    order so the result is deterministic.

    Args:
        domain: The domain that was queried.
        certificates: Certificate records to analyse (as produced by
            ``_extract_certificate_metadata``).

    Returns:
        List of ``(source, target, relationship_type, confidence, raw_data)``
        tuples. Empty if the stop event was set before processing started
        or before relationship creation; possibly partial if it was set
        while relationships were being created.
    """
    log = self.logger.logger

    def cancelled() -> bool:
        return bool(self._stop_event and self._stop_event.is_set())

    relationships = []

    # Check for cancellation before processing
    if cancelled():
        log.info(f"CrtSh processing cancelled before processing for domain: {domain}")
        return []

    # Aggregate certificate data by domain
    domain_certificates = {}
    all_discovered_domains = set()

    for i, cert_data in enumerate(certificates):
        # Check for cancellation every 5 certificates for faster response
        if i % 5 == 0 and cancelled():
            log.info(f"CrtSh processing cancelled at certificate {i} for domain: {domain}")
            break

        # Re-extraction refreshes the validity flags, but the input records
        # are already processed and carry their id as 'certificate_id'.
        # Restore it if extraction did not pick it up, otherwise no
        # certificate could ever be recognised as shared.
        cert_metadata = self._extract_certificate_metadata(cert_data)
        if cert_metadata.get('certificate_id') is None:
            cert_metadata['certificate_id'] = cert_data.get('certificate_id')

        cert_domains = self._extract_domains_from_certificate(cert_data)
        all_discovered_domains.update(cert_domains)

        for cert_domain in cert_domains:
            if not _is_valid_domain(cert_domain):
                continue
            domain_certificates.setdefault(cert_domain, []).append(cert_metadata)

    # Final cancellation check before creating relationships
    if cancelled():
        log.info(f"CrtSh query cancelled before relationship creation for domain: {domain}")
        return []

    # The query domain's certificates do not change inside the loop.
    query_domain_certs = domain_certificates.get(domain, [])

    # Create relationships from query domain to ALL discovered domains
    for i, discovered_domain in enumerate(sorted(all_discovered_domains)):
        if discovered_domain == domain:
            continue  # Skip self-relationships

        # Check for cancellation every 10 relationships
        if i % 10 == 0 and cancelled():
            log.info(f"CrtSh relationship creation cancelled for domain: {domain}")
            break

        if not _is_valid_domain(discovered_domain):
            continue

        discovered_domain_certs = domain_certificates.get(discovered_domain, [])

        # Find shared certificates (for metadata purposes)
        shared_certificates = self._find_shared_certificates(query_domain_certs, discovered_domain_certs)

        confidence = self._calculate_domain_relationship_confidence(
            domain, discovered_domain, shared_certificates, all_discovered_domains
        )

        relationship_raw_data = {
            'relationship_type': 'certificate_discovery',
            'shared_certificates': shared_certificates,
            'total_shared_certs': len(shared_certificates),
            'discovery_context': self._determine_relationship_context(discovered_domain, domain),
            'domain_certificates': {
                domain: self._summarize_certificates(query_domain_certs),
                discovered_domain: self._summarize_certificates(discovered_domain_certs)
            }
        }

        # Create domain -> domain relationship
        relationships.append((
            domain,
            discovered_domain,
            'san_certificate',
            confidence,
            relationship_raw_data
        ))

        # Log the relationship discovery
        self.log_relationship_discovery(
            source_node=domain,
            target_node=discovered_domain,
            relationship_type='san_certificate',
            confidence_score=confidence,
            raw_data=relationship_raw_data,
            discovery_method="certificate_transparency_analysis"
        )

    return relationships
def _find_shared_certificates(self, certs1: List[Dict[str, Any]], certs2: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Find certificates that are shared between two domain certificate lists.
Args:
certs1: First domain's certificates
certs2: Second domain's certificates
Returns:
List of shared certificate metadata
"""
shared = []
# Create a set of certificate IDs from the first list for quick lookup
cert1_ids = set()
for cert in certs1:
cert_id = cert.get('certificate_id')
# Ensure the ID is not None and is a hashable type before adding to the set
if cert_id and isinstance(cert_id, (int, str, float, bool, tuple)):
cert1_ids.add(cert_id)
# Find certificates in the second list that match
for cert in certs2:
cert_id = cert.get('certificate_id')
if cert_id and isinstance(cert_id, (int, str, float, bool, tuple)):
if cert_id in cert1_ids:
shared.append(cert)
return shared
def _summarize_certificates(self, certificates: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Create a summary of certificates for a domain.
Args:
certificates: List of certificate metadata
Returns:
Summary dictionary with aggregate statistics
"""
if not certificates:
return {
'total_certificates': 0,
'valid_certificates': 0,
'expired_certificates': 0,
'expires_soon_count': 0,
'unique_issuers': [],
'latest_certificate': None,
'has_valid_cert': False,
'certificate_details': [] # Always include empty list
}
valid_count = sum(1 for cert in certificates if cert.get('is_currently_valid'))
expired_count = len(certificates) - valid_count
expires_soon_count = sum(1 for cert in certificates if cert.get('expires_soon'))
# Get unique issuers (using parsed organization names)
unique_issuers = list(set(cert.get('issuer_name') for cert in certificates if cert.get('issuer_name')))
# Find the most recent certificate
latest_cert = None
latest_date = None
for cert in certificates:
try:
if cert.get('not_before'):
cert_date = self._parse_certificate_date(cert['not_before'])
if latest_date is None or cert_date > latest_date:
latest_date = cert_date
latest_cert = cert
except Exception:
continue
# Sort certificates by date for better display (newest first)
sorted_certificates = sorted(
certificates,
key=lambda c: self._get_certificate_sort_date(c),
reverse=True
)
return {
'total_certificates': len(certificates),
'valid_certificates': valid_count,
'expired_certificates': expired_count,
'expires_soon_count': expires_soon_count,
'unique_issuers': unique_issuers,
'latest_certificate': latest_cert,
'has_valid_cert': valid_count > 0,
'certificate_details': sorted_certificates # Include full certificate details
}
def _get_certificate_sort_date(self, cert: Dict[str, Any]) -> datetime:
"""
Get a sortable date from certificate data for chronological ordering.
Args:
cert: Certificate metadata dictionary
Returns:
Datetime object for sorting (falls back to epoch if parsing fails)
"""
try:
# Try not_before first (issue date)
if cert.get('not_before'):
return self._parse_certificate_date(cert['not_before'])
# Fall back to entry_timestamp if available
if cert.get('entry_timestamp'):
return self._parse_certificate_date(cert['entry_timestamp'])
# Last resort - return a very old date for certificates without dates
return datetime(1970, 1, 1, tzinfo=timezone.utc)
except Exception:
# If all parsing fails, return epoch
return datetime(1970, 1, 1, tzinfo=timezone.utc)
def _calculate_domain_relationship_confidence(self, domain1: str, domain2: str,
shared_certificates: List[Dict[str, Any]],
all_discovered_domains: Set[str]) -> float:
"""
Calculate confidence score for domain relationship based on various factors.
Args:
domain1: Source domain (query domain)
domain2: Target domain (discovered domain)
shared_certificates: List of shared certificate metadata
all_discovered_domains: All domains discovered in this query
Returns:
Confidence score between 0.0 and 1.0
"""
base_confidence = 0.9
# Adjust confidence based on domain relationship context
relationship_context = self._determine_relationship_context(domain2, domain1)
if relationship_context == 'exact_match':
context_bonus = 0.0 # This shouldn't happen, but just in case
elif relationship_context == 'subdomain':
context_bonus = 0.1 # High confidence for subdomains
elif relationship_context == 'parent_domain':
context_bonus = 0.05 # Medium confidence for parent domains
else:
context_bonus = 0.0 # Related domains get base confidence
# Adjust confidence based on shared certificates
if shared_certificates:
shared_count = len(shared_certificates)
if shared_count >= 3:
shared_bonus = 0.1
elif shared_count >= 2:
shared_bonus = 0.05
else:
shared_bonus = 0.02
# Additional bonus for valid shared certificates
valid_shared = sum(1 for cert in shared_certificates if cert.get('is_currently_valid'))
if valid_shared > 0:
validity_bonus = 0.05
else:
validity_bonus = 0.0
else:
# Even without shared certificates, domains found in the same query have some relationship
shared_bonus = 0.0
validity_bonus = 0.0
# Adjust confidence based on certificate issuer reputation (if shared certificates exist)
issuer_bonus = 0.0
if shared_certificates:
for cert in shared_certificates:
issuer = cert.get('issuer_name', '').lower()
if any(trusted_ca in issuer for trusted_ca in ['let\'s encrypt', 'digicert', 'sectigo', 'globalsign']):
issuer_bonus = max(issuer_bonus, 0.03)
break
# Calculate final confidence
final_confidence = base_confidence + context_bonus + shared_bonus + validity_bonus + issuer_bonus
return max(0.1, min(1.0, final_confidence)) # Clamp between 0.1 and 1.0
def _determine_relationship_context(self, cert_domain: str, query_domain: str) -> str:
"""
Determine the context of the relationship between certificate domain and query domain.
Args:
cert_domain: Domain found in certificate
query_domain: Original query domain
Returns:
String describing the relationship context
"""
if cert_domain == query_domain:
return 'exact_match'
elif cert_domain.endswith(f'.{query_domain}'):
return 'subdomain'
elif query_domain.endswith(f'.{cert_domain}'):
return 'parent_domain'
else:
return 'related_domain'
def query_ip(self, ip: str) -> List[Tuple[str, str, str, float, Dict[str, Any]]]:
    """
    Handle an IP lookup, which this provider does not support.

    Args:
        ip: IP address to investigate

    Returns:
        Always an empty list: crt.sh does not effectively support
        IP-based certificate queries, so no request is made.
    """
    no_results: List[Tuple[str, str, str, float, Dict[str, Any]]] = []
    return no_results
def _extract_domains_from_certificate(self, cert_data: Dict[str, Any]) -> Set[str]:
"""
Extract all domains from certificate data.
Args:
cert_data: Certificate data from crt.sh API
Returns:
Set of unique domain names found in the certificate
"""
domains = set()
# Extract from common name
common_name = cert_data.get('common_name', '')
if common_name:
cleaned_cn = self._clean_domain_name(common_name)
if cleaned_cn:
domains.update(cleaned_cn)
# Extract from name_value field (contains SANs)
name_value = cert_data.get('name_value', '')
if name_value:
# Split by newlines and clean each domain
for line in name_value.split('\n'):
cleaned_domains = self._clean_domain_name(line.strip())
if cleaned_domains:
domains.update(cleaned_domains)
return domains
def _clean_domain_name(self, domain_name: str) -> List[str]:
    """
    Normalize a name taken from certificate data into valid domain names.

    The name is lower-cased and stripped of protocol, path and port. For a
    wildcard name (``*.example.com``) the base domain is considered in
    addition to the wildcard form. Characters other than word characters,
    hyphens and dots are then removed, and only names accepted by
    ``_is_valid_domain`` are returned.

    Args:
        domain_name: Raw name from the certificate.

    Returns:
        List of cleaned, valid domain names (possibly empty).
    """
    if not domain_name:
        return []

    host = domain_name.strip().lower()

    # Drop a leading protocol, then anything from the first slash onwards.
    if host.startswith(('http://', 'https://')):
        host = host.split('://', 1)[1]
    host = host.split('/', 1)[0]

    # Exactly one colon means host:port; several colons may be IPv6, so
    # those are left alone.
    if host.count(':') == 1:
        host = host.split(':', 1)[0]

    # A wildcard contributes both its own form and the base domain.
    candidates = [host, host[2:]] if host.startswith('*.') else [host]

    survivors = []
    for candidate in candidates:
        stripped = re.sub(r'[^\w\-\.]', '', candidate)
        if not stripped:
            continue
        if stripped.startswith(('.', '-')) or stripped.endswith(('.', '-')):
            continue
        survivors.append(stripped)

    return [name for name in survivors if _is_valid_domain(name)]