# dnsrecon/providers/crtsh_provider.py
import json
import re
import psycopg2
from pathlib import Path
from typing import List, Dict, Any, Set, Optional
from urllib.parse import quote
from datetime import datetime, timezone

import requests
from psycopg2 import pool

from .base_provider import BaseProvider
from core.provider_result import ProviderResult
from utils.helpers import _is_valid_domain
from core.logger import get_forensic_logger

# --- Global Instance for PostgreSQL Connection Pool ---
# This pool will be created once per worker process and is not part of the
# CrtShProvider instance, thus avoiding pickling errors.
db_pool = None
try:
    db_pool = psycopg2.pool.SimpleConnectionPool(
        1, 5,
        host='crt.sh',
        port=5432,
        user='guest',
        dbname='certwatch',
        sslmode='prefer',
        connect_timeout=60
    )
    # Use a generic logger here as this is at the module level
    get_forensic_logger().logger.info("crt.sh: Global PostgreSQL connection pool created successfully.")
except Exception as e:
    get_forensic_logger().logger.warning(f"crt.sh: Failed to create global DB connection pool: {e}. Will fall back to HTTP API.")


class CrtShProvider(BaseProvider):
    """
    Provider for querying the crt.sh certificate transparency database.
    FIXED: Now properly creates domain and CA nodes instead of large entities.
    Returns standardized ProviderResult objects with caching support.
    """

    def __init__(self, name=None, session_config=None):
        """Initialize CrtSh provider with session-specific configuration."""
        super().__init__(
            name="crtsh",
            rate_limit=60,
            timeout=15,
            session_config=session_config
        )
        self.base_url = "https://crt.sh/"
        self._stop_event = None

        # Initialize cache directory (separate from BaseProvider's HTTP cache)
        self.domain_cache_dir = Path('cache') / 'crtsh'
        self.domain_cache_dir.mkdir(parents=True, exist_ok=True)

        # Compile regex for date filtering for efficiency
        self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')

    def get_name(self) -> str:
        """Return the provider name."""
        return "crtsh"

    def get_display_name(self) -> str:
        """Return the provider display name for the UI."""
        return "crt.sh"

    def requires_api_key(self) -> bool:
        """Return True if the provider requires an API key."""
        return False

    def get_eligibility(self) -> Dict[str, bool]:
        """Return a dictionary indicating if the provider can query domains and/or IPs."""
        return {'domains': True, 'ips': False}

    def is_available(self) -> bool:
        """Check if the provider is configured to be used."""
        return True

    def _get_cache_file_path(self, domain: str) -> Path:
        """Generate cache file path for a domain."""
        safe_domain = domain.replace('.', '_').replace('/', '_').replace('\\', '_')
        return self.domain_cache_dir / f"{safe_domain}.json"
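    # Illustrative sketch of the on-disk cache layout consumed by the methods
    # below (field names are taken from _save_result_to_cache further down;
    # the values shown here are hypothetical):
    #
    #   {
    #     "domain": "example.com",
    #     "last_upstream_query": "2024-01-15T10:30:00+00:00",
    #     "raw_certificates": [ ...raw crt.sh records, kept for deduplication... ],
    #     "relationships": [ {"source_node": ..., "target_node": ...,
    #                         "relationship_type": ..., "confidence": ...,
    #                         "provider": ..., "raw_data": ...}, ... ],
    #     "attributes":    [ {"target_node": ..., "name": ..., "value": ...,
    #                         "type": ..., "provider": ..., "confidence": ...,
    #                         "metadata": ...}, ... ]
    #   }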
    def _get_cache_status(self, cache_file_path: Path) -> str:
        """
        Check cache status for a domain.

        Returns:
            'not_found', 'fresh', or 'stale'
        """
        if not cache_file_path.exists():
            return "not_found"

        try:
            with open(cache_file_path, 'r') as f:
                cache_data = json.load(f)

            last_query_str = cache_data.get("last_upstream_query")
            if not last_query_str:
                return "stale"

            last_query = datetime.fromisoformat(last_query_str.replace('Z', '+00:00'))
            hours_since_query = (datetime.now(timezone.utc) - last_query).total_seconds() / 3600

            cache_timeout = self.config.cache_timeout_hours
            if hours_since_query < cache_timeout:
                return "fresh"
            else:
                return "stale"
        except (json.JSONDecodeError, ValueError, KeyError) as e:
            self.logger.logger.warning(f"Invalid cache file format for {cache_file_path}: {e}")
            return "stale"

    def query_domain(self, domain: str) -> ProviderResult:
        """
        FIXED: Query crt.sh for certificates containing the domain.
        Now properly creates domain and CA nodes instead of large entities.

        Args:
            domain: Domain to investigate

        Returns:
            ProviderResult containing discovered relationships and attributes
        """
        if not _is_valid_domain(domain):
            return ProviderResult()

        if self._stop_event and self._stop_event.is_set():
            return ProviderResult()

        cache_file = self._get_cache_file_path(domain)
        cache_status = self._get_cache_status(cache_file)

        result = ProviderResult()

        try:
            if cache_status == "fresh":
                result = self._load_from_cache(cache_file)
                self.logger.logger.info(f"Using fresh cached crt.sh data for {domain}")
            else:  # "stale" or "not_found"
                # Query the API for the latest certificates
                new_raw_certs = self._query_crtsh(domain)

                if self._stop_event and self._stop_event.is_set():
                    return ProviderResult()

                # Combine with old data if the cache is stale
                if cache_status == "stale":
                    old_raw_certs = self._load_raw_data_from_cache(cache_file)
                    combined_certs = old_raw_certs + new_raw_certs

                    # Deduplicate the combined list by certificate id
                    seen_ids = set()
                    unique_certs = []
                    for cert in combined_certs:
                        cert_id = cert.get('id')
                        if cert_id not in seen_ids:
                            unique_certs.append(cert)
                            seen_ids.add(cert_id)

                    raw_certificates_to_process = unique_certs
                    self.logger.logger.info(f"Refreshed and merged cache for {domain}. Total unique certs: {len(raw_certificates_to_process)}")
                else:  # "not_found"
                    raw_certificates_to_process = new_raw_certs

                # FIXED: Process certificates to create proper domain and CA nodes
                result = self._process_certificates_to_result_fixed(domain, raw_certificates_to_process)
                self.logger.logger.info(f"Created fresh result for {domain} ({result.get_relationship_count()} relationships)")

                # Save the new result and the raw data to the cache
                self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain)
        except (requests.exceptions.RequestException, psycopg2.Error) as e:
            self.logger.logger.error(f"Upstream query failed for {domain}: {e}")
            if cache_status != "not_found":
                result = self._load_from_cache(cache_file)
                self.logger.logger.warning(f"Using stale cache for {domain} due to API failure.")
            else:
                raise  # Re-raise if there's no cache to fall back on

        return result
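    # Illustrative usage (hypothetical wiring -- the session_config object and
    # how the scanner instantiates providers depend on the rest of dnsrecon):
    #
    #   provider = CrtShProvider(session_config=session_config)
    #   result = provider.query_domain("example.com")
    #   print(result.get_relationship_count(), "relationships discovered")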
    def query_ip(self, ip: str) -> ProviderResult:
        """
        Query crt.sh for certificates containing the IP address.
        Note: crt.sh doesn't typically index by IP, so this returns empty results.

        Args:
            ip: IP address to investigate

        Returns:
            Empty ProviderResult (crt.sh doesn't support IP-based certificate queries effectively)
        """
        return ProviderResult()

    def _load_from_cache(self, cache_file_path: Path) -> ProviderResult:
        """Load processed crt.sh data from a cache file."""
        try:
            with open(cache_file_path, 'r') as f:
                cache_content = json.load(f)

            result = ProviderResult()

            # Reconstruct relationships
            for rel_data in cache_content.get("relationships", []):
                result.add_relationship(
                    source_node=rel_data["source_node"],
                    target_node=rel_data["target_node"],
                    relationship_type=rel_data["relationship_type"],
                    provider=rel_data["provider"],
                    confidence=rel_data["confidence"],
                    raw_data=rel_data.get("raw_data", {})
                )

            # Reconstruct attributes
            for attr_data in cache_content.get("attributes", []):
                result.add_attribute(
                    target_node=attr_data["target_node"],
                    name=attr_data["name"],
                    value=attr_data["value"],
                    attr_type=attr_data["type"],
                    provider=attr_data["provider"],
                    confidence=attr_data["confidence"],
                    metadata=attr_data.get("metadata", {})
                )

            return result
        except (json.JSONDecodeError, FileNotFoundError, KeyError) as e:
            self.logger.logger.error(f"Failed to load cached certificates from {cache_file_path}: {e}")
            return ProviderResult()

    def _load_raw_data_from_cache(self, cache_file_path: Path) -> List[Dict[str, Any]]:
        """Load only the raw certificate data from a cache file."""
        try:
            with open(cache_file_path, 'r') as f:
                cache_content = json.load(f)
            return cache_content.get("raw_certificates", [])
        except (json.JSONDecodeError, FileNotFoundError):
            return []

    def _save_result_to_cache(self, cache_file_path: Path, result: ProviderResult,
                              raw_certificates: List[Dict[str, Any]], domain: str) -> None:
        """Save processed crt.sh result and raw data to a cache file."""
        try:
            cache_data = {
                "domain": domain,
                "last_upstream_query": datetime.now(timezone.utc).isoformat(),
                "raw_certificates": raw_certificates,  # Store the raw data for deduplication
                "relationships": [
                    {
                        "source_node": rel.source_node,
                        "target_node": rel.target_node,
                        "relationship_type": rel.relationship_type,
                        "confidence": rel.confidence,
                        "provider": rel.provider,
                        "raw_data": rel.raw_data
                    } for rel in result.relationships
                ],
                "attributes": [
                    {
                        "target_node": attr.target_node,
                        "name": attr.name,
                        "value": attr.value,
                        "type": attr.type,
                        "provider": attr.provider,
                        "confidence": attr.confidence,
                        "metadata": attr.metadata
                    } for attr in result.attributes
                ]
            }
            cache_file_path.parent.mkdir(parents=True, exist_ok=True)
            with open(cache_file_path, 'w') as f:
                json.dump(cache_data, f, separators=(',', ':'), default=str)
        except Exception as e:
            self.logger.logger.warning(f"Failed to save cache file for {domain}: {e}")

    def _query_crtsh(self, domain: str) -> List[Dict[str, Any]]:
        """Query crt.sh, trying the database first and falling back to the HTTP API."""
        global db_pool
        if db_pool:
            try:
                self.logger.logger.info(f"crt.sh: Attempting DB query for {domain}")
                return self._query_crtsh_db(domain)
            except psycopg2.Error as e:
                self.logger.logger.warning(f"crt.sh: DB query failed for {domain}: {e}. Falling back to HTTP API.")
                return self._query_crtsh_api(domain)
        else:
            self.logger.logger.info(f"crt.sh: No DB connection pool. Using HTTP API for {domain}")
            return self._query_crtsh_api(domain)
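    # Both backends below return a list of plain dicts. Sketch of one record
    # (key names follow the SQL aliases and crt.sh JSON fields used in this
    # file; the values are hypothetical, and entry_timestamp only appears on
    # the HTTP API path since the DB query does not select it):
    #
    #   {
    #     "id": 1234567890,
    #     "serial_number": "04ab...",
    #     "not_before": "2024-01-15T10:30:00",
    #     "not_after": "2024-04-14T10:30:00",
    #     "issuer_ca_id": 183267,
    #     "issuer_name": "C=US, O=Let's Encrypt, CN=R3",
    #     "common_name": "example.com",
    #     "name_value": "example.com\nwww.example.com"
    #   }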
Using HTTP API for {domain}") return self._query_crtsh_api(domain) def _query_crtsh_db(self, domain: str) -> List[Dict[str, Any]]: """Query crt.sh database for raw certificate data.""" global db_pool conn = db_pool.getconn() try: with conn.cursor() as cursor: query = """ SELECT c.id, x509_serialnumber(c.certificate) as serial_number, x509_notbefore(c.certificate) as not_before, x509_notafter(c.certificate) as not_after, c.issuer_ca_id, ca.name as issuer_name, x509_commonname(c.certificate) as common_name, identities(c.certificate)::text as name_value FROM certificate c LEFT JOIN ca ON c.issuer_ca_id = ca.id WHERE identities(c.certificate) @@ plainto_tsquery(%s) ORDER BY c.id DESC LIMIT 5000; """ cursor.execute(query, (domain,)) results = [] columns = [desc[0] for desc in cursor.description] for row in cursor.fetchall(): row_dict = dict(zip(columns, row)) if row_dict.get('not_before'): row_dict['not_before'] = row_dict['not_before'].isoformat() if row_dict.get('not_after'): row_dict['not_after'] = row_dict['not_after'].isoformat() results.append(row_dict) self.logger.logger.info(f"crt.sh: DB query for {domain} returned {len(results)} records.") return results finally: db_pool.putconn(conn) def _query_crtsh_api(self, domain: str) -> List[Dict[str, Any]]: """Query crt.sh API for raw certificate data.""" url = f"{self.base_url}?q={quote(domain)}&output=json" response = self.make_request(url, target_indicator=domain) if not response or response.status_code != 200: raise requests.exceptions.RequestException(f"crt.sh API returned status {response.status_code if response else 'None'}") try: certificates = response.json() except json.JSONDecodeError: self.logger.logger.error(f"crt.sh returned invalid JSON for {domain}") return [] if not certificates: return [] return certificates def _process_certificates_to_result_fixed(self, query_domain: str, certificates: List[Dict[str, Any]]) -> ProviderResult: """ FIXED: Process certificates to create proper domain and CA nodes. Now creates individual domain nodes instead of large entities. 
""" result = ProviderResult() if self._stop_event and self._stop_event.is_set(): self.logger.logger.info(f"CrtSh processing cancelled before processing for domain: {query_domain}") return result incompleteness_warning = self._check_for_incomplete_data(query_domain, certificates) if incompleteness_warning: result.add_attribute( target_node=query_domain, name="crtsh_data_warning", value=incompleteness_warning, attr_type='metadata', provider=self.name, confidence=1.0 ) all_discovered_domains = set() processed_issuers = set() for i, cert_data in enumerate(certificates): if i % 10 == 0 and self._stop_event and self._stop_event.is_set(): self.logger.logger.info(f"CrtSh processing cancelled at certificate {i} for domain: {query_domain}") break # Extract all domains from this certificate cert_domains = self._extract_domains_from_certificate(cert_data) all_discovered_domains.update(cert_domains) # FIXED: Create CA nodes for certificate issuers (not as domain metadata) issuer_name = self._parse_issuer_organization(cert_data.get('issuer_name', '')) if issuer_name and issuer_name not in processed_issuers: # Create relationship from query domain to CA result.add_relationship( source_node=query_domain, target_node=issuer_name, relationship_type='crtsh_cert_issuer', provider=self.name, confidence=0.95, raw_data={'issuer_dn': cert_data.get('issuer_name', '')} ) processed_issuers.add(issuer_name) # Add certificate metadata to each domain in this certificate cert_metadata = self._extract_certificate_metadata(cert_data) for cert_domain in cert_domains: if not _is_valid_domain(cert_domain): continue # Add certificate attributes to the domain for key, value in cert_metadata.items(): if value is not None: result.add_attribute( target_node=cert_domain, name=f"cert_{key}", value=value, attr_type='certificate_data', provider=self.name, confidence=0.9, metadata={'certificate_id': cert_data.get('id')} ) if self._stop_event and self._stop_event.is_set(): self.logger.logger.info(f"CrtSh query cancelled before relationship creation for domain: {query_domain}") return result # FIXED: Create selective relationships to avoid large entities # Only create relationships to domains that are closely related for discovered_domain in all_discovered_domains: if discovered_domain == query_domain: continue if not _is_valid_domain(discovered_domain): continue # FIXED: Only create relationships for domains that share a meaningful connection # This prevents creating too many relationships that trigger large entity creation if self._should_create_relationship(query_domain, discovered_domain): confidence = self._calculate_domain_relationship_confidence( query_domain, discovered_domain, [], all_discovered_domains ) result.add_relationship( source_node=query_domain, target_node=discovered_domain, relationship_type='crtsh_san_certificate', provider=self.name, confidence=confidence, raw_data={'relationship_type': 'certificate_discovery'} ) self.log_relationship_discovery( source_node=query_domain, target_node=discovered_domain, relationship_type='crtsh_san_certificate', confidence_score=confidence, raw_data={'relationship_type': 'certificate_discovery'}, discovery_method="certificate_transparency_analysis" ) self.logger.logger.info(f"CrtSh processing completed for {query_domain}: {len(all_discovered_domains)} domains, {result.get_relationship_count()} relationships") return result def _should_create_relationship(self, source_domain: str, target_domain: str) -> bool: """ FIXED: Determine if a relationship should be created between two 
    def _should_create_relationship(self, source_domain: str, target_domain: str) -> bool:
        """
        FIXED: Determine if a relationship should be created between two domains.
        This helps avoid creating too many relationships that trigger large entity creation.
        """
        # Always create relationships for subdomains
        if target_domain.endswith(f'.{source_domain}') or source_domain.endswith(f'.{target_domain}'):
            return True

        # Create relationships for domains that share a common parent (up to 2 levels)
        source_parts = source_domain.split('.')
        target_parts = target_domain.split('.')

        # Check if they share the same root domain (last 2 parts)
        if len(source_parts) >= 2 and len(target_parts) >= 2:
            source_root = '.'.join(source_parts[-2:])
            target_root = '.'.join(target_parts[-2:])
            return source_root == target_root

        return False

    def _extract_certificate_metadata(self, cert_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract comprehensive metadata from certificate data."""
        raw_issuer_name = cert_data.get('issuer_name', '')
        parsed_issuer_name = self._parse_issuer_organization(raw_issuer_name)

        metadata = {
            'certificate_id': cert_data.get('id'),
            'serial_number': cert_data.get('serial_number'),
            'issuer_name': parsed_issuer_name,
            'issuer_ca_id': cert_data.get('issuer_ca_id'),
            'common_name': cert_data.get('common_name'),
            'not_before': cert_data.get('not_before'),
            'not_after': cert_data.get('not_after'),
            'entry_timestamp': cert_data.get('entry_timestamp'),
            'source': 'crtsh'
        }

        try:
            if metadata['not_before'] and metadata['not_after']:
                not_before = self._parse_certificate_date(metadata['not_before'])
                not_after = self._parse_certificate_date(metadata['not_after'])

                metadata['validity_period_days'] = (not_after - not_before).days
                metadata['is_currently_valid'] = self._is_cert_valid(cert_data)
                metadata['expires_soon'] = (not_after - datetime.now(timezone.utc)).days <= 30

                # Normalize both dates to ISO 8601
                metadata['not_before'] = not_before.isoformat()
                metadata['not_after'] = not_after.isoformat()
        except Exception as e:
            self.logger.logger.debug(f"Error computing certificate metadata: {e}")
            metadata['is_currently_valid'] = False
            metadata['expires_soon'] = False

        return metadata

    def _parse_issuer_organization(self, issuer_dn: str) -> str:
        """Parse the issuer Distinguished Name to extract just the organization name."""
        if not issuer_dn:
            return issuer_dn

        try:
            # Split the DN on commas and look for the O= (organization) component
            components = [comp.strip() for comp in issuer_dn.split(',')]
            for component in components:
                if component.startswith('O='):
                    org_name = component[2:].strip()
                    # Strip surrounding quotes if present
                    if org_name.startswith('"') and org_name.endswith('"'):
                        org_name = org_name[1:-1]
                    return org_name
            return issuer_dn
        except Exception as e:
            self.logger.logger.debug(f"Failed to parse issuer DN '{issuer_dn}': {e}")
            return issuer_dn

    def _parse_certificate_date(self, date_string: str) -> datetime:
        """Parse a certificate date from the formats crt.sh emits."""
        if not date_string:
            raise ValueError("Empty date string")

        try:
            if isinstance(date_string, datetime):
                return date_string.replace(tzinfo=timezone.utc)
            if date_string.endswith('Z'):
                return datetime.fromisoformat(date_string[:-1]).replace(tzinfo=timezone.utc)
            elif '+' in date_string or date_string.endswith('UTC'):
                date_string = date_string.replace('UTC', '').strip()
                if '+' in date_string:
                    date_string = date_string.split('+')[0]
                return datetime.fromisoformat(date_string).replace(tzinfo=timezone.utc)
            else:
                return datetime.fromisoformat(date_string).replace(tzinfo=timezone.utc)
        except Exception as e:
            try:
                return datetime.strptime(date_string[:19], "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
            except Exception:
                raise ValueError(f"Unable to parse date: {date_string}") from e
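    # Worked examples for the two parsers above (illustrative inputs):
    #   _parse_issuer_organization("C=US, O=Let's Encrypt, CN=R3") -> "Let's Encrypt"
    #   _parse_issuer_organization('O="Example CA, Inc.", C=US') -> '"Example CA'
    #       (a comma inside a quoted O= value defeats the naive comma split)
    #   _parse_certificate_date("2024-01-15T10:30:00Z")
    #       -> datetime(2024, 1, 15, 10, 30, tzinfo=timezone.utc)
    #   _parse_certificate_date("2024-01-15 10:30:00+00:00") strips the offset
    #       and assumes UTC.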
"""Check if a certificate is currently valid based on its expiry date.""" try: not_after_str = cert_data.get('not_after') if not not_after_str: return False not_after_date = self._parse_certificate_date(not_after_str) not_before_str = cert_data.get('not_before') now = datetime.now(timezone.utc) is_not_expired = not_after_date > now if not_before_str: not_before_date = self._parse_certificate_date(not_before_str) is_not_before_valid = not_before_date <= now return is_not_expired and is_not_before_valid return is_not_expired except Exception as e: return False def _extract_domains_from_certificate(self, cert_data: Dict[str, Any]) -> Set[str]: """Extract all domains from certificate data.""" domains = set() # Extract from common name common_name = cert_data.get('common_name', '') if common_name: cleaned_cn = self._clean_domain_name(common_name) if cleaned_cn: domains.update(cleaned_cn) # Extract from name_value field (contains SANs) name_value = cert_data.get('name_value', '') if name_value: for line in name_value.split('\n'): cleaned_domains = self._clean_domain_name(line.strip()) if cleaned_domains: domains.update(cleaned_domains) return domains def _clean_domain_name(self, domain_name: str) -> List[str]: """Clean and normalize domain name from certificate data.""" if not domain_name: return [] domain = domain_name.strip().lower() if domain.startswith(('http://', 'https://')): domain = domain.split('://', 1)[1] if '/' in domain: domain = domain.split('/', 1)[0] if ':' in domain and not domain.count(':') > 1: domain = domain.split(':', 1)[0] cleaned_domains = [] if domain.startswith('*.'): cleaned_domains.append(domain) cleaned_domains.append(domain[2:]) else: cleaned_domains.append(domain) final_domains = [] for d in cleaned_domains: d = re.sub(r'[^\w\-\.]', '', d) if d and not d.startswith(('.', '-')) and not d.endswith(('.', '-')): final_domains.append(d) return [d for d in final_domains if _is_valid_domain(d)] def _calculate_domain_relationship_confidence(self, domain1: str, domain2: str, shared_certificates: List[Dict[str, Any]], all_discovered_domains: Set[str]) -> float: """Calculate confidence score for domain relationship based on various factors.""" base_confidence = 0.9 # Adjust confidence based on domain relationship context relationship_context = self._determine_relationship_context(domain2, domain1) if relationship_context == 'exact_match': context_bonus = 0.0 elif relationship_context == 'subdomain': context_bonus = 0.1 elif relationship_context == 'parent_domain': context_bonus = 0.05 else: context_bonus = 0.0 final_confidence = base_confidence + context_bonus return max(0.1, min(1.0, final_confidence)) def _determine_relationship_context(self, cert_domain: str, query_domain: str) -> str: """Determine the context of the relationship between certificate domain and query domain.""" if cert_domain == query_domain: return 'exact_match' elif cert_domain.endswith(f'.{query_domain}'): return 'subdomain' elif query_domain.endswith(f'.{cert_domain}'): return 'parent_domain' else: return 'related_domain' def _check_for_incomplete_data(self, domain: str, certificates: List[Dict[str, Any]]) -> Optional[str]: """ Analyzes the certificate list to heuristically detect if the data from crt.sh is incomplete. """ cert_count = len(certificates) # Heuristic 1: Check if the number of certs hits a known hard limit. if cert_count >= 10000: return f"Result likely truncated; received {cert_count} certificates, which may be the maximum limit." # Heuristic 2: Check if all returned certificates are old. 
    def _check_for_incomplete_data(self, domain: str, certificates: List[Dict[str, Any]]) -> Optional[str]:
        """
        Analyzes the certificate list to heuristically detect if the data
        from crt.sh is incomplete.
        """
        cert_count = len(certificates)

        # Heuristic 1: Check if the number of certs hits a known hard limit.
        if cert_count >= 10000:
            return f"Result likely truncated; received {cert_count} certificates, which may be the maximum limit."

        # Heuristic 2: Check if all returned certificates are old.
        if cert_count > 1000:  # Only apply this for a reasonable number of certs
            latest_expiry = None
            for cert in certificates:
                try:
                    not_after = self._parse_certificate_date(cert.get('not_after'))
                    if latest_expiry is None or not_after > latest_expiry:
                        latest_expiry = not_after
                except (ValueError, TypeError):
                    continue

            if latest_expiry and (datetime.now(timezone.utc) - latest_expiry).days > 365:
                return f"Incomplete data suspected: The latest certificate expired more than a year ago ({latest_expiry.strftime('%Y-%m-%d')})."

        return None
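    # Worked outcomes for the heuristics above (illustrative):
    #   12,000 records                       -> truncation warning (>= 10,000 cap)
    #   2,500 records, newest not_after 2020 -> staleness warning (expired > 1 year)
    #   400 records                          -> None (below both thresholds)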