This commit is contained in:
overcuriousity 2025-09-19 14:28:37 +02:00
parent 7472e6f416
commit 8d402ab4b1
2 changed files with 82 additions and 5 deletions

View File

@ -2,15 +2,37 @@
import json import json
import re import re
import psycopg2
from pathlib import Path from pathlib import Path
from typing import List, Dict, Any, Set, Optional from typing import List, Dict, Any, Set, Optional
from urllib.parse import quote from urllib.parse import quote
from datetime import datetime, timezone from datetime import datetime, timezone
import requests import requests
from psycopg2 import pool
from .base_provider import BaseProvider from .base_provider import BaseProvider
from core.provider_result import ProviderResult from core.provider_result import ProviderResult
from utils.helpers import _is_valid_domain from utils.helpers import _is_valid_domain
from core.logger import get_forensic_logger
# --- Global Instance for PostgreSQL Connection Pool ---
# This pool will be created once per worker process and is not part of the
# CrtShProvider instance, thus avoiding pickling errors.
db_pool = None
try:
db_pool = psycopg2.pool.SimpleConnectionPool(
1, 5,
host='crt.sh',
port=5432,
user='guest',
dbname='certwatch',
sslmode='prefer',
connect_timeout=60
)
# Use a generic logger here as this is at the module level
get_forensic_logger().logger.info("crt.sh: Global PostgreSQL connection pool created successfully.")
except Exception as e:
get_forensic_logger().logger.warning(f"crt.sh: Failed to create global DB connection pool: {e}. Will fall back to HTTP API.")
class CrtShProvider(BaseProvider): class CrtShProvider(BaseProvider):
@ -121,7 +143,7 @@ class CrtShProvider(BaseProvider):
else: # "stale" or "not_found" else: # "stale" or "not_found"
# Query the API for the latest certificates # Query the API for the latest certificates
new_raw_certs = self._query_crtsh_api(domain) new_raw_certs = self._query_crtsh(domain)
if self._stop_event and self._stop_event.is_set(): if self._stop_event and self._stop_event.is_set():
return ProviderResult() return ProviderResult()
@ -152,8 +174,8 @@ class CrtShProvider(BaseProvider):
# Save the new result and the raw data to the cache # Save the new result and the raw data to the cache
self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain) self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain)
except requests.exceptions.RequestException as e: except (requests.exceptions.RequestException, psycopg2.Error) as e:
self.logger.logger.error(f"API query failed for {domain}: {e}") self.logger.logger.error(f"Upstream query failed for {domain}: {e}")
if cache_status != "not_found": if cache_status != "not_found":
result = self._load_from_cache(cache_file) result = self._load_from_cache(cache_file)
self.logger.logger.warning(f"Using stale cache for {domain} due to API failure.") self.logger.logger.warning(f"Using stale cache for {domain} due to API failure.")
@ -256,6 +278,58 @@ class CrtShProvider(BaseProvider):
except Exception as e: except Exception as e:
self.logger.logger.warning(f"Failed to save cache file for {domain}: {e}") self.logger.logger.warning(f"Failed to save cache file for {domain}: {e}")
def _query_crtsh(self, domain: str) -> List[Dict[str, Any]]:
"""Query crt.sh, trying the database first and falling back to the API."""
global db_pool
if db_pool:
try:
self.logger.logger.info(f"crt.sh: Attempting DB query for {domain}")
return self._query_crtsh_db(domain)
except psycopg2.Error as e:
self.logger.logger.warning(f"crt.sh: DB query failed for {domain}: {e}. Falling back to HTTP API.")
return self._query_crtsh_api(domain)
else:
self.logger.logger.info(f"crt.sh: No DB connection pool. Using HTTP API for {domain}")
return self._query_crtsh_api(domain)
def _query_crtsh_db(self, domain: str) -> List[Dict[str, Any]]:
"""Query crt.sh database for raw certificate data."""
global db_pool
conn = db_pool.getconn()
try:
with conn.cursor() as cursor:
query = """
SELECT
c.id,
x509_serialnumber(c.certificate) as serial_number,
x509_notbefore(c.certificate) as not_before,
x509_notafter(c.certificate) as not_after,
c.issuer_ca_id,
ca.name as issuer_name,
x509_commonname(c.certificate) as common_name,
identities(c.certificate)::text as name_value
FROM certificate c
LEFT JOIN ca ON c.issuer_ca_id = ca.id
WHERE identities(c.certificate) @@ plainto_tsquery(%s)
ORDER BY c.id DESC
LIMIT 5000;
"""
cursor.execute(query, (domain,))
results = []
columns = [desc[0] for desc in cursor.description]
for row in cursor.fetchall():
row_dict = dict(zip(columns, row))
if row_dict.get('not_before'):
row_dict['not_before'] = row_dict['not_before'].isoformat()
if row_dict.get('not_after'):
row_dict['not_after'] = row_dict['not_after'].isoformat()
results.append(row_dict)
self.logger.logger.info(f"crt.sh: DB query for {domain} returned {len(results)} records.")
return results
finally:
db_pool.putconn(conn)
def _query_crtsh_api(self, domain: str) -> List[Dict[str, Any]]: def _query_crtsh_api(self, domain: str) -> List[Dict[str, Any]]:
"""Query crt.sh API for raw certificate data.""" """Query crt.sh API for raw certificate data."""
url = f"{self.base_url}?q={quote(domain)}&output=json" url = f"{self.base_url}?q={quote(domain)}&output=json"
@ -468,6 +542,8 @@ class CrtShProvider(BaseProvider):
raise ValueError("Empty date string") raise ValueError("Empty date string")
try: try:
if isinstance(date_string, datetime):
return date_string.replace(tzinfo=timezone.utc)
if date_string.endswith('Z'): if date_string.endswith('Z'):
return datetime.fromisoformat(date_string[:-1]).replace(tzinfo=timezone.utc) return datetime.fromisoformat(date_string[:-1]).replace(tzinfo=timezone.utc)
elif '+' in date_string or date_string.endswith('UTC'): elif '+' in date_string or date_string.endswith('UTC'):

View File

@ -8,3 +8,4 @@ dnspython
gunicorn gunicorn
redis redis
python-dotenv python-dotenv
psycopg2-binary