Fixes to the hint for incomplete data

This commit is contained in:
overcuriousity
2025-09-19 12:35:28 +02:00
parent eabb532557
commit 7472e6f416
3 changed files with 105 additions and 11 deletions

View File

@@ -3,7 +3,7 @@
import json
import re
from pathlib import Path
from typing import List, Dict, Any, Set
from typing import List, Dict, Any, Set, Optional
from urllib.parse import quote
from datetime import datetime, timezone
import requests
@@ -285,6 +285,17 @@ class CrtShProvider(BaseProvider):
if self._stop_event and self._stop_event.is_set():
self.logger.logger.info(f"CrtSh processing cancelled before processing for domain: {query_domain}")
return result
incompleteness_warning = self._check_for_incomplete_data(query_domain, certificates)
if incompleteness_warning:
result.add_attribute(
target_node=query_domain,
name="crtsh_data_warning",
value=incompleteness_warning,
attr_type='metadata',
provider=self.name,
confidence=1.0
)
all_discovered_domains = set()
processed_issuers = set()
@@ -577,4 +588,30 @@ class CrtShProvider(BaseProvider):
elif query_domain.endswith(f'.{cert_domain}'):
return 'parent_domain'
else:
return 'related_domain'
return 'related_domain'
def _check_for_incomplete_data(self, domain: str, certificates: List[Dict[str, Any]]) -> Optional[str]:
"""
Analyzes the certificate list to heuristically detect if the data from crt.sh is incomplete.
"""
cert_count = len(certificates)
# Heuristic 1: Check if the number of certs hits a known hard limit.
if cert_count >= 10000:
return f"Result likely truncated; received {cert_count} certificates, which may be the maximum limit."
# Heuristic 2: Check if all returned certificates are old.
if cert_count > 1000: # Only apply this for a reasonable number of certs
latest_expiry = None
for cert in certificates:
try:
not_after = self._parse_certificate_date(cert.get('not_after'))
if latest_expiry is None or not_after > latest_expiry:
latest_expiry = not_after
except (ValueError, TypeError):
continue
if latest_expiry and (datetime.now(timezone.utc) - latest_expiry).days > 365:
return f"Incomplete data suspected: The latest certificate expired more than a year ago ({latest_expiry.strftime('%Y-%m-%d')})."
return None