# dnsrecon/providers/crtsh_provider.py
import json
import re
import psycopg2
from pathlib import Path
from typing import List, Dict, Any, Set, Optional
from urllib.parse import quote
from datetime import datetime, timezone

import requests
from psycopg2 import pool

from .base_provider import BaseProvider
from core.provider_result import ProviderResult
from utils.helpers import _is_valid_domain
from core.logger import get_forensic_logger

# --- Global Instance for PostgreSQL Connection Pool ---
# This pool will be created once per worker process and is not part of the
# CrtShProvider instance, thus avoiding pickling errors.
db_pool = None
try:
    db_pool = psycopg2.pool.SimpleConnectionPool(
        1, 5,
        host='crt.sh',
        port=5432,
        user='guest',
        dbname='certwatch',
        sslmode='prefer',
        connect_timeout=60
    )
    # Use a generic logger here as this is at the module level
    get_forensic_logger().logger.info("crt.sh: Global PostgreSQL connection pool created successfully.")
except Exception as e:
    get_forensic_logger().logger.warning(f"crt.sh: Failed to create global DB connection pool: {e}. Will fall back to HTTP API.")


class CrtShProvider(BaseProvider):
    """
    Provider for querying the crt.sh certificate transparency database.
    FIXED: Now properly creates domain and CA nodes instead of large entities.
    Returns standardized ProviderResult objects with caching support.
    """

    def __init__(self, name=None, session_config=None):
        """Initialize CrtSh provider with session-specific configuration."""
        super().__init__(
            name="crtsh",
            rate_limit=60,
            timeout=15,
            session_config=session_config
        )
        self.base_url = "https://crt.sh/"
        self._stop_event = None

        # Initialize cache directory (separate from BaseProvider's HTTP cache)
        self.domain_cache_dir = Path('cache') / 'crtsh'
        self.domain_cache_dir.mkdir(parents=True, exist_ok=True)

        # Compile regex for date filtering for efficiency
        self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')

    def get_name(self) -> str:
        """Return the provider name."""
        return "crtsh"

    def get_display_name(self) -> str:
        """Return the provider display name for the UI."""
        return "crt.sh"

    def requires_api_key(self) -> bool:
        """Return True if the provider requires an API key."""
        return False

    def get_eligibility(self) -> Dict[str, bool]:
        """Return a dictionary indicating if the provider can query domains and/or IPs."""
        return {'domains': True, 'ips': False}

    def is_available(self) -> bool:
        """Check if the provider is configured to be used."""
        return True

    def _get_cache_file_path(self, domain: str) -> Path:
        """Generate cache file path for a domain."""
        safe_domain = domain.replace('.', '_').replace('/', '_').replace('\\', '_')
        return self.domain_cache_dir / f"{safe_domain}.json"
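    # Illustrative sketch of the on-disk cache layout consumed by the methods
    # below (field names are taken from _save_result_to_cache further down;
    # the values shown here are hypothetical):
    #
    #   {
    #     "domain": "example.com",
    #     "last_upstream_query": "2024-01-15T10:30:00+00:00",
    #     "raw_certificates": [ ...raw crt.sh records, kept for deduplication... ],
    #     "relationships": [ {"source_node": ..., "target_node": ...,
    #                         "relationship_type": ..., "confidence": ...,
    #                         "provider": ..., "raw_data": ...}, ... ],
    #     "attributes":    [ {"target_node": ..., "name": ..., "value": ...,
    #                         "type": ..., "provider": ..., "confidence": ...,
    #                         "metadata": ...}, ... ]
    #   }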
    def _get_cache_status(self, cache_file_path: Path) -> str:
        """
        Check cache status for a domain.

        Returns:
            'not_found', 'fresh', or 'stale'
        """
        if not cache_file_path.exists():
            return "not_found"

        try:
            with open(cache_file_path, 'r') as f:
                cache_data = json.load(f)

            last_query_str = cache_data.get("last_upstream_query")
            if not last_query_str:
                return "stale"

            last_query = datetime.fromisoformat(last_query_str.replace('Z', '+00:00'))
            hours_since_query = (datetime.now(timezone.utc) - last_query).total_seconds() / 3600

            cache_timeout = self.config.cache_timeout_hours
            if hours_since_query < cache_timeout:
                return "fresh"
            else:
                return "stale"
        except (json.JSONDecodeError, ValueError, KeyError) as e:
            self.logger.logger.warning(f"Invalid cache file format for {cache_file_path}: {e}")
            return "stale"

    def query_domain(self, domain: str) -> ProviderResult:
        """
        FIXED: Query crt.sh for certificates containing the domain.
        Now properly creates domain and CA nodes instead of large entities.

        Args:
            domain: Domain to investigate

        Returns:
            ProviderResult containing discovered relationships and attributes
        """
        if not _is_valid_domain(domain):
            return ProviderResult()

        if self._stop_event and self._stop_event.is_set():
            return ProviderResult()

        cache_file = self._get_cache_file_path(domain)
        cache_status = self._get_cache_status(cache_file)

        result = ProviderResult()

        try:
            if cache_status == "fresh":
                result = self._load_from_cache(cache_file)
                self.logger.logger.info(f"Using fresh cached crt.sh data for {domain}")
            else:  # "stale" or "not_found"
                # Query the API for the latest certificates
                new_raw_certs = self._query_crtsh(domain)

                if self._stop_event and self._stop_event.is_set():
                    return ProviderResult()

                # Combine with old data if the cache is stale
                if cache_status == "stale":
                    old_raw_certs = self._load_raw_data_from_cache(cache_file)
                    combined_certs = old_raw_certs + new_raw_certs

                    # Deduplicate the combined list by certificate id
                    seen_ids = set()
                    unique_certs = []
                    for cert in combined_certs:
                        cert_id = cert.get('id')
                        if cert_id not in seen_ids:
                            unique_certs.append(cert)
                            seen_ids.add(cert_id)

                    raw_certificates_to_process = unique_certs
                    self.logger.logger.info(f"Refreshed and merged cache for {domain}. Total unique certs: {len(raw_certificates_to_process)}")
                else:  # "not_found"
                    raw_certificates_to_process = new_raw_certs

                # FIXED: Process certificates to create proper domain and CA nodes
                result = self._process_certificates_to_result_fixed(domain, raw_certificates_to_process)
                self.logger.logger.info(f"Created fresh result for {domain} ({result.get_relationship_count()} relationships)")

                # Save the new result and the raw data to the cache
                self._save_result_to_cache(cache_file, result, raw_certificates_to_process, domain)
        except (requests.exceptions.RequestException, psycopg2.Error) as e:
            self.logger.logger.error(f"Upstream query failed for {domain}: {e}")
            if cache_status != "not_found":
                result = self._load_from_cache(cache_file)
                self.logger.logger.warning(f"Using stale cache for {domain} due to API failure.")
            else:
                raise  # Re-raise if there's no cache to fall back on

        return result
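    # Illustrative usage (hypothetical wiring -- the session_config object and
    # how the scanner instantiates providers depend on the rest of dnsrecon):
    #
    #   provider = CrtShProvider(session_config=session_config)
    #   result = provider.query_domain("example.com")
    #   print(result.get_relationship_count(), "relationships discovered")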
    def query_ip(self, ip: str) -> ProviderResult:
        """
        Query crt.sh for certificates containing the IP address.
        Note: crt.sh doesn't typically index by IP, so this returns empty results.

        Args:
            ip: IP address to investigate

        Returns:
            Empty ProviderResult (crt.sh doesn't support IP-based certificate queries effectively)
        """
        return ProviderResult()

    def _load_from_cache(self, cache_file_path: Path) -> ProviderResult:
        """Load processed crt.sh data from a cache file."""
        try:
            with open(cache_file_path, 'r') as f:
                cache_content = json.load(f)

            result = ProviderResult()

            # Reconstruct relationships
            for rel_data in cache_content.get("relationships", []):
                result.add_relationship(
                    source_node=rel_data["source_node"],
                    target_node=rel_data["target_node"],
                    relationship_type=rel_data["relationship_type"],
                    provider=rel_data["provider"],
                    confidence=rel_data["confidence"],
                    raw_data=rel_data.get("raw_data", {})
                )

            # Reconstruct attributes
            for attr_data in cache_content.get("attributes", []):
                result.add_attribute(
                    target_node=attr_data["target_node"],
                    name=attr_data["name"],
                    value=attr_data["value"],
                    attr_type=attr_data["type"],
                    provider=attr_data["provider"],
                    confidence=attr_data["confidence"],
                    metadata=attr_data.get("metadata", {})
                )

            return result
        except (json.JSONDecodeError, FileNotFoundError, KeyError) as e:
            self.logger.logger.error(f"Failed to load cached certificates from {cache_file_path}: {e}")
            return ProviderResult()

    def _load_raw_data_from_cache(self, cache_file_path: Path) -> List[Dict[str, Any]]:
        """Load only the raw certificate data from a cache file."""
        try:
            with open(cache_file_path, 'r') as f:
                cache_content = json.load(f)
            return cache_content.get("raw_certificates", [])
        except (json.JSONDecodeError, FileNotFoundError):
            return []

    def _save_result_to_cache(self, cache_file_path: Path, result: ProviderResult,
                              raw_certificates: List[Dict[str, Any]], domain: str) -> None:
        """Save processed crt.sh result and raw data to a cache file."""
        try:
            cache_data = {
                "domain": domain,
                "last_upstream_query": datetime.now(timezone.utc).isoformat(),
                "raw_certificates": raw_certificates,  # Store the raw data for deduplication
                "relationships": [
                    {
                        "source_node": rel.source_node,
                        "target_node": rel.target_node,
                        "relationship_type": rel.relationship_type,
                        "confidence": rel.confidence,
                        "provider": rel.provider,
                        "raw_data": rel.raw_data
                    } for rel in result.relationships
                ],
                "attributes": [
                    {
                        "target_node": attr.target_node,
                        "name": attr.name,
                        "value": attr.value,
                        "type": attr.type,
                        "provider": attr.provider,
                        "confidence": attr.confidence,
                        "metadata": attr.metadata
                    } for attr in result.attributes
                ]
            }
            cache_file_path.parent.mkdir(parents=True, exist_ok=True)
            with open(cache_file_path, 'w') as f:
                json.dump(cache_data, f, separators=(',', ':'), default=str)
        except Exception as e:
            self.logger.logger.warning(f"Failed to save cache file for {domain}: {e}")

    def _query_crtsh(self, domain: str) -> List[Dict[str, Any]]:
        """Query crt.sh, trying the database first and falling back to the HTTP API."""
        global db_pool
        if db_pool:
            try:
                self.logger.logger.info(f"crt.sh: Attempting DB query for {domain}")
                return self._query_crtsh_db(domain)
            except psycopg2.Error as e:
                self.logger.logger.warning(f"crt.sh: DB query failed for {domain}: {e}. Falling back to HTTP API.")
                return self._query_crtsh_api(domain)
        else:
            self.logger.logger.info(f"crt.sh: No DB connection pool. Using HTTP API for {domain}")
            return self._query_crtsh_api(domain)
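    # Both backends below return a list of plain dicts. Sketch of one record
    # (key names follow the SQL aliases and crt.sh JSON fields used in this
    # file; the values are hypothetical, and entry_timestamp only appears on
    # the HTTP API path since the DB query does not select it):
    #
    #   {
    #     "id": 1234567890,
    #     "serial_number": "04ab...",
    #     "not_before": "2024-01-15T10:30:00",
    #     "not_after": "2024-04-14T10:30:00",
    #     "issuer_ca_id": 183267,
    #     "issuer_name": "C=US, O=Let's Encrypt, CN=R3",
    #     "common_name": "example.com",
    #     "name_value": "example.com\nwww.example.com"
    #   }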
Using HTTP API for {domain}") return self._query_crtsh_api(domain) def _query_crtsh_db(self, domain: str) -> List[Dict[str, Any]]: """Query crt.sh database for raw certificate data.""" global db_pool conn = db_pool.getconn() try: with conn.cursor() as cursor: query = """ SELECT c.id, x509_serialnumber(c.certificate) as serial_number, x509_notbefore(c.certificate) as not_before, x509_notafter(c.certificate) as not_after, c.issuer_ca_id, ca.name as issuer_name, x509_commonname(c.certificate) as common_name, identities(c.certificate)::text as name_value FROM certificate c LEFT JOIN ca ON c.issuer_ca_id = ca.id WHERE identities(c.certificate) @@ plainto_tsquery(%s) ORDER BY c.id DESC LIMIT 5000; """ cursor.execute(query, (domain,)) results = [] columns = [desc[0] for desc in cursor.description] for row in cursor.fetchall(): row_dict = dict(zip(columns, row)) if row_dict.get('not_before'): row_dict['not_before'] = row_dict['not_before'].isoformat() if row_dict.get('not_after'): row_dict['not_after'] = row_dict['not_after'].isoformat() results.append(row_dict) self.logger.logger.info(f"crt.sh: DB query for {domain} returned {len(results)} records.") return results finally: db_pool.putconn(conn) def _query_crtsh_api(self, domain: str) -> List[Dict[str, Any]]: """Query crt.sh API for raw certificate data.""" url = f"{self.base_url}?q={quote(domain)}&output=json" response = self.make_request(url, target_indicator=domain) if not response or response.status_code != 200: raise requests.exceptions.RequestException(f"crt.sh API returned status {response.status_code if response else 'None'}") try: certificates = response.json() except json.JSONDecodeError: self.logger.logger.error(f"crt.sh returned invalid JSON for {domain}") return [] if not certificates: return [] return certificates def _process_certificates_to_result_fixed(self, query_domain: str, certificates: List[Dict[str, Any]]) -> ProviderResult: """ FIXED: Process certificates to create proper domain and CA nodes. Now creates individual domain nodes instead of large entities. 
""" result = ProviderResult() if self._stop_event and self._stop_event.is_set(): self.logger.logger.info(f"CrtSh processing cancelled before processing for domain: {query_domain}") return result incompleteness_warning = self._check_for_incomplete_data(query_domain, certificates) if incompleteness_warning: result.add_attribute( target_node=query_domain, name="crtsh_data_warning", value=incompleteness_warning, attr_type='metadata', provider=self.name, confidence=1.0 ) all_discovered_domains = set() processed_issuers = set() for i, cert_data in enumerate(certificates): if i % 10 == 0 and self._stop_event and self._stop_event.is_set(): self.logger.logger.info(f"CrtSh processing cancelled at certificate {i} for domain: {query_domain}") break # Extract all domains from this certificate cert_domains = self._extract_domains_from_certificate(cert_data) all_discovered_domains.update(cert_domains) # FIXED: Create CA nodes for certificate issuers (not as domain metadata) issuer_name = self._parse_issuer_organization(cert_data.get('issuer_name', '')) if issuer_name and issuer_name not in processed_issuers: # Create relationship from query domain to CA result.add_relationship( source_node=query_domain, target_node=issuer_name, relationship_type='crtsh_cert_issuer', provider=self.name, confidence=0.95, raw_data={'issuer_dn': cert_data.get('issuer_name', '')} ) processed_issuers.add(issuer_name) # Add certificate metadata to each domain in this certificate cert_metadata = self._extract_certificate_metadata(cert_data) for cert_domain in cert_domains: if not _is_valid_domain(cert_domain): continue # Add certificate attributes to the domain for key, value in cert_metadata.items(): if value is not None: result.add_attribute( target_node=cert_domain, name=f"cert_{key}", value=value, attr_type='certificate_data', provider=self.name, confidence=0.9, metadata={'certificate_id': cert_data.get('id')} ) if self._stop_event and self._stop_event.is_set(): self.logger.logger.info(f"CrtSh query cancelled before relationship creation for domain: {query_domain}") return result # FIXED: Create selective relationships to avoid large entities # Only create relationships to domains that are closely related for discovered_domain in all_discovered_domains: if discovered_domain == query_domain: continue if not _is_valid_domain(discovered_domain): continue # FIXED: Only create relationships for domains that share a meaningful connection # This prevents creating too many relationships that trigger large entity creation if self._should_create_relationship(query_domain, discovered_domain): confidence = self._calculate_domain_relationship_confidence( query_domain, discovered_domain, [], all_discovered_domains ) result.add_relationship( source_node=query_domain, target_node=discovered_domain, relationship_type='crtsh_san_certificate', provider=self.name, confidence=confidence, raw_data={'relationship_type': 'certificate_discovery'} ) self.log_relationship_discovery( source_node=query_domain, target_node=discovered_domain, relationship_type='crtsh_san_certificate', confidence_score=confidence, raw_data={'relationship_type': 'certificate_discovery'}, discovery_method="certificate_transparency_analysis" ) self.logger.logger.info(f"CrtSh processing completed for {query_domain}: {len(all_discovered_domains)} domains, {result.get_relationship_count()} relationships") return result def _should_create_relationship(self, source_domain: str, target_domain: str) -> bool: """ FIXED: Determine if a relationship should be created between two 
    def _should_create_relationship(self, source_domain: str, target_domain: str) -> bool:
        """
        FIXED: Determine if a relationship should be created between two domains.
        This helps avoid creating too many relationships that trigger large entity creation.
        """
        # Always create relationships for subdomains
        if target_domain.endswith(f'.{source_domain}') or source_domain.endswith(f'.{target_domain}'):
            return True

        # Create relationships for domains that share a common parent (up to 2 levels)
        source_parts = source_domain.split('.')
        target_parts = target_domain.split('.')

        # Check if they share the same root domain (last 2 parts)
        if len(source_parts) >= 2 and len(target_parts) >= 2:
            source_root = '.'.join(source_parts[-2:])
            target_root = '.'.join(target_parts[-2:])
            return source_root == target_root

        return False

    def _extract_certificate_metadata(self, cert_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract comprehensive metadata from certificate data."""
        raw_issuer_name = cert_data.get('issuer_name', '')
        parsed_issuer_name = self._parse_issuer_organization(raw_issuer_name)

        metadata = {
            'certificate_id': cert_data.get('id'),
            'serial_number': cert_data.get('serial_number'),
            'issuer_name': parsed_issuer_name,
            'issuer_ca_id': cert_data.get('issuer_ca_id'),
            'common_name': cert_data.get('common_name'),
            'not_before': cert_data.get('not_before'),
            'not_after': cert_data.get('not_after'),
            'entry_timestamp': cert_data.get('entry_timestamp'),
            'source': 'crtsh'
        }

        try:
            if metadata['not_before'] and metadata['not_after']:
                not_before = self._parse_certificate_date(metadata['not_before'])
                not_after = self._parse_certificate_date(metadata['not_after'])

                metadata['validity_period_days'] = (not_after - not_before).days
                metadata['is_currently_valid'] = self._is_cert_valid(cert_data)
                metadata['expires_soon'] = (not_after - datetime.now(timezone.utc)).days <= 30

                # Normalize both dates to ISO 8601
                metadata['not_before'] = not_before.isoformat()
                metadata['not_after'] = not_after.isoformat()
        except Exception as e:
            self.logger.logger.debug(f"Error computing certificate metadata: {e}")
            metadata['is_currently_valid'] = False
            metadata['expires_soon'] = False

        return metadata

    def _parse_issuer_organization(self, issuer_dn: str) -> str:
        """Parse the issuer Distinguished Name to extract just the organization name."""
        if not issuer_dn:
            return issuer_dn

        try:
            # Split the DN on commas and look for the O= (organization) component
            components = [comp.strip() for comp in issuer_dn.split(',')]
            for component in components:
                if component.startswith('O='):
                    org_name = component[2:].strip()
                    # Strip surrounding quotes if present
                    if org_name.startswith('"') and org_name.endswith('"'):
                        org_name = org_name[1:-1]
                    return org_name
            return issuer_dn
        except Exception as e:
            self.logger.logger.debug(f"Failed to parse issuer DN '{issuer_dn}': {e}")
            return issuer_dn

    def _parse_certificate_date(self, date_string: str) -> datetime:
        """Parse a certificate date from the formats crt.sh emits."""
        if not date_string:
            raise ValueError("Empty date string")

        try:
            if isinstance(date_string, datetime):
                return date_string.replace(tzinfo=timezone.utc)
            if date_string.endswith('Z'):
                return datetime.fromisoformat(date_string[:-1]).replace(tzinfo=timezone.utc)
            elif '+' in date_string or date_string.endswith('UTC'):
                date_string = date_string.replace('UTC', '').strip()
                if '+' in date_string:
                    date_string = date_string.split('+')[0]
                return datetime.fromisoformat(date_string).replace(tzinfo=timezone.utc)
            else:
                return datetime.fromisoformat(date_string).replace(tzinfo=timezone.utc)
        except Exception as e:
            try:
                return datetime.strptime(date_string[:19], "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
            except Exception:
                raise ValueError(f"Unable to parse date: {date_string}") from e
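    # Worked examples for the two parsers above (illustrative inputs):
    #   _parse_issuer_organization("C=US, O=Let's Encrypt, CN=R3") -> "Let's Encrypt"
    #   _parse_issuer_organization('O="Example CA, Inc.", C=US') -> '"Example CA'
    #       (a comma inside a quoted O= value defeats the naive comma split)
    #   _parse_certificate_date("2024-01-15T10:30:00Z")
    #       -> datetime(2024, 1, 15, 10, 30, tzinfo=timezone.utc)
    #   _parse_certificate_date("2024-01-15 10:30:00+00:00") strips the offset
    #       and assumes UTC.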
"""Check if a certificate is currently valid based on its expiry date.""" try: not_after_str = cert_data.get('not_after') if not not_after_str: return False not_after_date = self._parse_certificate_date(not_after_str) not_before_str = cert_data.get('not_before') now = datetime.now(timezone.utc) is_not_expired = not_after_date > now if not_before_str: not_before_date = self._parse_certificate_date(not_before_str) is_not_before_valid = not_before_date <= now return is_not_expired and is_not_before_valid return is_not_expired except Exception as e: return False def _extract_domains_from_certificate(self, cert_data: Dict[str, Any]) -> Set[str]: """Extract all domains from certificate data.""" domains = set() # Extract from common name common_name = cert_data.get('common_name', '') if common_name: cleaned_cn = self._clean_domain_name(common_name) if cleaned_cn: domains.update(cleaned_cn) # Extract from name_value field (contains SANs) name_value = cert_data.get('name_value', '') if name_value: for line in name_value.split('\n'): cleaned_domains = self._clean_domain_name(line.strip()) if cleaned_domains: domains.update(cleaned_domains) return domains def _clean_domain_name(self, domain_name: str) -> List[str]: """Clean and normalize domain name from certificate data.""" if not domain_name: return [] domain = domain_name.strip().lower() if domain.startswith(('http://', 'https://')): domain = domain.split('://', 1)[1] if '/' in domain: domain = domain.split('/', 1)[0] if ':' in domain and not domain.count(':') > 1: domain = domain.split(':', 1)[0] cleaned_domains = [] if domain.startswith('*.'): cleaned_domains.append(domain) cleaned_domains.append(domain[2:]) else: cleaned_domains.append(domain) final_domains = [] for d in cleaned_domains: d = re.sub(r'[^\w\-\.]', '', d) if d and not d.startswith(('.', '-')) and not d.endswith(('.', '-')): final_domains.append(d) return [d for d in final_domains if _is_valid_domain(d)] def _calculate_domain_relationship_confidence(self, domain1: str, domain2: str, shared_certificates: List[Dict[str, Any]], all_discovered_domains: Set[str]) -> float: """Calculate confidence score for domain relationship based on various factors.""" base_confidence = 0.9 # Adjust confidence based on domain relationship context relationship_context = self._determine_relationship_context(domain2, domain1) if relationship_context == 'exact_match': context_bonus = 0.0 elif relationship_context == 'subdomain': context_bonus = 0.1 elif relationship_context == 'parent_domain': context_bonus = 0.05 else: context_bonus = 0.0 final_confidence = base_confidence + context_bonus return max(0.1, min(1.0, final_confidence)) def _determine_relationship_context(self, cert_domain: str, query_domain: str) -> str: """Determine the context of the relationship between certificate domain and query domain.""" if cert_domain == query_domain: return 'exact_match' elif cert_domain.endswith(f'.{query_domain}'): return 'subdomain' elif query_domain.endswith(f'.{cert_domain}'): return 'parent_domain' else: return 'related_domain' def _check_for_incomplete_data(self, domain: str, certificates: List[Dict[str, Any]]) -> Optional[str]: """ Analyzes the certificate list to heuristically detect if the data from crt.sh is incomplete. """ cert_count = len(certificates) # Heuristic 1: Check if the number of certs hits a known hard limit. if cert_count >= 10000: return f"Result likely truncated; received {cert_count} certificates, which may be the maximum limit." # Heuristic 2: Check if all returned certificates are old. 
    def _check_for_incomplete_data(self, domain: str, certificates: List[Dict[str, Any]]) -> Optional[str]:
        """
        Analyzes the certificate list to heuristically detect if the data
        from crt.sh is incomplete.
        """
        cert_count = len(certificates)

        # Heuristic 1: Check if the number of certs hits a known hard limit.
        if cert_count >= 10000:
            return f"Result likely truncated; received {cert_count} certificates, which may be the maximum limit."

        # Heuristic 2: Check if all returned certificates are old.
        if cert_count > 1000:  # Only apply this for a reasonable number of certs
            latest_expiry = None
            for cert in certificates:
                try:
                    not_after = self._parse_certificate_date(cert.get('not_after'))
                    if latest_expiry is None or not_after > latest_expiry:
                        latest_expiry = not_after
                except (ValueError, TypeError):
                    continue

            if latest_expiry and (datetime.now(timezone.utc) - latest_expiry).days > 365:
                return f"Incomplete data suspected: The latest certificate expired more than a year ago ({latest_expiry.strftime('%Y-%m-%d')})."

        return None
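    # Worked outcomes for the heuristics above (illustrative):
    #   12,000 records                       -> truncation warning (>= 10,000 cap)
    #   2,500 records, newest not_after 2020 -> staleness warning (expired > 1 year)
    #   400 records                          -> None (below both thresholds)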