# File: src/tld_fetcher.py
"""Fetch and cache the IANA TLD list.

The list is downloaded from IANA, parsed into a set of lower-cased entries and
stored verbatim in a local cache file.  The cache is reused while it is younger
than ``TLDFetcher.CACHE_DURATION`` seconds.
"""

import logging
import os
import time
from typing import Iterable, Iterator, List, Optional, Set

import requests

# Module logger
logger = logging.getLogger(__name__)


class TLDFetcher:
    """Fetches and caches the IANA TLD list.

    Resolution order used by :meth:`get_tlds`:

    1. the in-memory set from a previous call on this instance,
    2. the on-disk cache, if it is younger than ``CACHE_DURATION``,
    3. a fresh download from ``IANA_TLD_URL`` (which also rewrites the cache),
    4. an expired on-disk cache, if the download fails,
    5. the hard-coded fallback set.
    """

    IANA_TLD_URL = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
    CACHE_FILE = "tlds_cache.txt"
    CACHE_DURATION = 86400  # 24 hours in seconds

    def __init__(self):
        # Populated lazily by get_tlds(); None means "not loaded yet".
        self._tlds: Optional[Set[str]] = None
        logger.info("🌐 TLD fetcher initialized")

    def get_tlds(self) -> Set[str]:
        """Return the set of TLDs, loading it on first use.

        The result is memoized on the instance, so whatever source was used
        for the first call (cache, network or fallback) is reused afterwards.

        Returns:
            The set of lower-cased TLD strings.  This is the instance's own
            set object, not a copy.
        """
        if self._tlds is None:
            logger.debug("🔍 Loading TLD list...")
            self._tlds = self._load_tlds()
            logger.info("✅ Loaded %d TLDs", len(self._tlds))
        return self._tlds

    @staticmethod
    def _iter_tld_entries(lines: Iterable[str]) -> Iterator[str]:
        """Yield normalized entries from raw lines of the TLD list.

        Each line is stripped and lower-cased; blank lines and lines starting
        with ``#`` are skipped.  Duplicates are not removed here so callers
        can still count how many entries were processed.
        """
        for raw in lines:
            entry = raw.strip().lower()
            if entry and not entry.startswith("#"):
                yield entry

    def _load_tlds(self) -> Set[str]:
        """Load TLDs from a valid cache, otherwise fetch them from IANA."""
        if self._is_cache_valid():
            logger.debug("📂 Loading TLDs from cache")
            return self._load_from_cache()
        logger.info("🌐 Fetching fresh TLD list from IANA")
        return self._fetch_and_cache()

    def _is_cache_valid(self) -> bool:
        """Return True if the cache file exists and is younger than CACHE_DURATION."""
        # Ask for the mtime directly instead of exists()+getmtime(): the file
        # may vanish between the two calls, which would raise out of here.
        try:
            modified_at = os.path.getmtime(self.CACHE_FILE)
        except OSError:
            logger.debug("❌ TLD cache file does not exist")
            return False

        cache_age = time.time() - modified_at
        is_valid = cache_age < self.CACHE_DURATION
        if is_valid:
            logger.debug("✅ TLD cache is valid (age: %.1f hours)", cache_age / 3600)
        else:
            logger.debug("❌ TLD cache is expired (age: %.1f hours)", cache_age / 3600)
        return is_valid

    def _load_from_cache(self) -> Set[str]:
        """Load TLDs from the cache file.

        Falls back to :meth:`_fetch_and_cache` when the file cannot be read or
        contains no entries.
        """
        try:
            with open(self.CACHE_FILE, "r", encoding="utf-8") as f:
                tlds = set(self._iter_tld_entries(f))
        except (OSError, UnicodeError) as e:
            logger.error("❌ Error loading TLD cache: %s", e)
            # Fall back to fetching fresh data
            return self._fetch_and_cache()

        if not tlds:
            # An empty cache is never a legitimate TLD list (e.g. a write that
            # was cut short); do not hand callers an empty set.
            logger.warning("⚠️ TLD cache contains no entries, fetching fresh data")
            return self._fetch_and_cache()

        logger.info("📂 Loaded %d TLDs from cache", len(tlds))
        return tlds

    def _fetch_and_cache(self) -> Set[str]:
        """Fetch TLDs from IANA and cache them.

        Any failure while downloading or parsing is logged and answered with
        :meth:`_load_stale_cache_or_fallback`; this method does not raise for
        network problems.  A failure to write the cache is only a warning.
        """
        try:
            logger.info("📡 Fetching TLD list from: %s", self.IANA_TLD_URL)
            response = requests.get(
                self.IANA_TLD_URL,
                timeout=30,
                headers={"User-Agent": "DNS-Recon-Tool/1.0"},
            )
            response.raise_for_status()
            body = response.text
            entries = list(self._iter_tld_entries(body.split("\n")))
        except requests.exceptions.Timeout:
            logger.error("⏱️ Timeout fetching TLD list from IANA")
            return self._load_stale_cache_or_fallback()
        except requests.exceptions.RequestException as e:
            logger.error("🌐 Network error fetching TLD list: %s", e)
            return self._load_stale_cache_or_fallback()
        except Exception as e:
            # Deliberate best-effort boundary: TLD loading must not crash the
            # caller, whatever goes wrong while fetching.
            logger.error("❌ Unexpected error fetching TLD list: %s", e)
            return self._load_stale_cache_or_fallback()

        tlds = set(entries)
        if not tlds:
            # Do not overwrite a usable cache with an empty download.
            logger.error("❌ IANA response contained no TLD entries")
            return self._load_stale_cache_or_fallback()

        logger.info(
            "✅ Fetched %d TLDs from IANA (processed %d lines)",
            len(tlds),
            len(entries),
        )
        self._write_cache(body)
        return tlds

    def _write_cache(self, text: str) -> None:
        """Write *text* to the cache file, replacing it atomically.

        The data goes to a temporary sibling file first and is then moved over
        ``CACHE_FILE`` with ``os.replace``, so an interrupted write cannot
        leave a truncated cache that still looks fresh.  Errors are logged as
        warnings and never raised.
        """
        # Same directory as the target so the final rename stays on one
        # filesystem; the pid keeps concurrent processes from sharing a name.
        tmp_path = f"{self.CACHE_FILE}.{os.getpid()}.tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                f.write(text)
            os.replace(tmp_path, self.CACHE_FILE)
        except (OSError, UnicodeError) as cache_error:
            logger.warning("⚠️ Could not cache TLD list: %s", cache_error)
            try:
                os.remove(tmp_path)
            except OSError:
                # Nothing to clean up (or not removable); the warning above
                # already reports the real problem.
                pass
        else:
            logger.info("💾 TLD list cached to %s", self.CACHE_FILE)

    def _load_stale_cache_or_fallback(self) -> Set[str]:
        """Return the best TLD set available without network access.

        An existing cache file is used regardless of its age, because an
        expired copy of the full list is more complete than the hard-coded
        fallback.  If the cache is missing, unreadable or empty, the fallback
        set is returned instead.
        """
        try:
            with open(self.CACHE_FILE, "r", encoding="utf-8") as f:
                tlds = set(self._iter_tld_entries(f))
        except (OSError, UnicodeError):
            tlds = set()

        if tlds:
            logger.warning("⚠️ Using stale TLD cache (%d TLDs)", len(tlds))
            return tlds
        return self._get_fallback_tlds()

    def _get_fallback_tlds(self) -> Set[str]:
        """Return a minimal set of common TLDs if fetch fails."""
        logger.warning("⚠️ Using fallback TLD list")

        fallback_tlds = {
            # Generic top-level domains
            'com', 'org', 'net', 'edu', 'gov', 'mil', 'int', 'info', 'biz', 'name',

            # Country code top-level domains (major ones)
            'us', 'uk', 'de', 'fr', 'it', 'es', 'nl', 'be', 'ch', 'at',
            'se', 'no', 'dk', 'fi', 'pl', 'cz', 'hu', 'ro', 'bg', 'hr',
            'si', 'sk', 'lt', 'lv', 'ee', 'ie', 'pt', 'gr', 'cy', 'mt',
            'lu', 'is', 'li', 'ad', 'mc', 'sm', 'va', 'by', 'ua', 'md',
            'ru', 'kz', 'kg', 'tj', 'tm', 'uz', 'am', 'az', 'ge', 'tr',
            'il', 'jo', 'lb', 'sy', 'iq', 'ir', 'af', 'pk', 'in', 'lk',
            'mv', 'bt', 'bd', 'np', 'mm', 'th', 'la', 'kh', 'vn', 'my',
            'sg', 'bn', 'id', 'tl', 'ph', 'tw', 'hk', 'mo', 'cn', 'kp',
            'kr', 'jp', 'mn',

            # Common compound TLDs
            # NOTE(review): these are multi-label suffixes, which the IANA
            # list itself does not contain -- confirm callers expect them.
            'co.uk', 'org.uk', 'ac.uk', 'gov.uk',
            'com.au', 'org.au', 'net.au', 'gov.au', 'edu.au',
            'co.za', 'org.za', 'net.za', 'gov.za', 'ac.za',
            'co.nz', 'org.nz', 'net.nz', 'govt.nz', 'ac.nz',
            'co.jp', 'or.jp', 'ne.jp', 'go.jp', 'ac.jp', 'ad.jp',
            'ed.jp', 'gr.jp', 'lg.jp',
        }

        logger.info("📋 Using %d fallback TLDs", len(fallback_tlds))
        return fallback_tlds