# File: src/tld_fetcher.py
"""Fetch and cache the IANA TLD list."""

import logging
import os
import time
from typing import Iterable, List, Optional, Set

import requests

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class TLDFetcher:
    """Fetch the IANA top-level-domain list and cache it on disk.

    The list is resolved in this order:

    1. the on-disk cache, while it is younger than ``CACHE_DURATION``;
    2. a fresh download from ``IANA_TLD_URL`` (which refreshes the cache);
    3. a stale cache file, if the download fails;
    4. a small hard-coded fallback set.

    The result is memoised on the instance, so loading happens once per
    ``TLDFetcher`` object. No method of this class is expected to raise on
    network or file-system failure; each failure degrades to the next source.
    """

    IANA_TLD_URL = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
    # Relative path: resolved against the current working directory.
    CACHE_FILE = "tlds_cache.txt"
    CACHE_DURATION = 86400  # 24 hours in seconds

    def __init__(self):
        """Create a fetcher with nothing loaded yet."""
        # Populated lazily by get_tlds(); None means "not loaded yet".
        self._tlds: Optional[Set[str]] = None
        logger.info("🌐 TLD fetcher initialized")

    def get_tlds(self) -> Set[str]:
        """Return the set of known TLDs, loading it on first use.

        Returns:
            Lower-case TLD strings. The same set object is returned on every
            call, so callers should treat it as read-only.
        """
        if self._tlds is None:
            logger.debug("🔍 Loading TLD list...")
            self._tlds = self._load_tlds()
            logger.info("✅ Loaded %d TLDs", len(self._tlds))
        return self._tlds

    def _load_tlds(self) -> Set[str]:
        """Load TLDs from a fresh cache, otherwise fetch them from IANA."""
        if self._is_cache_valid():
            logger.debug("📂 Loading TLDs from cache")
            return self._load_from_cache()
        logger.info("🌐 Fetching fresh TLD list from IANA")
        return self._fetch_and_cache()

    def _is_cache_valid(self) -> bool:
        """Return True if the cache file exists and is younger than CACHE_DURATION."""
        # Ask for the mtime directly instead of exists() + getmtime(): the
        # file may vanish between the two calls.
        try:
            modified_at = os.path.getmtime(self.CACHE_FILE)
        except FileNotFoundError:
            logger.debug("❌ TLD cache file does not exist")
            return False
        except OSError as stat_error:
            logger.debug("❌ Cannot stat TLD cache file: %s", stat_error)
            return False

        cache_age = time.time() - modified_at
        is_valid = cache_age < self.CACHE_DURATION

        if is_valid:
            logger.debug("✅ TLD cache is valid (age: %.1f hours)", cache_age / 3600)
        else:
            logger.debug("❌ TLD cache is expired (age: %.1f hours)", cache_age / 3600)

        return is_valid

    def _load_from_cache(self) -> Set[str]:
        """Load TLDs from the cache file, refetching if it is unusable.

        An unreadable cache and a cache that contains no TLDs are both
        treated as a miss; otherwise an empty file with a recent mtime would
        make ``get_tlds`` return an empty set until the cache expires.
        """
        tlds = self._read_cache()
        if tlds:
            logger.info("📂 Loaded %d TLDs from cache", len(tlds))
            return tlds
        if tlds is not None:
            logger.warning("⚠️ TLD cache contains no TLDs; fetching a fresh list")
        # Fall back to fetching fresh data
        return self._fetch_and_cache()

    def _fetch_and_cache(self) -> Set[str]:
        """Download the TLD list from IANA and refresh the cache file.

        Returns:
            The downloaded TLD set. If the request fails or the response
            contains no TLDs, the result of ``_get_offline_tlds`` instead.
        """
        try:
            logger.info("📡 Fetching TLD list from: %s", self.IANA_TLD_URL)
            response = requests.get(
                self.IANA_TLD_URL,
                timeout=30,
                headers={'User-Agent': 'DNS-Recon-Tool/1.0'},
            )
            response.raise_for_status()
            body = response.text
        except requests.exceptions.Timeout:
            logger.error("⏱️ Timeout fetching TLD list from IANA")
            return self._get_offline_tlds()
        except requests.exceptions.RequestException as e:
            logger.error("🌐 Network error fetching TLD list: %s", e)
            return self._get_offline_tlds()
        except Exception as e:
            # Deliberate catch-all: this lookup is best-effort and must not
            # crash the caller, whatever the HTTP stack raises.
            logger.error("❌ Unexpected error fetching TLD list: %s", e)
            return self._get_offline_tlds()

        lines = body.splitlines()
        tlds = self._parse_tlds(lines)
        if not tlds:
            # Do not overwrite the cache with a response that has no TLDs.
            logger.error("❌ IANA response contained no TLDs; not caching it")
            return self._get_offline_tlds()

        logger.info(
            "✅ Fetched %d TLDs from IANA (processed %d lines)",
            len(tlds),
            len(lines),
        )
        self._write_cache(body)
        return tlds

    def _get_offline_tlds(self) -> Set[str]:
        """Return the best TLD set available without the network.

        Prefers a stale cache file over the hard-coded fallback, because a
        stale copy of the full list is still far more complete.
        """
        if os.path.exists(self.CACHE_FILE):
            stale = self._read_cache()
            if stale:
                logger.warning("⚠️ Using stale TLD cache (%d TLDs)", len(stale))
                return stale
        return self._get_fallback_tlds()

    def _read_cache(self) -> Optional[Set[str]]:
        """Parse the cache file; return None if it cannot be read or decoded."""
        try:
            with open(self.CACHE_FILE, 'r', encoding='utf-8') as f:
                return self._parse_tlds(f)
        except (OSError, UnicodeError) as e:
            logger.error("❌ Error loading TLD cache: %s", e)
            return None

    def _write_cache(self, text: str) -> None:
        """Write *text* to the cache file atomically; failures are only logged."""
        # Write to a sibling temp file and rename it into place, so a crash
        # mid-write cannot leave a truncated file with a fresh mtime.
        tmp_path = f"{self.CACHE_FILE}.tmp"
        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                f.write(text)
            os.replace(tmp_path, self.CACHE_FILE)
        except (OSError, UnicodeError) as cache_error:
            logger.warning("⚠️ Could not cache TLD list: %s", cache_error)
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # Temp file was never created or is already gone.
        else:
            logger.info("💾 TLD list cached to %s", self.CACHE_FILE)

    @staticmethod
    def _parse_tlds(lines: Iterable[str]) -> Set[str]:
        """Extract TLDs from lines in the IANA list format.

        Each line is stripped and lower-cased; blank lines and lines starting
        with ``#`` are skipped.
        """
        cleaned = (line.strip().lower() for line in lines)
        return {entry for entry in cleaned if entry and not entry.startswith('#')}

    def _get_fallback_tlds(self) -> Set[str]:
        """Return a minimal hard-coded set of common TLDs.

        Used as the last resort when neither IANA nor a cache file is usable.
        Unlike the IANA list, it also contains some multi-label suffixes
        such as ``co.uk``.
        """
        logger.warning("⚠️ Using fallback TLD list")

        fallback_tlds = {
            # Generic top-level domains
            'com', 'org', 'net', 'edu', 'gov', 'mil', 'int', 'info', 'biz', 'name',

            # Country code top-level domains (major ones)
            'us', 'uk', 'de', 'fr', 'it', 'es', 'nl', 'be', 'ch', 'at', 'se', 'no',
            'dk', 'fi', 'pl', 'cz', 'hu', 'ro', 'bg', 'hr', 'si', 'sk', 'lt', 'lv',
            'ee', 'ie', 'pt', 'gr', 'cy', 'mt', 'lu', 'is', 'li', 'ad', 'mc', 'sm',
            'va', 'by', 'ua', 'md', 'ru', 'kz', 'kg', 'tj', 'tm', 'uz', 'am', 'az',
            'ge', 'tr', 'il', 'jo', 'lb', 'sy', 'iq', 'ir', 'af', 'pk', 'in', 'lk',
            'mv', 'bt', 'bd', 'np', 'mm', 'th', 'la', 'kh', 'vn', 'my', 'sg', 'bn',
            'id', 'tl', 'ph', 'tw', 'hk', 'mo', 'cn', 'kp', 'kr', 'jp', 'mn',

            # Common compound TLDs
            'co.uk', 'org.uk', 'ac.uk', 'gov.uk', 'com.au', 'org.au', 'net.au',
            'gov.au', 'edu.au', 'co.za', 'org.za', 'net.za', 'gov.za', 'ac.za',
            'co.nz', 'org.nz', 'net.nz', 'govt.nz', 'ac.nz', 'co.jp', 'or.jp',
            'ne.jp', 'go.jp', 'ac.jp', 'ad.jp', 'ed.jp', 'gr.jp', 'lg.jp',
        }

        logger.info("📋 Using %d fallback TLDs", len(fallback_tlds))
        return fallback_tlds