import logging
import re
from typing import Iterable, List, Optional, Tuple

import phonenumbers

# Regions tried, in this order, when validating a candidate number.
# The first region that yields a valid number wins, so order matters.
_DEFAULT_REGIONS = (
    'US', 'GB', 'DE', 'FR', 'ES', 'IT', 'RU', 'CN', 'IN', 'JP',
    'BR', 'ZA', 'NG', 'EG', 'TR', 'ID', 'AU', 'CA', 'MX', 'AR',
    'KR', 'TH', 'VN', 'PH', 'MY', 'SA', 'IR', 'PK', 'BD', 'UA',
    'PL', 'NL', 'BE', 'CH', 'AT', 'SE', 'NO', 'DK', 'FI', 'IL',
    'SG', 'HK', 'NZ', 'AE', 'KE', 'CO', 'VE', 'PE', 'CL', 'GR',
    'PT', 'CZ', 'RO', 'HU', 'BG', 'SK', 'SI', 'HR', 'RS', 'LT',
    'LV', 'EE', 'CY', 'LU', 'MT', 'IS', 'KZ', 'UZ', 'AM', 'AZ',
    'GE', 'MN', 'KG', 'TJ', 'TM', 'BT', 'NP', 'LK', 'MM', 'KH',
    'LA', 'BN', 'FJ', 'PW', 'SB', 'VU', 'FM', 'WS', 'TO', 'TV',
    'KI', 'NR', 'MQ', 'GF', 'RE', 'YT', 'PF', 'NC', 'WF', 'TF',
    'AI', 'AG', 'AW', 'BS', 'BB', 'BZ', 'BM', 'VG', 'KY', 'CU',
    'CW', 'DM', 'DO', 'GD', 'GP', 'HT', 'JM', 'MS', 'PR', 'KN',
    'LC', 'VC', 'SX', 'TT', 'TC', 'VI', 'BO', 'BQ', 'EC', 'GY',
    'PY', 'SR', 'UY', 'DZ', 'AO', 'BJ', 'BW', 'BF', 'BI', 'CV',
    'CM', 'CF', 'TD', 'KM', 'CG', 'CD', 'DJ', 'GQ', 'ER', 'SZ',
    'ET', 'GA', 'GM', 'GH', 'GN', 'GW', 'CI', 'LS', 'LR', 'LY',
    'MG', 'MW', 'ML', 'MR', 'MU', 'MA', 'MZ', 'NA', 'NE', 'RW',
    'ST', 'SN', 'SC', 'SL', 'SO', 'SS', 'SD', 'TZ', 'TG', 'TN',
    'UG', 'ZM', 'ZW',
)

# Number of characters inspected on each side of a candidate match when
# deciding whether the surrounding text rules out a phone number.
_CONTEXT_CHARS = 50

# Timestamp shapes; a digit run near one of these is treated as not a phone.
_TIMESTAMP_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',            # 2024-01-31 12:34:56
    r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}',            # 2024/01/31 12:34:56
    r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}',            # 31/01/2024 12:34:56
    r'\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}',            # 01-31-2024 12:34:56
    r'\d{8}_\d{6}',                                    # 20240131_123456
    r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}',            # 2024-01-31T12:34:56
    r'\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2}',          # 31.01.2024 12:34:56
    r'\d{4}\d{2}\d{2} \d{2}:\d{2}:\d{2}',              # 20240131 12:34:56
    r'\d{1,2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2}:\d{2}',    # 31-Jan-2024 12:34:56
    r'(?:19|20)\d{10}',                                # 202401311234
    # Add more patterns as needed
))

# Other textual contexts in which a digit run is unlikely to be a phone number.
_UNLIKELY_PHONE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'\d{5,}\s?bytes',            # File size in bytes
    r'https?://\S+',              # URLs
    r'\bversion \d+',             # 'version' followed by numbers
    # CVE identifiers; case-insensitive because the canonical form is
    # upper-case ("CVE-2021-44228") and the text is not lower-cased first.
    r'(?i:cve-\d{4}-\d+)',
    r'\S+\.onion\S*',             # Onion addresses
    r'Product ID: \S+',           # Product IDs
    r'\|\s*[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}'
    r'-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\s*\|',           # UUIDs between pipes
    r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',         # IP addresses
    r'Mem: \d+\s+\d+\s+\d+',      # Memory sizes
    r'Total: \d+\s+\d+\s+\d+',    # Total memory
    r'block_size=\d+',            # Block size
    r'-rw-------\s+\d+\s+\S+\s+\S+\s+\d+\s+\S+\s+\d{1,2}\s+\d{1,2}:\d{2}',  # File details
    r'\d+\.\d+\.\d+\.\d+\s+\d+\s+\S+\s+\d+',           # IP and port patterns
    # Add more patterns as needed
))

# Candidate phone number: optional country code, optional (area) code, then
# two digit groups.  The leading guard is a negative lookbehind rather than
# ``\b``: ``\b`` can never hold between whitespace and ``+`` or ``(``, which
# made it impossible to capture a leading "+" or "(" in ordinary text.
_PHONE_CANDIDATE_RE = re.compile(
    r'(?<!\w)(?:\+?\d{1,3}[\s-]?)?(?:\(?\d{1,4}\)?[\s-]?)?\d{3,5}[\s-]?\d{3,5}\b'
)


def _is_unlikely_phone_context(extended_text: str) -> bool:
    """Return True if *extended_text* matches any timestamp or non-phone pattern."""
    return any(
        pattern.search(extended_text)
        for pattern in _TIMESTAMP_PATTERNS + _UNLIKELY_PHONE_PATTERNS
    )


def _is_valid_in_any_region(raw_number: str, regions: Iterable[str]) -> bool:
    """Return True if *raw_number* parses to a valid number for any region."""
    for region in regions:
        try:
            parsed_number = phonenumbers.parse(raw_number, region)
        except phonenumbers.NumberParseException:
            # Not parseable with this default region; try the next one.
            continue
        if phonenumbers.is_valid_number(parsed_number):
            return True
    return False


def parse(
    text: str,
    default_regions: Optional[Iterable[str]] = None,
) -> List[Tuple[str, int, int]]:
    """Find substrings of *text* that validate as phone numbers.

    Each regex candidate is skipped when the text within ``_CONTEXT_CHARS``
    characters on either side of it looks like a timestamp, URL, IP address,
    CVE identifier or similar.  Note that this check covers the whole window,
    so a genuine number sitting close to such text is skipped as well.
    Remaining candidates are parsed with ``phonenumbers`` against each region
    in turn and kept as soon as one region yields a valid number.

    Args:
        text: Text to scan.  ``None`` or an empty string yields no matches.
        default_regions: Region codes to try, in order.  ``None`` (the
            default) uses the built-in ``_DEFAULT_REGIONS`` list.

    Returns:
        A list of ``(raw_number, start_pos, end_pos)`` tuples, where the
        positions are the match span within *text*, in order of appearance.
    """
    if not text:
        return []

    # Materialize once so a one-shot iterable can be reused per candidate.
    regions = (
        _DEFAULT_REGIONS if default_regions is None else tuple(default_regions)
    )

    matches = []
    for number_match in _PHONE_CANDIDATE_RE.finditer(text):
        raw_number = number_match.group()
        start_pos, end_pos = number_match.span()

        # Slicing clamps the upper bound, so only the lower one needs max().
        extended_text = text[
            max(0, start_pos - _CONTEXT_CHARS):end_pos + _CONTEXT_CHARS
        ]
        if _is_unlikely_phone_context(extended_text):
            continue  # Context indicates this is not a phone number.

        if _is_valid_in_any_region(raw_number, regions):
            matches.append((raw_number, start_pos, end_pos))
        else:
            logging.debug("Failed to parse number: %s", raw_number)

    return matches