2025-09-03 13:20:23 +02:00

22 lines
544 B
Python

import tldextract
import re
# Candidate URLs: an http/https/ftp scheme followed by a run of non-whitespace.
_URL_PATTERN = re.compile(r'\b(?:https?|ftp):\/\/[^\s]+')

# Characters that usually belong to the surrounding sentence, not to the URL.
_TRAILING_PUNCTUATION = ".,;:!?'\""

# Closing brackets are dropped only when the URL holds no matching opener.
_CLOSING_TO_OPENING = {")": "(", "]": "[", "}": "{", ">": "<"}


def _trim_trailing_punctuation(url):
    """Return *url* without sentence punctuation stuck to its end."""
    while url:
        last = url[-1]
        if last in _TRAILING_PUNCTUATION:
            url = url[:-1]
        elif (
            last in _CLOSING_TO_OPENING
            and url.count(last) > url.count(_CLOSING_TO_OPENING[last])
        ):
            # Unbalanced closer, e.g. the ")" in "(see http://example.com)".
            url = url[:-1]
        else:
            break
    return url


def parse(text: str) -> list:
    """Find http/https/ftp URLs in *text* whose host has a known suffix.

    Each regex candidate is first stripped of trailing sentence punctuation
    (the pattern matches any non-whitespace run, so a following "." or ","
    would otherwise become part of the URL and of the host passed to
    ``tldextract``). A candidate is kept only when ``tldextract.extract``
    reports both a non-empty ``domain`` and a non-empty ``suffix``.

    Args:
        text: The string to scan.

    Returns:
        A list of ``(url, start, end)`` tuples in order of appearance, where
        ``text[start:end] == url``. The list is empty when nothing matches.
    """
    matches = []
    for url_match in _URL_PATTERN.finditer(text):
        full_url = _trim_trailing_punctuation(url_match.group())
        # Use tldextract to validate the domain and suffix
        extracted = tldextract.extract(full_url)
        if extracted.domain and extracted.suffix:
            start_pos = url_match.start()
            # The end offset follows the trimmed URL, not the raw regex match.
            end_pos = start_pos + len(full_url)
            matches.append((full_url, start_pos, end_pos))
    return matches