22 lines
544 B
Python
22 lines
544 B
Python
import tldextract
|
|
import re
|
|
|
|
# Candidate URLs: an http, https or ftp scheme, "://", then a run of
# non-whitespace characters. Compiled once so repeated calls reuse it.
_URL_PATTERN = re.compile(r'\b(?:https?|ftp)://\S+')

# Punctuation that usually belongs to the surrounding sentence rather than to
# the URL when it appears at the very end of a match ("see http://a.com/x.").
_TRAILING_PUNCTUATION = ".,;:!?'\""


def parse(text: str) -> list:
    """Return the URLs found in *text* together with their positions.

    A candidate is any ``http://``, ``https://`` or ``ftp://`` prefix followed
    by non-whitespace characters. Trailing sentence punctuation
    (``. , ; : ! ? ' "``) is trimmed from the candidate, which is then kept
    only if ``tldextract`` reports both a non-empty domain and a non-empty
    suffix for it.

    Args:
        text: The string to scan.

    Returns:
        A list of ``(url, start, end)`` tuples in order of appearance, where
        ``text[start:end] == url``. The list is empty when nothing qualifies.
    """
    matches = []

    for url_match in _URL_PATTERN.finditer(text):
        start_pos = url_match.start()

        # Drop punctuation that the greedy non-whitespace run swallowed from
        # the surrounding sentence; the reported span shrinks accordingly.
        full_url = url_match.group().rstrip(_TRAILING_PUNCTUATION)

        # Use tldextract to validate the domain and suffix.
        extracted = tldextract.extract(full_url)
        if extracted.domain and extracted.suffix:
            end_pos = start_pos + len(full_url)
            matches.append((full_url, start_pos, end_pos))

    return matches