initial commit
This commit is contained in:
226
data/entities.yaml
Normal file
226
data/entities.yaml
Normal file
@@ -0,0 +1,226 @@
|
||||
bip39:
|
||||
entity_type: bip39
|
||||
gui_name: BIP39 Wordlist
|
||||
gui_tooltip: Outputs BIP39 wordlists, which are parsed from the text by the required
|
||||
length, with 0-5 characters in between the words.
|
||||
parent_type: category_cryptocurrency
|
||||
parser_enabled: true
|
||||
regex_pattern: null
|
||||
script_parser: bip39.py
|
||||
btcaddr:
|
||||
entity_type: btcaddr
|
||||
gui_name: Bitcoin Address
|
||||
gui_tooltip: Outputs BTC addresses of the common formats P2PKH, P2SH and Bech32.
|
||||
parent_type: category_bitcoin
|
||||
parser_enabled: true
|
||||
regex_pattern: \b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b
|
||||
script_parser: btcaddr.py
|
||||
btctxid:
|
||||
entity_type: btctxid
|
||||
gui_name: Bitcoin TXID
|
||||
gui_tooltip: Outputs BTC TXIDs.
|
||||
parent_type: category_bitcoin
|
||||
parser_enabled: true
|
||||
regex_pattern: \b[a-fA-F0-9]{64}\b
|
||||
script_parser: null
|
||||
category_bitcoin:
|
||||
entity_type: category_bitcoin
|
||||
gui_name: Bitcoin
|
||||
gui_tooltip: Bitcoin related entities.
|
||||
parent_type: category_cryptocurrency
|
||||
parser_enabled: true
|
||||
regex_pattern: null
|
||||
script_parser: null
|
||||
category_communication:
|
||||
entity_type: category_communication
|
||||
gui_name: Communication
|
||||
gui_tooltip: Communication related entities.
|
||||
parent_type: root
|
||||
parser_enabled: true
|
||||
regex_pattern: null
|
||||
script_parser: null
|
||||
category_cryptocurrency:
|
||||
entity_type: category_cryptocurrency
|
||||
gui_name: Cryptocurrency
|
||||
gui_tooltip: Cryptocurrency related entities.
|
||||
parent_type: root
|
||||
parser_enabled: true
|
||||
regex_pattern: null
|
||||
script_parser: null
|
||||
category_cybersecurity:
|
||||
entity_type: category_cybersecurity
|
||||
gui_name: Cybersecurity
|
||||
gui_tooltip: Cybersecurity related entities.
|
||||
parent_type: root
|
||||
parser_enabled: true
|
||||
regex_pattern: null
|
||||
script_parser: null
|
||||
category_internet:
|
||||
entity_type: category_internet
|
||||
gui_name: Internet
|
||||
gui_tooltip: Internet related entities.
|
||||
parent_type: root
|
||||
parser_enabled: true
|
||||
regex_pattern: null
|
||||
script_parser: null
|
||||
category_monero:
|
||||
entity_type: category_monero
|
||||
gui_name: Monero
|
||||
gui_tooltip: Monero related entities.
|
||||
parent_type: category_cryptocurrency
|
||||
parser_enabled: true
|
||||
regex_pattern: null
|
||||
script_parser: null
|
||||
category_networking:
|
||||
entity_type: category_networking
|
||||
gui_name: Networking
|
||||
gui_tooltip: Networking related entities.
|
||||
parent_type: root
|
||||
parser_enabled: true
|
||||
regex_pattern: null
|
||||
script_parser: null
|
||||
category_special:
|
||||
entity_type: category_special
|
||||
gui_name: Special Parsers
|
||||
gui_tooltip: Special parsers, e.g. created wordlists.
|
||||
parent_type: root
|
||||
parser_enabled: true
|
||||
regex_pattern: null
|
||||
script_parser: null
|
||||
gdocurl:
|
||||
entity_type: gdocurl
|
||||
gui_name: Google Docs URL
|
||||
gui_tooltip: Outputs any possible Google Docs URLs.
|
||||
parent_type: url
|
||||
parser_enabled: true
|
||||
regex_pattern: \bhttps:\/\/docs\.google\.com\/[\w\/.-]*\/d\/[a-zA-Z0-9_-]+(?:\/\S*)?
|
||||
script_parser: null
|
||||
generated_wordlist_match:
|
||||
entity_type: generated_wordlist_match
|
||||
gui_name: Generated Wordlist Match
|
||||
gui_tooltip: Outputs any wordlist matches which are specified by the generated wordlist
|
||||
present in the parser directory.
|
||||
parent_type: category_special
|
||||
parser_enabled: true
|
||||
regex_pattern: null
|
||||
script_parser: generated_wordlist.py
|
||||
github:
|
||||
entity_type: github
|
||||
gui_name: GitHub
|
||||
gui_tooltip: Outputs any possible GitHub repositories.
|
||||
parent_type: url
|
||||
parser_enabled: true
|
||||
regex_pattern: \bhttps?:\/\/github\.com\/[A-Za-z0-9_.-]+\/[A-Za-z0-9_.-]+\/?\S*
|
||||
script_parser: null
|
||||
ipv4:
|
||||
entity_type: ipv4
|
||||
gui_name: IPv4 Address
|
||||
gui_tooltip: Outputs any IPv4 addresses.
|
||||
parent_type: category_networking
|
||||
parser_enabled: true
|
||||
regex_pattern: \b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b
|
||||
script_parser: ipv4.py
|
||||
ipv4pr:
|
||||
entity_type: ipv4pr
|
||||
gui_name: Private Address Range
|
||||
gui_tooltip: Outputs any IPv4 addresses of the private address range.
|
||||
parent_type: ipv4
|
||||
parser_enabled: true
|
||||
regex_pattern: \b(10\.\d{1,3}\.\d{1,3}\.\d{1,3}|172\.(1[6-9]|2[0-9]|3[0-1])\.\d{1,3}\.\d{1,3}|192\.168\.\d{1,3}\.\d{1,3})\b
|
||||
script_parser: ipv4pr.py
|
||||
ipv4pu:
|
||||
entity_type: ipv4pu
|
||||
gui_name: Public Address Range
|
||||
gui_tooltip: Outputs any IPv4 addresses of the public address range.
|
||||
parent_type: ipv4
|
||||
parser_enabled: true
|
||||
regex_pattern: \b((?!10\.)(?!172\.(1[6-9]|2[0-9]|3[0-1]))(?!192\.168)(?:[0-9]{1,3}\.){3}[0-9]{1,3})\b
|
||||
script_parser: ipv4pu.py
|
||||
ipv6:
|
||||
entity_type: ipv6
|
||||
gui_name: IPv6 Address
|
||||
gui_tooltip: Outputs any IPv6 addresses.
|
||||
parent_type: category_networking
|
||||
parser_enabled: true
|
||||
regex_pattern: (([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))
|
||||
script_parser: ipv6.py
|
||||
macaddr:
|
||||
entity_type: macaddr
|
||||
gui_name: MAC Address
|
||||
gui_tooltip: Outputs any possible MAC addresses.
|
||||
parent_type: category_networking
|
||||
parser_enabled: true
|
||||
regex_pattern: \b(?:[0-9a-fA-F]{2}:){5}[0-9a-fA-F]{2}\b
|
||||
script_parser: null
|
||||
mailaddr:
|
||||
entity_type: mailaddr
|
||||
gui_name: EMail Address
|
||||
gui_tooltip: Outputs any possible email-addresses.
|
||||
parent_type: category_communication
|
||||
parser_enabled: true
|
||||
regex_pattern: \b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b
|
||||
script_parser: null
|
||||
onionurl:
|
||||
entity_type: onionurl
|
||||
gui_name: Onion URL
|
||||
gui_tooltip: Outputs any possible onion URL.
|
||||
parent_type: category_internet
|
||||
parser_enabled: true
|
||||
regex_pattern: \bhttps?:\/\/[a-z2-7]{16,56}\.onion(?:\/\S*)?
|
||||
script_parser: null
|
||||
telnum:
|
||||
entity_type: telnum
|
||||
gui_name: Possible Telephone Number
|
||||
gui_tooltip: Outputs any possible telephone numbers, this may have some false-positives.
|
||||
parent_type: category_communication
|
||||
parser_enabled: true
|
||||
regex_pattern: \b(?:\+\d{1,4}\s?)?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b
|
||||
script_parser: telnum.py
|
||||
toxid:
|
||||
entity_type: toxid
|
||||
gui_name: Tox ID
|
||||
gui_tooltip: Outputs any possible tox ID, including QTOX. Unverified Regex Pattern.
|
||||
parent_type: category_communication
|
||||
parser_enabled: true
|
||||
regex_pattern: (?<![0-9a-fA-F])[0-9a-fA-F]{76}(?![0-9a-fA-F])
|
||||
script_parser: null
|
||||
url:
|
||||
entity_type: url
|
||||
gui_name: URL
|
||||
gui_tooltip: Outputs any possible URL.
|
||||
parent_type: category_internet
|
||||
parser_enabled: true
|
||||
regex_pattern: \b(?:https?|s?ftp):\/\/[\w\/.-]+(?:\.[a-z]{2,})+\S*
|
||||
script_parser: url.py
|
||||
vulnerability_CVE:
|
||||
entity_type: vulnerability_CVE
|
||||
gui_name: CVE String
|
||||
gui_tooltip: Outputs any possible CVE Vulnerability Identifier.
|
||||
parent_type: category_cybersecurity
|
||||
parser_enabled: true
|
||||
regex_pattern: cve-\d{4}-\d+
|
||||
script_parser: null
|
||||
xmraddr:
|
||||
entity_type: xmraddr
|
||||
gui_name: Monero Address
|
||||
gui_tooltip: Outputs Monero addresses.
|
||||
parent_type: category_monero
|
||||
parser_enabled: true
|
||||
regex_pattern: \b4[0-9AB][1-9A-HJ-NP-Za-km-z]{93}\b
|
||||
script_parser: xmraddr.py
|
||||
category_metadata:
|
||||
entity_type: category_metadata
|
||||
gui_name: Metadata
|
||||
gui_tooltip: Metadata related entities.
|
||||
parent_type: root
|
||||
parser_enabled: false
|
||||
regex_pattern: null
|
||||
script_parser: null
|
||||
timestamp:
|
||||
entity_type: timestamp
|
||||
gui_name: Timestamp
|
||||
gui_tooltip: Timestamp-like entities.
|
||||
parent_type: category_metadata
|
||||
parser_enabled: false
|
||||
regex_pattern: \b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z\b
|
||||
script_parser: timestamp.py
|
||||
0
data/parser/__init__.py
Normal file
0
data/parser/__init__.py
Normal file
26
data/parser/bip39 copy.py
Normal file
26
data/parser/bip39 copy.py
Normal file
File diff suppressed because one or more lines are too long
28
data/parser/bip39.py
Normal file
28
data/parser/bip39.py
Normal file
File diff suppressed because one or more lines are too long
24
data/parser/btcaddr.py
Normal file
24
data/parser/btcaddr.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import re


def parse(text):
    """Find Bitcoin addresses (P2PKH, P2SH and Bech32) in *text*.

    Returns a list of ``(address, start_pos, end_pos)`` tuples, where
    ``start_pos``/``end_pos`` are the span of the match within *text*.
    """
    # Regular expressions for the common Bitcoin address formats.
    p2pkh_regex = r'\b1[1-9A-HJ-NP-Za-km-z]{25,34}\b'
    p2sh_regex = r'\b3[1-9A-HJ-NP-Za-km-z]{25,34}\b'
    # FIX: the class was written as [q,p,z], which also allowed a literal
    # comma after the "bc1" prefix; commas never occur in Bech32 addresses.
    bech32_regex = r'\bbc1[qpz][0-9a-z]{39,59}\b'
    bech32_regex1 = r'\bbc1[qpz0-9ac-hj-np-z]{38,58}\b'
    less_common_regex = r'\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b'

    # Combine all regexes.  FIX: the alternatives are joined with bare '|'.
    # The original used literal ' | ' separators, which made the spaces part
    # of the pattern and broke the last two alternatives (matches required a
    # surrounding space and the reported span included it).
    combined_regex = '|'.join(
        f'({pattern})'
        for pattern in (p2pkh_regex, p2sh_regex, bech32_regex,
                        less_common_regex, bech32_regex1)
    )

    matches = []
    for match in re.finditer(combined_regex, text):
        for addr in match.groups():
            if addr:  # only the alternative that actually matched is non-None
                start_pos, end_pos = match.span()
                matches.append((addr, start_pos, end_pos))

    # TODO: integrate regexes for extended public keys (xpub, ypub, zpub).
    # TODO: validate the Base58Check checksum to weed out false positives.
    return matches
|
||||
22
data/parser/generated_wordlist.py
Normal file
22
data/parser/generated_wordlist.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import re
import os


def load_wordlist(file_path):
    """Read a wordlist file and return its lines, stripped of whitespace."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]


def parse(text, wordlist_path=None):
    """Find wordlist entries in *text*, case-insensitively.

    Args:
        text: the text to scan.
        wordlist_path: optional path to the wordlist file; defaults to
            ``generated_wordlist.txt`` next to this module (the original
            hard-coded behaviour).

    Returns a list of ``(matched_text, start_pos, end_pos)`` tuples.
    """
    if wordlist_path is None:
        wordlist_path = os.path.join(os.path.dirname(__file__), 'generated_wordlist.txt')

    # FIX: drop blank lines.  A blank line produced an empty alternative in
    # the pattern ('(a||b)'), which matches the empty string at every
    # position in *text* and flooded the result list.
    wordlist = [word for word in load_wordlist(wordlist_path) if word]
    if not wordlist:
        return []  # no words -> nothing can match

    # Create a regex pattern that matches any word in the wordlist; spaces
    # inside a word match any run of whitespace.
    pattern = '(' + '|'.join(re.escape(word).replace(' ', r'\s+') for word in wordlist) + ')'

    matches = []
    for match in re.finditer(pattern, text, re.IGNORECASE):
        matched_word = match.group()
        start_pos, end_pos = match.span()
        matches.append((matched_word, start_pos, end_pos))

    return matches
|
||||
|
||||
22
data/parser/ipv4.py
Normal file
22
data/parser/ipv4.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import re
import ipaddress


def is_valid_ipv4_address(ip_addr):
    """Return True when *ip_addr* parses as an IPv4 address (public or private)."""
    try:
        parsed = ipaddress.ip_address(ip_addr)
    except ValueError:
        return False
    return isinstance(parsed, ipaddress.IPv4Address)


def parse(text):
    """Extract valid IPv4 addresses from *text*.

    Candidates found by the regex are re-validated with the ``ipaddress``
    module before being reported.

    Returns a list of ``(address, start_pos, end_pos)`` tuples.
    """
    # One octet, pinned to 0-255 so the regex itself rejects e.g. "999".
    octet = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
    # Digit look-arounds (rather than \b) reject candidates embedded in a
    # longer digit run such as "1192.168.0.1".
    candidate_regex = r'(?<!\d)' + octet + 3 * (r'\.' + octet) + r'(?!\d)'

    return [
        (hit.group(), *hit.span())
        for hit in re.finditer(candidate_regex, text)
        if is_valid_ipv4_address(hit.group())
    ]
|
||||
|
||||
22
data/parser/ipv4pr.py
Normal file
22
data/parser/ipv4pr.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import re
import ipaddress


def is_private_ip(ip_addr):
    """Return True when *ip_addr* parses as an address in a private range."""
    try:
        parsed = ipaddress.ip_address(ip_addr)
    except ValueError:
        return False
    return parsed.is_private


def parse(text):
    """Extract private-range IPv4 addresses from *text*.

    Regex candidates are filtered through ``ipaddress`` so only addresses
    in private ranges (RFC 1918 etc., per ``is_private``) are reported.

    Returns a list of ``(address, start_pos, end_pos)`` tuples.
    """
    # One octet, pinned to 0-255.
    octet = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
    candidate_regex = r'\b' + octet + 3 * (r'\.' + octet) + r'\b'

    return [
        (hit.group(), *hit.span())
        for hit in re.finditer(candidate_regex, text)
        if is_private_ip(hit.group())
    ]
|
||||
|
||||
|
||||
21
data/parser/ipv4pu.py
Normal file
21
data/parser/ipv4pu.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import re
import ipaddress


def is_public_ip(ip_addr):
    """Return True when *ip_addr* is a routable public address.

    Public here means: parseable, and neither private nor reserved nor
    loopback.
    """
    try:
        ip_obj = ipaddress.ip_address(ip_addr)
    except ValueError:
        return False
    return not (ip_obj.is_private or ip_obj.is_reserved or ip_obj.is_loopback)


def parse(text):
    """Extract public-range IPv4 addresses from *text*.

    Returns a list of ``(address, start_pos, end_pos)`` tuples.
    """
    # One octet, pinned to 0-255.
    octet = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
    candidate_regex = r'\b' + octet + 3 * (r'\.' + octet) + r'\b'

    return [
        (hit.group(), *hit.span())
        for hit in re.finditer(candidate_regex, text)
        if is_public_ip(hit.group())
    ]
|
||||
20
data/parser/ipv6.py
Normal file
20
data/parser/ipv6.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import re
import ipaddress


def is_valid_ipv6_address(ip_addr):
    """Return True when *ip_addr* parses as an IPv6 address."""
    try:
        parsed = ipaddress.ip_address(ip_addr)
    except ValueError:
        return False
    return isinstance(parsed, ipaddress.IPv6Address)


def parse(text):
    """Extract valid IPv6 addresses from *text*.

    The regex covers the usual textual forms (full, "::"-compressed,
    link-local with zone index, IPv4-mapped tail); because it can
    over-match, every candidate is re-validated with ``ipaddress``.

    Returns a list of ``(address, start_pos, end_pos)`` tuples.
    """
    ipv6_regex = r'(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))'

    return [
        (hit.group(), *hit.span())
        for hit in re.finditer(ipv6_regex, text, re.IGNORECASE)
        if is_valid_ipv6_address(hit.group())
    ]
|
||||
100
data/parser/telnum.py
Normal file
100
data/parser/telnum.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import phonenumbers
import logging
import re


# Region codes tried, in order, when a candidate number carries no explicit
# country code.  FIX: hoisted out of the signature and made a tuple — the
# original used a mutable list as a default argument (shared across calls).
_DEFAULT_REGIONS = (
    'US', 'GB', 'DE', 'FR', 'ES', 'IT', 'RU', 'CN', 'IN', 'JP',
    'BR', 'ZA', 'NG', 'EG', 'TR', 'ID', 'AU', 'CA', 'MX', 'AR',
    'KR', 'TH', 'VN', 'PH', 'MY', 'SA', 'IR', 'PK', 'BD', 'UA',
    'PL', 'NL', 'BE', 'CH', 'AT', 'SE', 'NO', 'DK', 'FI', 'IL',
    'SG', 'HK', 'NZ', 'AE', 'KE', 'CO', 'VE', 'PE', 'CL', 'GR',
    'PT', 'CZ', 'RO', 'HU', 'BG', 'SK', 'SI', 'HR', 'RS', 'LT',
    'LV', 'EE', 'CY', 'LU', 'MT', 'IS', 'KZ', 'UZ', 'AM', 'AZ',
    'GE', 'MN', 'KG', 'TJ', 'TM', 'BT', 'NP', 'LK', 'MM', 'KH',
    'LA', 'BN', 'FJ', 'PW', 'SB', 'VU', 'FM', 'WS', 'TO', 'TV',
    'KI', 'NR', 'MQ', 'GF', 'RE', 'YT', 'PF', 'NC', 'WF', 'TF',
    'AI', 'AG', 'AW', 'BS', 'BB', 'BZ', 'BM', 'VG', 'KY', 'CU',
    'CW', 'DM', 'DO', 'GD', 'GP', 'HT', 'JM', 'MQ', 'MS', 'PR',
    'KN', 'LC', 'VC', 'SX', 'TT', 'TC', 'VI', 'BO', 'BQ', 'EC',
    'GY', 'PY', 'SR', 'UY', 'DZ', 'AO', 'BJ', 'BW', 'BF', 'BI',
    'CV', 'CM', 'CF', 'TD', 'KM', 'CG', 'CD', 'DJ', 'GQ', 'ER',
    'SZ', 'ET', 'GA', 'GM', 'GH', 'GN', 'GW', 'CI', 'LS', 'LR',
    'LY', 'MG', 'MW', 'ML', 'MR', 'MU', 'MA', 'MZ', 'NA', 'NE',
    'NG', 'RW', 'ST', 'SN', 'SC', 'SL', 'SO', 'SS', 'SD', 'TZ',
    'TG', 'TN', 'UG', 'ZM', 'ZW',
)

# Timestamp layouts whose digit runs are commonly mistaken for phone
# numbers; the strptime format string is kept for documentation.
# FIX: hoisted to module level — the original rebuilt both lists on every
# parse() call.
_TIMESTAMP_PATTERNS = [
    (r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', '%Y-%m-%d %H:%M:%S'),  # ISO 8601 Extended
    (r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}', '%Y/%m/%d %H:%M:%S'),  # ISO 8601 with slashes
    (r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}', '%d/%m/%Y %H:%M:%S'),  # European Date Format
    (r'\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}', '%m-%d-%Y %H:%M:%S'),  # US Date Format
    (r'\d{8}_\d{6}', '%Y%m%d_%H%M%S'),                              # Compact Format
    (r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}', '%Y-%m-%dT%H:%M:%S'),  # ISO 8601 Basic
    (r'\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2}', '%d.%m.%Y %H:%M:%S'),# German Date Format
    (r'\d{4}\d{2}\d{2} \d{2}:\d{2}:\d{2}', '%Y%m%d %H:%M:%S'),      # Basic Format without Separators
    (r'\d{1,2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2}:\d{2}', '%d-%b-%Y %H:%M:%S'),  # English Date Format with Month Name
    (r'(?:19|20)\d{10}', '%Y%m%d%H%M'),                             # Compact Numeric Format
    # Add more patterns as needed
]

# Non-timestamp contexts that also yield phone-like digit runs.
_UNLIKELY_PHONE_PATTERNS = [
    r'\d{5,}\s?bytes',      # File size in bytes
    r'https?://\S+',        # URLs
    r'\bversion \d+',       # 'version' followed by numbers
    r'cve-\d{4}-\d+',       # CVE identifiers
    r'\S+\.onion\S*',       # Onion addresses
    r'Product ID: \S+',     # Product IDs
    r'\|\s*[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\s*\|',  # UUIDs
    r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',  # IP addresses
    r'Mem: \d+\s+\d+\s+\d+',    # Memory sizes
    r'Total: \d+\s+\d+\s+\d+',  # Total memory
    r'block_size=\d+',          # Block size
    r'-rw-------\s+\d+\s+\S+\s+\S+\s+\d+\s+\S+\s+\d{1,2}\s+\d{1,2}:\d{2}',  # File details
    r'\d+\.\d+\.\d+\.\d+\s+\d+\s+\S+\s+\d+',    # IP and port patterns
    # Add more patterns as needed
]


def _is_unlikely_phone_context(extended_text):
    """Return True when the context window looks like a timestamp, URL, ID
    or similar digit-heavy non-phone content."""
    for pattern, _ in _TIMESTAMP_PATTERNS:
        if re.search(pattern, extended_text):
            return True
    for pattern in _UNLIKELY_PHONE_PATTERNS:
        if re.search(pattern, extended_text):
            return True
    return False


def parse(text, default_regions=_DEFAULT_REGIONS):
    """Find probable telephone numbers in *text*.

    Each regex candidate is first screened against its surrounding context
    (to drop timestamps, URLs, UUIDs, IPs, ...), then validated with the
    ``phonenumbers`` library against *default_regions* in order; the first
    region that yields a valid number wins.

    Args:
        text: the text to scan.
        default_regions: iterable of ISO region codes to try for numbers
            without an explicit country code.

    Returns a list of ``(raw_number, start_pos, end_pos)`` tuples.
    """
    matches = []

    # More specific regex for phone numbers: optional country code and
    # area code, then two 3-5 digit groups.
    phone_regex = r'\b(\+?\d{1,3}[\s-]?)?(\(?\d{1,4}\)?[\s-]?)?\d{3,5}[\s-]?\d{3,5}\b'

    for number_match in re.finditer(phone_regex, text):
        raw_number = number_match.group()
        start_pos, end_pos = number_match.span()

        # Extend the search window by 50 chars on each side for context.
        extended_start = max(0, start_pos - 50)
        extended_end = min(len(text), end_pos + 50)
        if _is_unlikely_phone_context(text[extended_start:extended_end]):
            continue  # context says this is not a phone number

        valid_number_found = False
        for region in default_regions:
            try:
                parsed_number = phonenumbers.parse(raw_number, region)
            except phonenumbers.NumberParseException:
                continue
            if phonenumbers.is_valid_number(parsed_number):
                matches.append((raw_number, start_pos, end_pos))
                valid_number_found = True
                break

        if not valid_number_found:
            # Lazy %-formatting so the string is only built when DEBUG is on.
            logging.debug("Failed to parse number: %s", raw_number)

    return matches
|
||||
26
data/parser/timestamp.py
Normal file
26
data/parser/timestamp.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import re


# (regex, strptime-format) pairs for the timestamp layouts we recognise.
# The format string documents the layout; parse() only uses the regex.
timestamp_patterns = [
    (r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', '%Y-%m-%d %H:%M:%S'),  # ISO 8601 Extended
    (r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}', '%Y/%m/%d %H:%M:%S'),  # ISO 8601 with slashes
    (r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}', '%d/%m/%Y %H:%M:%S'),  # European Date Format
    (r'\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}', '%m-%d-%Y %H:%M:%S'),  # US Date Format
    (r'\d{8}_\d{6}', '%Y%m%d_%H%M%S'),                              # Compact Format
    (r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}', '%Y-%m-%dT%H:%M:%S'),  # ISO 8601 Basic
    (r'\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2}', '%d.%m.%Y %H:%M:%S'),# German Date Format
    (r'\d{4}\d{2}\d{2} \d{2}:\d{2}:\d{2}', '%Y%m%d %H:%M:%S'),      # Basic Format without Separators
    (r'\d{1,2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2}:\d{2}', '%d-%b-%Y %H:%M:%S'),  # English Date Format with Month Name
    (r'(?:19|20)\d{10}', '%Y%m%d%H%M'),                             # Compact Numeric Format
    # Add more patterns as needed
]


def parse(text):
    """Extract timestamp-like substrings from *text*.

    Returns a list of ``(timestamp, start_pos, end_pos)`` tuples, grouped
    by pattern in declaration order (not sorted by position).
    """
    return [
        (hit.group(), *hit.span())
        for regex, _fmt in timestamp_patterns
        for hit in re.finditer(regex, text)
    ]
|
||||
21
data/parser/url.py
Normal file
21
data/parser/url.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import tldextract
import re


def parse(text):
    """Extract URLs with a plausible registered domain from *text*.

    A loose scheme-based regex finds candidates; each candidate is kept
    only when tldextract reports both a domain and a suffix (which
    presumably filters out hosts without a recognised public suffix,
    e.g. "http://localhost" — verify against tldextract's suffix list).

    Returns a list of ``(url, start_pos, end_pos)`` tuples.
    """
    # Regular expression for detecting potential URLs.
    url_regex = r'\b(?:https?|ftp):\/\/[^\s]+'

    results = []
    for candidate in re.finditer(url_regex, text):
        full_url = candidate.group()

        # Use tldextract to validate the domain and suffix.
        extracted = tldextract.extract(full_url)
        if extracted.domain and extracted.suffix:
            results.append((full_url, *candidate.span()))

    return results
|
||||
|
||||
|
||||
12
data/parser/xmraddr.py
Normal file
12
data/parser/xmraddr.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import re


def parse(text):
    """Extract Monero addresses from *text*.

    Matches 95-character base58 strings starting with '4' or '8' (the
    base58 alphabet here excludes 0, O, I and l).

    Returns a list of ``(address, start_pos, end_pos)`` tuples.
    """
    xmr_regex = r'\b4[123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz]{94}\b|\b8[123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz]{94}\b'
    return [(hit.group(), *hit.span()) for hit in re.finditer(xmr_regex, text)]
|
||||
Reference in New Issue
Block a user