From f721c936e8eafd3296637a63207cff13dfc665ac Mon Sep 17 00:00:00 2001 From: overcuriousity Date: Wed, 15 Oct 2025 12:40:56 +0200 Subject: [PATCH] updates --- README.md | 221 ++++--------- browser2timesketch.py | 732 +++++++++++++++++++++++++++++++----------- 2 files changed, 599 insertions(+), 354 deletions(-) diff --git a/README.md b/README.md index f84f216..15da86d 100644 --- a/README.md +++ b/README.md @@ -1,87 +1,60 @@ # Browser History to Timesketch Converter -Converts browser history from the three major browser engines to Timesketch-compatible CSV format. +Converts browser history from Firefox, Chrome, Safari, and all Chromium-based browsers to Timesketch-compatible CSV format. -## Supported Browser Engines +## Requirements -- **Gecko** - Firefox and derivatives (Waterfox, LibreWolf, etc.) -- **Chromium** - All Chromium-based browsers (Chrome, Edge, Brave, Opera, Vivaldi, Arc, etc.) -- **WebKit** - Safari - -## Why Only Three Types? - -All Chromium-based browsers (Chrome, Edge, Brave, Opera, Vivaldi, etc.) use **identical database schemas**. There's no need to handle them differently - they all use the same History database format with the same table structures and timestamp formats. The only difference is the file location, which you provide as input. - -Similarly, all Gecko-based browsers (Firefox forks) use the same places.sqlite format. +- Python 3.6+ +- No external dependencies (standard library only) ## Usage +### Simple (Auto-detect browser type) ```bash -python browser2timesketch.py -b -i -o +python browser2timesketch.py -i ``` -### Arguments +### With Options +```bash +python browser2timesketch.py [OPTIONS] -i +``` -- `-b, --browser`: Browser engine type - - `firefox` or `gecko` - For Firefox and Firefox-based browsers - - `chromium` - For all Chromium-based browsers - - `safari` or `webkit` - For Safari -- `-i, --input`: Path to browser history database file -- `-o, --output`: Output CSV file path (optional, default: browser_history_timesketch.csv) -- `--browser-name`: Custom browser name for the data_type field (optional) +## Command-Line Arguments -## Database File Locations +| Argument | Required | Description | +|----------|----------|-------------| +| `-i`, `--input` | Yes | Path to browser history database file | +| `-b`, `--browser` | No | Browser type: `firefox`, `chromium`, `safari`, or `auto` (default: auto) | +| `-o`, `--output` | No | Output CSV file path (default: auto-generated) | +| `--browser-name` | No | Custom browser name for data_type field (e.g., "Brave", "Edge") | -### How to Find Your Profile Path +## Finding Browser Database Files + +### Firefox (all platforms) -#### Gecko / Firefox 1. Open Firefox -2. Type `about:support` in the address bar and press Enter +2. Type `about:support` in address bar 3. Look for **Profile Folder** or **Profile Directory** -4. Click "Open Folder" / "Open Directory" button, or note the path shown -5. The `places.sqlite` file is in this directory - -Alternative: Type `about:profiles` to see all profiles and their locations. - -#### Chromium (Chrome/Edge/Brave/Opera/Vivaldi/etc.) -1. Open your Chromium-based browser -2. Type `chrome://version/` in the address bar and press Enter -3. Look for **Profile Path** - this shows the full path to your profile directory -4. The `History` file (no extension) is in this directory - -Note: For browsers based on Chromium, use the same URL even if it's not Chrome: -- Edge: `edge://version/` -- Brave: `brave://version/` -- Opera: `opera://about/` -- Vivaldi: `vivaldi://about/` - -#### WebKit / Safari -Safari's history database is always at the same location on macOS: -`~/Library/Safari/History.db` - -To view in Finder: -1. Open Finder -2. Press `Cmd + Shift + G` (Go to Folder) -3. Type `~/Library/Safari/` -4. Press Enter - -### Standard Profile Locations - -If you prefer to navigate directly to the standard locations: - -### Gecko / Firefox - -**Database file:** `places.sqlite` +4. Click **Open Folder** button +5. Find `places.sqlite` in that folder +**Standard locations:** - **Linux:** `~/.mozilla/firefox//places.sqlite` - **macOS:** `~/Library/Application Support/Firefox/Profiles//places.sqlite` - **Windows:** `%APPDATA%\Mozilla\Firefox\Profiles\\places.sqlite` -### Chromium (Chrome/Edge/Brave/Opera/Vivaldi/etc.) +### Chrome, Edge, Brave, Opera, Vivaldi (all Chromium browsers) -**Database file:** `History` (no file extension) +1. Open your browser +2. Type `chrome://version/` in address bar + - For Edge: `edge://version/` + - For Brave: `brave://version/` + - For Opera: `opera://about/` + - For Vivaldi: `vivaldi://about/` +3. Look for **Profile Path** +4. Find `History` file (no extension) in that folder -All Chromium browsers use the same database format. Only the location differs: +**Standard locations:** **Google Chrome:** - **Linux:** `~/.config/google-chrome/Default/History` @@ -108,124 +81,38 @@ All Chromium browsers use the same database format. Only the location differs: - **macOS:** `~/Library/Application Support/Vivaldi/Default/History` - **Windows:** `%LOCALAPPDATA%\Vivaldi\User Data\Default\History` -### WebKit / Safari +### Safari (macOS only) -**Database file:** `History.db` +**Location:** `~/Library/Safari/History.db` -- **macOS:** `~/Library/Safari/History.db` +**To open in Finder:** +1. Press `Cmd + Shift + G` +2. Type `~/Library/Safari/` +3. Press Enter ## Examples -### Firefox (or any Gecko-based browser) +### Auto-detect (simplest) ```bash -# Linux -python browser2timesketch.py -b firefox -i ~/.mozilla/firefox/xyz123.default/places.sqlite -o firefox_history.csv - -# macOS -python browser2timesketch.py -b gecko -i "~/Library/Application Support/Firefox/Profiles/xyz123.default/places.sqlite" -o firefox_history.csv - -# Windows -python browser2timesketch.py -b firefox -i "C:\Users\YourUser\AppData\Roaming\Mozilla\Firefox\Profiles\xyz123.default\places.sqlite" -o firefox_history.csv +python browser2timesketch.py -i ~/.mozilla/firefox/abc123.default/places.sqlite +python browser2timesketch.py -i ~/.config/google-chrome/Default/History +python browser2timesketch.py -i ~/Library/Safari/History.db ``` -### Chrome (or any Chromium-based browser) +### Specify browser type ```bash -# Linux - Chrome -python browser2timesketch.py -b chromium -i ~/.config/google-chrome/Default/History -o chrome_history.csv - -# macOS - Chrome -python browser2timesketch.py -b chromium -i "~/Library/Application Support/Google/Chrome/Default/History" -o chrome_history.csv - -# Windows - Chrome -python browser2timesketch.py -b chromium -i "C:\Users\YourUser\AppData\Local\Google\Chrome\User Data\Default\History" -o chrome_history.csv - -# Linux - Brave with custom label -python browser2timesketch.py -b chromium --browser-name "Brave" -i ~/.config/BraveSoftware/Brave-Browser/Default/History -o brave_history.csv - -# Windows - Edge -python browser2timesketch.py -b chromium -i "C:\Users\YourUser\AppData\Local\Microsoft\Edge\User Data\Default\History" -o edge_history.csv +python browser2timesketch.py -b firefox -i places.sqlite -o firefox.csv +python browser2timesketch.py -b chromium -i History -o chrome.csv +python browser2timesketch.py -b safari -i History.db -o safari.csv ``` -### Safari +### With custom browser name ```bash -# macOS -python browser2timesketch.py -b safari -i ~/Library/Safari/History.db -o safari_history.csv - -# Or using the webkit alias -python browser2timesketch.py -b webkit -i ~/Library/Safari/History.db -o safari_history.csv +python browser2timesketch.py --browser-name "Brave" -i ~/.config/BraveSoftware/Brave-Browser/Default/History ``` -## Output Format +## Notes -The script generates a CSV file with Timesketch-compatible fields: - -| Field | Description | All Browsers | -|-------|-------------|--------------| -| `timestamp` | Unix timestamp in microseconds | ✓ | -| `datetime` | ISO 8601 formatted datetime | ✓ | -| `timestamp_desc` | Description of timestamp | ✓ | -| `message` | Human-readable event description | ✓ | -| `url` | The visited URL | ✓ | -| `title` | Page title | ✓ | -| `data_type` | Source identifier | ✓ | -| `visit_type` | Type of visit | Gecko, Chromium | -| `visit_duration_us` | Visit duration in microseconds | Chromium only | -| `total_visits` | Total visits to this URL | Chromium only | -| `typed_count` | Times URL was typed | Chromium only | - -## Browser Engine Details - -### Timestamp Formats - -Each browser engine uses a different timestamp format: - -- **Gecko (Firefox):** Microseconds since Unix epoch (1970-01-01 00:00:00 UTC) -- **Chromium:** Microseconds since Windows epoch (1601-01-01 00:00:00 UTC) -- **WebKit (Safari):** Seconds since Cocoa epoch (2001-01-01 00:00:00 UTC) - -The script automatically converts all timestamps to Unix microseconds for Timesketch. - -### Database Schemas - -- **Gecko:** Uses `moz_historyvisits` and `moz_places` tables in `places.sqlite` -- **Chromium:** Uses `visits` and `urls` tables in `History` database -- **WebKit:** Uses `history_visits` and `history_items` tables in `History.db` - -## Important Notes - -1. **Close the browser** before running the script to avoid database lock errors -2. **Copy the database file** to a temporary location if you want to avoid potential issues -3. **Handle output carefully** - the CSV contains your complete browsing history -4. Different browsers may have multiple profiles - make sure you're pointing to the correct profile directory -5. On Windows, use quotes around paths that contain spaces - -## Troubleshooting - -### Database is locked -- Close the browser completely -- Copy the database file to a temporary location and run the script on the copy - -### File not found -- Verify the profile directory name (the random string like `xyz123.default`) -- Check that the browser has been used and has history -- On macOS, use tab completion or check the exact path - -### Permission denied -- Run with appropriate permissions -- On Linux/macOS, check file permissions with `ls -l` -- On Windows, run as Administrator if needed - -## Requirements - -- Python 3.6 or higher -- No external dependencies (uses only standard library) - -## Privacy and Security - -This tool exports your complete browsing history. The output file contains: -- All visited URLs -- Page titles -- Visit timestamps -- Visit types and patterns - -Handle the output files appropriately and delete them when no longer needed. \ No newline at end of file +- Close your browser before running to avoid database locks (or the script will use read-only mode) +- Output contains complete browsing history - handle securely +- On Windows, use quotes around paths with spaces \ No newline at end of file diff --git a/browser2timesketch.py b/browser2timesketch.py index 9f10259..65df3c9 100755 --- a/browser2timesketch.py +++ b/browser2timesketch.py @@ -9,11 +9,144 @@ Supports: Gecko (Firefox), Chromium (Chrome/Edge/Brave/etc.), WebKit (Safari) import sqlite3 import csv import argparse +import sys from datetime import datetime, timedelta from pathlib import Path +from typing import Tuple, Optional, Dict, Any, List -def convert_gecko_timestamp(gecko_timestamp): +class BrowserDetectionError(Exception): + """Raised when browser type cannot be detected""" + pass + + +class DatabaseValidationError(Exception): + """Raised when database validation fails""" + pass + + +class TimestampValidationError(Exception): + """Raised when timestamp validation fails""" + pass + + +def validate_sqlite_database(db_path: str) -> None: + """ + Validate that the file is a SQLite database and is accessible. + + Args: + db_path: Path to database file + + Raises: + DatabaseValidationError: If validation fails + """ + path = Path(db_path) + + if not path.exists(): + raise DatabaseValidationError(f"Database file not found: {db_path}") + + if not path.is_file(): + raise DatabaseValidationError(f"Path is not a file: {db_path}") + + # Try to open as SQLite database + try: + conn = sqlite3.connect(f'file:{db_path}?mode=ro', uri=True) + cursor = conn.cursor() + # Check if it's a valid SQLite database by querying sqlite_master + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1") + conn.close() + except sqlite3.DatabaseError as e: + raise DatabaseValidationError(f"Not a valid SQLite database: {db_path}. Error: {e}") + except sqlite3.OperationalError as e: + raise DatabaseValidationError(f"Cannot access database (may be locked or corrupted): {db_path}. Error: {e}") + + +def detect_browser_type(db_path: str) -> str: + """ + Auto-detect browser type by examining database schema. + + Args: + db_path: Path to database file + + Returns: + Detected browser type: 'gecko', 'chromium', or 'webkit' + + Raises: + BrowserDetectionError: If browser type cannot be determined + """ + try: + conn = sqlite3.connect(f'file:{db_path}?mode=ro', uri=True) + cursor = conn.cursor() + + # Get all table names + cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + tables = {row[0] for row in cursor.fetchall()} + + conn.close() + + # Check for Gecko/Firefox tables + if 'moz_historyvisits' in tables and 'moz_places' in tables: + return 'gecko' + + # Check for Chromium tables + if 'visits' in tables and 'urls' in tables: + return 'chromium' + + # Check for WebKit/Safari tables + if 'history_visits' in tables and 'history_items' in tables: + return 'webkit' + + raise BrowserDetectionError( + f"Cannot determine browser type. Found tables: {', '.join(sorted(tables))}\n" + f"Expected one of:\n" + f" - Gecko/Firefox: moz_historyvisits, moz_places\n" + f" - Chromium: visits, urls\n" + f" - WebKit/Safari: history_visits, history_items" + ) + + except sqlite3.Error as e: + raise BrowserDetectionError(f"Error reading database schema: {e}") + + +def validate_timestamp(unix_microseconds: int, browser_type: str) -> None: + """ + Validate that a timestamp is within reasonable bounds. + + Args: + unix_microseconds: Timestamp in Unix microseconds + browser_type: Browser type for error messages + + Raises: + TimestampValidationError: If timestamp is unreasonable + """ + if unix_microseconds <= 0: + return # Allow 0 for missing timestamps + + # Convert to seconds for validation + timestamp_seconds = unix_microseconds / 1000000 + + # Check if timestamp is reasonable (between 1990 and 2040) + min_date = datetime(1990, 1, 1) + max_date = datetime(2040, 1, 1) + min_seconds = min_date.timestamp() + max_seconds = max_date.timestamp() + + if timestamp_seconds < min_seconds: + dt = datetime.utcfromtimestamp(timestamp_seconds) + raise TimestampValidationError( + f"Timestamp appears too old: {dt.strftime('%Y-%m-%d %H:%M:%S')} (before 1990). " + f"This may indicate a timestamp conversion error for {browser_type}." + ) + + if timestamp_seconds > max_seconds: + dt = datetime.utcfromtimestamp(timestamp_seconds) + raise TimestampValidationError( + f"Timestamp appears to be in the future: {dt.strftime('%Y-%m-%d %H:%M:%S')} (after 2040). " + f"This may indicate a timestamp conversion error for {browser_type}." + ) + + +def convert_gecko_timestamp(gecko_timestamp: Optional[int]) -> Tuple[int, str]: """ Convert Gecko/Firefox timestamp (microseconds since Unix epoch) to ISO format. Firefox stores timestamps as microseconds since 1970-01-01 00:00:00 UTC. @@ -24,16 +157,19 @@ def convert_gecko_timestamp(gecko_timestamp): Returns: tuple: (microseconds, ISO formatted datetime string) """ - if gecko_timestamp is None: + if gecko_timestamp is None or gecko_timestamp == 0: return 0, "" + # Validate + validate_timestamp(gecko_timestamp, "Gecko/Firefox") + # Convert microseconds to seconds timestamp_seconds = gecko_timestamp / 1000000 dt = datetime.utcfromtimestamp(timestamp_seconds) return gecko_timestamp, dt.strftime('%Y-%m-%dT%H:%M:%S+00:00') -def convert_chromium_timestamp(chromium_timestamp): +def convert_chromium_timestamp(chromium_timestamp: Optional[int]) -> Tuple[int, str]: """ Convert Chromium timestamp to Unix microseconds and ISO format. Chromium stores timestamps as microseconds since 1601-01-01 00:00:00 UTC (Windows epoch). @@ -58,11 +194,14 @@ def convert_chromium_timestamp(chromium_timestamp): # Convert to Unix microseconds for Timesketch unix_microseconds = int(timestamp_seconds * 1000000) + # Validate + validate_timestamp(unix_microseconds, "Chromium") + dt = datetime.utcfromtimestamp(timestamp_seconds) return unix_microseconds, dt.strftime('%Y-%m-%dT%H:%M:%S+00:00') -def convert_webkit_timestamp(webkit_timestamp): +def convert_webkit_timestamp(webkit_timestamp: Optional[float]) -> Tuple[int, str]: """ Convert WebKit/Safari timestamp to Unix microseconds and ISO format. Safari stores timestamps as seconds (with decimal) since 2001-01-01 00:00:00 UTC (Cocoa/Core Data epoch). @@ -87,11 +226,82 @@ def convert_webkit_timestamp(webkit_timestamp): # Convert to Unix microseconds for Timesketch unix_microseconds = int(timestamp_seconds * 1000000) + # Validate + validate_timestamp(unix_microseconds, "WebKit/Safari") + dt = datetime.utcfromtimestamp(timestamp_seconds) return unix_microseconds, dt.strftime('%Y-%m-%dT%H:%M:%S+00:00') -def extract_chromium_history(db_path, output_csv, browser_name=None): +def write_timesketch_csv(output_csv: str, fieldnames: List[str], rows: List[Dict[str, Any]]) -> None: + """ + Write history data to Timesketch-compatible CSV format. + + Args: + output_csv: Path to output CSV file + fieldnames: List of CSV field names + rows: List of row dictionaries to write + """ + with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + + for row in rows: + writer.writerow(row) + + +def connect_database_readonly(db_path: str) -> sqlite3.Connection: + """ + Connect to database in read-only mode to avoid lock issues. + + Args: + db_path: Path to database file + + Returns: + SQLite connection object + + Raises: + sqlite3.OperationalError: If database is locked or inaccessible + """ + try: + # Use URI with read-only mode to avoid locking issues + conn = sqlite3.connect(f'file:{db_path}?mode=ro', uri=True) + return conn + except sqlite3.OperationalError as e: + raise sqlite3.OperationalError( + f"Cannot open database (it may be locked by the browser): {db_path}\n" + f"Please close {db_path.split('/')[-2] if '/' in db_path else 'the browser'} " + f"and try again, or copy the database file to a temporary location.\n" + f"Original error: {e}" + ) + + +def validate_browser_schema(conn: sqlite3.Connection, expected_tables: List[str], browser_name: str) -> None: + """ + Validate that required tables exist in the database. + + Args: + conn: Database connection + expected_tables: List of required table names + browser_name: Browser name for error messages + + Raises: + DatabaseValidationError: If required tables are missing + """ + cursor = conn.cursor() + cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + existing_tables = {row[0] for row in cursor.fetchall()} + + missing_tables = set(expected_tables) - existing_tables + if missing_tables: + raise DatabaseValidationError( + f"Database does not appear to be a valid {browser_name} history database.\n" + f"Missing required tables: {', '.join(missing_tables)}\n" + f"Found tables: {', '.join(sorted(existing_tables))}" + ) + + +def extract_chromium_history(db_path: str, output_csv: str, browser_name: Optional[str] = None) -> int: """ Extract browser history from Chromium-based browsers and convert to Timesketch CSV. Works with all Chromium-based browsers: Chrome, Edge, Brave, Chromium, Opera, Vivaldi, etc. @@ -100,17 +310,23 @@ def extract_chromium_history(db_path, output_csv, browser_name=None): db_path: Path to Chromium History database output_csv: Path to output CSV file browser_name: Optional custom name for data_type field (default: "Chromium") + + Returns: + Number of entries processed + + Raises: + DatabaseValidationError: If database validation fails + sqlite3.Error: If database query fails """ - if browser_name is None: browser_name = "Chromium" - # Check if database exists - if not Path(db_path).exists(): - raise FileNotFoundError(f"Chromium database not found: {db_path}") + # Connect to database + conn = connect_database_readonly(db_path) + + # Validate schema + validate_browser_schema(conn, ['visits', 'urls'], browser_name) - # Connect to Chromium SQLite database - conn = sqlite3.connect(db_path) cursor = conn.cursor() # Query to extract history visits with URL information @@ -129,11 +345,14 @@ def extract_chromium_history(db_path, output_csv, browser_name=None): ORDER BY visits.visit_time """ - cursor.execute(query) - results = cursor.fetchall() + try: + cursor.execute(query) + results = cursor.fetchall() + except sqlite3.Error as e: + conn.close() + raise sqlite3.Error(f"Error querying {browser_name} history: {e}") # Transition type mapping (Chromium transition types) - # Core types (bits 0-7) transition_types = { 0: "Link", 1: "Typed", @@ -148,66 +367,72 @@ def extract_chromium_history(db_path, output_csv, browser_name=None): 10: "Keyword_Generated" } - # Write to Timesketch CSV format - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - fieldnames = [ - 'timestamp', - 'datetime', - 'timestamp_desc', - 'message', - 'url', - 'title', - 'visit_type', - 'visit_duration_us', - 'total_visits', - 'typed_count', - 'data_type' - ] + rows = [] + validation_errors = [] + + for idx, row in enumerate(results): + chromium_timestamp = row[0] + url = row[1] or "" + title = row[2] or "(No title)" + transition = row[3] + visit_duration = row[4] or 0 + visit_count = row[5] or 0 + typed_count = row[6] or 0 - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() + # Extract core transition type (lower 8 bits) + core_transition = transition & 0xFF + transition_name = transition_types.get(core_transition, f"Unknown({core_transition})") - for row in results: - chromium_timestamp = row[0] - url = row[1] or "" - title = row[2] or "(No title)" - transition = row[3] - visit_duration = row[4] or 0 - visit_count = row[5] or 0 - typed_count = row[6] or 0 - last_visit = row[7] - - # Extract core transition type (lower 8 bits) - core_transition = transition & 0xFF - transition_name = transition_types.get(core_transition, f"Unknown({core_transition})") - - # Convert timestamp + # Convert timestamp + try: unix_microseconds, iso_datetime = convert_chromium_timestamp(chromium_timestamp) - - # Construct message - message = f"Visited: {title}" - - writer.writerow({ - 'timestamp': unix_microseconds, - 'datetime': iso_datetime, - 'timestamp_desc': 'Visit Time', - 'message': message, - 'url': url, - 'title': title, - 'visit_type': transition_name, - 'visit_duration_us': visit_duration, - 'total_visits': visit_count, - 'typed_count': typed_count, - 'data_type': f'{browser_name.lower()}:history:visit' - }) + except TimestampValidationError as e: + validation_errors.append(f"Entry {idx + 1}: {e}") + if len(validation_errors) <= 3: # Only store first few errors + continue + else: + break # Too many errors, likely a systematic issue + + # Construct message + message = f"Visited: {title}" + + rows.append({ + 'timestamp': unix_microseconds, + 'datetime': iso_datetime, + 'timestamp_desc': 'Visit Time', + 'message': message, + 'url': url, + 'title': title, + 'visit_type': transition_name, + 'visit_duration_us': visit_duration, + 'total_visits': visit_count, + 'typed_count': typed_count, + 'data_type': f'{browser_name.lower()}:history:visit' + }) conn.close() - print(f"Successfully converted {len(results)} history entries from {browser_name}") - print(f"Output saved to: {output_csv}") + # Report validation errors if any + if validation_errors: + print(f"Warning: Found {len(validation_errors)} timestamp validation errors:", file=sys.stderr) + for error in validation_errors[:3]: + print(f" {error}", file=sys.stderr) + if len(validation_errors) > 3: + print(f" ... and {len(validation_errors) - 3} more errors", file=sys.stderr) + print(f"Continuing with {len(rows)} valid entries...", file=sys.stderr) + + # Write CSV + fieldnames = [ + 'timestamp', 'datetime', 'timestamp_desc', 'message', + 'url', 'title', 'visit_type', 'visit_duration_us', + 'total_visits', 'typed_count', 'data_type' + ] + write_timesketch_csv(output_csv, fieldnames, rows) + + return len(rows) -def extract_gecko_history(db_path, output_csv, browser_name=None): +def extract_gecko_history(db_path: str, output_csv: str, browser_name: Optional[str] = None) -> int: """ Extract browser history from Gecko-based browsers (Firefox) and convert to Timesketch CSV. Works with Firefox and Firefox derivatives (Waterfox, LibreWolf, etc.) @@ -216,17 +441,23 @@ def extract_gecko_history(db_path, output_csv, browser_name=None): db_path: Path to Gecko places.sqlite database output_csv: Path to output CSV file browser_name: Optional custom name for data_type field (default: "Firefox") + + Returns: + Number of entries processed + + Raises: + DatabaseValidationError: If database validation fails + sqlite3.Error: If database query fails """ - if browser_name is None: browser_name = "Firefox" - # Check if database exists - if not Path(db_path).exists(): - raise FileNotFoundError(f"Gecko database not found: {db_path}") + # Connect to database + conn = connect_database_readonly(db_path) + + # Validate schema + validate_browser_schema(conn, ['moz_historyvisits', 'moz_places'], browser_name) - # Connect to Firefox SQLite database - conn = sqlite3.connect(db_path) cursor = conn.cursor() # Query to extract history visits with URL information @@ -243,8 +474,12 @@ def extract_gecko_history(db_path, output_csv, browser_name=None): ORDER BY moz_historyvisits.visit_date """ - cursor.execute(query) - results = cursor.fetchall() + try: + cursor.execute(query) + results = cursor.fetchall() + except sqlite3.Error as e: + conn.close() + raise sqlite3.Error(f"Error querying {browser_name} history: {e}") # Visit type mapping (Firefox visit types) visit_types = { @@ -259,59 +494,66 @@ def extract_gecko_history(db_path, output_csv, browser_name=None): 9: "Reload" } - # Write to Timesketch CSV format - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - # Timesketch expected fields - fieldnames = [ - 'timestamp', - 'datetime', - 'timestamp_desc', - 'message', - 'url', - 'title', - 'visit_type', - 'data_type' - ] + rows = [] + validation_errors = [] + + for idx, row in enumerate(results): + timestamp_us = row[0] + url = row[1] or "" + title = row[2] or "(No title)" + description = row[3] or "" + visit_type_id = row[4] - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() + visit_type_name = visit_types.get(visit_type_id, f"Unknown({visit_type_id})") - for row in results: - timestamp_us = row[0] # Firefox timestamp in microseconds - url = row[1] or "" - title = row[2] or "(No title)" - description = row[3] or "" - visit_type_id = row[4] - from_visit = row[5] - - visit_type_name = visit_types.get(visit_type_id, f"Unknown({visit_type_id})") - - # Convert timestamp + # Convert timestamp + try: unix_microseconds, iso_datetime = convert_gecko_timestamp(timestamp_us) - - # Construct message - message = f"Visited: {title}" - if description: - message += f" - {description}" - - writer.writerow({ - 'timestamp': unix_microseconds, - 'datetime': iso_datetime, - 'timestamp_desc': 'Visit Time', - 'message': message, - 'url': url, - 'title': title, - 'visit_type': visit_type_name, - 'data_type': f'{browser_name.lower()}:history:visit' - }) + except TimestampValidationError as e: + validation_errors.append(f"Entry {idx + 1}: {e}") + if len(validation_errors) <= 3: + continue + else: + break + + # Construct message + message = f"Visited: {title}" + if description: + message += f" - {description}" + + rows.append({ + 'timestamp': unix_microseconds, + 'datetime': iso_datetime, + 'timestamp_desc': 'Visit Time', + 'message': message, + 'url': url, + 'title': title, + 'visit_type': visit_type_name, + 'data_type': f'{browser_name.lower()}:history:visit' + }) conn.close() - print(f"Successfully converted {len(results)} history entries from {browser_name}") - print(f"Output saved to: {output_csv}") + # Report validation errors if any + if validation_errors: + print(f"Warning: Found {len(validation_errors)} timestamp validation errors:", file=sys.stderr) + for error in validation_errors[:3]: + print(f" {error}", file=sys.stderr) + if len(validation_errors) > 3: + print(f" ... and {len(validation_errors) - 3} more errors", file=sys.stderr) + print(f"Continuing with {len(rows)} valid entries...", file=sys.stderr) + + # Write CSV + fieldnames = [ + 'timestamp', 'datetime', 'timestamp_desc', 'message', + 'url', 'title', 'visit_type', 'data_type' + ] + write_timesketch_csv(output_csv, fieldnames, rows) + + return len(rows) -def extract_webkit_history(db_path, output_csv, browser_name=None): +def extract_webkit_history(db_path: str, output_csv: str, browser_name: Optional[str] = None) -> int: """ Extract browser history from WebKit-based browsers (Safari) and convert to Timesketch CSV. @@ -319,17 +561,23 @@ def extract_webkit_history(db_path, output_csv, browser_name=None): db_path: Path to Safari History.db database output_csv: Path to output CSV file browser_name: Optional custom name for data_type field (default: "Safari") + + Returns: + Number of entries processed + + Raises: + DatabaseValidationError: If database validation fails + sqlite3.Error: If database query fails """ - if browser_name is None: browser_name = "Safari" - # Check if database exists - if not Path(db_path).exists(): - raise FileNotFoundError(f"WebKit database not found: {db_path}") + # Connect to database + conn = connect_database_readonly(db_path) + + # Validate schema + validate_browser_schema(conn, ['history_visits', 'history_items'], browser_name) - # Connect to Safari SQLite database - conn = sqlite3.connect(db_path) cursor = conn.cursor() # Query to extract history visits with URL information @@ -344,52 +592,102 @@ def extract_webkit_history(db_path, output_csv, browser_name=None): ORDER BY history_visits.visit_time """ - cursor.execute(query) - results = cursor.fetchall() + try: + cursor.execute(query) + results = cursor.fetchall() + except sqlite3.Error as e: + conn.close() + raise sqlite3.Error(f"Error querying {browser_name} history: {e}") - # Write to Timesketch CSV format - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - fieldnames = [ - 'timestamp', - 'datetime', - 'timestamp_desc', - 'message', - 'url', - 'title', - 'data_type' - ] + rows = [] + validation_errors = [] + + for idx, row in enumerate(results): + webkit_timestamp = row[0] + url = row[1] or "" + title = row[2] or row[3] or "(No title)" - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - - for row in results: - webkit_timestamp = row[0] - url = row[1] or "" - title = row[2] or row[3] or "(No title)" # Use visit_title as fallback - - # Convert timestamp + # Convert timestamp + try: unix_microseconds, iso_datetime = convert_webkit_timestamp(webkit_timestamp) - - # Construct message - message = f"Visited: {title}" - - writer.writerow({ - 'timestamp': unix_microseconds, - 'datetime': iso_datetime, - 'timestamp_desc': 'Visit Time', - 'message': message, - 'url': url, - 'title': title, - 'data_type': f'{browser_name.lower()}:history:visit' - }) + except TimestampValidationError as e: + validation_errors.append(f"Entry {idx + 1}: {e}") + if len(validation_errors) <= 3: + continue + else: + break + + # Construct message + message = f"Visited: {title}" + + rows.append({ + 'timestamp': unix_microseconds, + 'datetime': iso_datetime, + 'timestamp_desc': 'Visit Time', + 'message': message, + 'url': url, + 'title': title, + 'data_type': f'{browser_name.lower()}:history:visit' + }) conn.close() - print(f"Successfully converted {len(results)} history entries from {browser_name}") - print(f"Output saved to: {output_csv}") + # Report validation errors if any + if validation_errors: + print(f"Warning: Found {len(validation_errors)} timestamp validation errors:", file=sys.stderr) + for error in validation_errors[:3]: + print(f" {error}", file=sys.stderr) + if len(validation_errors) > 3: + print(f" ... and {len(validation_errors) - 3} more errors", file=sys.stderr) + print(f"Continuing with {len(rows)} valid entries...", file=sys.stderr) + + # Write CSV + fieldnames = [ + 'timestamp', 'datetime', 'timestamp_desc', 'message', + 'url', 'title', 'data_type' + ] + write_timesketch_csv(output_csv, fieldnames, rows) + + return len(rows) -def main(): +def generate_default_output_filename(browser_type: str, input_path: str) -> str: + """ + Generate a sensible default output filename based on browser type and input. + + Args: + browser_type: Browser type (gecko, chromium, webkit) + input_path: Input database path + + Returns: + Generated output filename + """ + # Extract browser name from path if possible + path_lower = input_path.lower() + + browser_names = { + 'firefox': 'firefox', + 'chrome': 'chrome', + 'edge': 'edge', + 'brave': 'brave', + 'opera': 'opera', + 'vivaldi': 'vivaldi', + 'safari': 'safari', + } + + detected_name = None + for name_key, name_value in browser_names.items(): + if name_key in path_lower: + detected_name = name_value + break + + if detected_name: + return f"{detected_name}_history_timesketch.csv" + else: + return f"{browser_type}_history_timesketch.csv" + + +def main() -> int: parser = argparse.ArgumentParser( description='Convert browser history to Timesketch CSV format', formatter_class=argparse.RawDescriptionHelpFormatter, @@ -398,6 +696,7 @@ Browser Engine Types: gecko, firefox - Gecko-based browsers (Firefox, Waterfox, LibreWolf, etc.) chromium - Chromium-based browsers (Chrome, Edge, Brave, Opera, Vivaldi, etc.) webkit, safari - WebKit-based browsers (Safari) + auto - Auto-detect browser type from database schema All Chromium-based browsers (Chrome, Edge, Brave, Opera, Vivaldi) use identical database schemas and can be processed with the "chromium" option. Use --browser-name to customize @@ -420,17 +719,20 @@ HOW TO FIND YOUR PROFILE PATH: Always at: ~/Library/Safari/History.db Example usage: - # Firefox - python browser_to_timesketch.py -b firefox -i ~/.mozilla/firefox/xyz.default/places.sqlite -o output.csv + # Auto-detect browser type + python browser2timesketch.py -i ~/.mozilla/firefox/xyz.default/places.sqlite + + # Firefox with custom output + python browser2timesketch.py -b firefox -i ~/.mozilla/firefox/xyz.default/places.sqlite -o firefox.csv # Any Chromium browser (Chrome, Edge, Brave, etc.) - python browser_to_timesketch.py -b chromium -i ~/.config/google-chrome/Default/History -o output.csv + python browser2timesketch.py -b chromium -i ~/.config/google-chrome/Default/History -o output.csv # Chromium browser with custom label - python browser_to_timesketch.py -b chromium --browser-name "Brave" -i ~/.config/BraveSoftware/Brave-Browser/Default/History -o output.csv + python browser2timesketch.py -b chromium --browser-name "Brave" -i ~/.config/BraveSoftware/Brave-Browser/Default/History # Safari (macOS) - python browser_to_timesketch.py -b safari -i ~/Library/Safari/History.db -o output.csv + python browser2timesketch.py -b safari -i ~/Library/Safari/History.db Database Locations: Gecko/Firefox: @@ -453,16 +755,16 @@ Database Locations: WebKit/Safari: macOS: ~/Library/Safari/History.db -Note: Close the browser before running this script to avoid database lock issues. -You may want to copy the database file to a temporary location first. +Note: The script uses read-only mode to avoid database lock issues, but closing +the browser is still recommended for best results. """ ) parser.add_argument( '-b', '--browser', - required=True, - choices=['gecko', 'firefox', 'chromium', 'webkit', 'safari'], - help='Browser engine type (firefox and gecko are aliases, safari and webkit are aliases)' + choices=['gecko', 'firefox', 'chromium', 'webkit', 'safari', 'auto'], + default='auto', + help='Browser engine type (default: auto-detect)' ) parser.add_argument( @@ -473,37 +775,93 @@ You may want to copy the database file to a temporary location first. parser.add_argument( '-o', '--output', - default='browser_history_timesketch.csv', - help='Output CSV file path (default: browser_history_timesketch.csv)' + help='Output CSV file path (default: auto-generated based on browser type)' ) parser.add_argument( '--browser-name', - default=None, help='Custom browser name for the data_type field (e.g., "Chrome", "Brave", "Edge")' ) args = parser.parse_args() try: - # Normalize browser type + # Validate database file + print(f"Validating database: {args.input}") + validate_sqlite_database(args.input) + print("✓ Database is valid SQLite file") + + # Detect or validate browser type browser_type = args.browser.lower() - if browser_type in ['gecko', 'firefox']: - extract_gecko_history(args.input, args.output, args.browser_name) - elif browser_type == 'chromium': - extract_chromium_history(args.input, args.output, args.browser_name) - elif browser_type in ['webkit', 'safari']: - extract_webkit_history(args.input, args.output, args.browser_name) + if browser_type == 'auto': + print("Auto-detecting browser type...") + browser_type = detect_browser_type(args.input) + print(f"✓ Detected browser type: {browser_type}") + else: + # Normalize aliases + if browser_type == 'firefox': + browser_type = 'gecko' + elif browser_type == 'safari': + browser_type = 'webkit' + # Validate that the database matches the specified type + detected_type = detect_browser_type(args.input) + if detected_type != browser_type: + print(f"Warning: You specified '{args.browser}' but database appears to be '{detected_type}'", + file=sys.stderr) + response = input("Continue anyway? [y/N]: ") + if response.lower() != 'y': + return 1 + + # Generate output filename if not provided + if args.output: + output_csv = args.output + else: + output_csv = generate_default_output_filename(browser_type, args.input) + print(f"Using output filename: {output_csv}") + + # Extract history based on browser type + print(f"Extracting history from {browser_type} database...") + + if browser_type == 'gecko': + num_entries = extract_gecko_history(args.input, output_csv, args.browser_name) + elif browser_type == 'chromium': + num_entries = extract_chromium_history(args.input, output_csv, args.browser_name) + elif browser_type == 'webkit': + num_entries = extract_webkit_history(args.input, output_csv, args.browser_name) + else: + raise ValueError(f"Unknown browser type: {browser_type}") + + print(f"\n✓ Successfully converted {num_entries} history entries") + print(f"✓ Output saved to: {output_csv}") + return 0 + + except DatabaseValidationError as e: + print(f"Database Validation Error: {e}", file=sys.stderr) + return 1 + except BrowserDetectionError as e: + print(f"Browser Detection Error: {e}", file=sys.stderr) + return 1 + except TimestampValidationError as e: + print(f"Timestamp Validation Error: {e}", file=sys.stderr) + print("This indicates a systematic problem with timestamp conversion.", file=sys.stderr) + return 1 + except sqlite3.Error as e: + print(f"Database Error: {e}", file=sys.stderr) + return 1 + except FileNotFoundError as e: + print(f"File Error: {e}", file=sys.stderr) + return 1 + except KeyboardInterrupt: + print("\nOperation cancelled by user", file=sys.stderr) + return 130 except Exception as e: - print(f"Error: {e}") + print(f"Unexpected Error: {e}", file=sys.stderr) import traceback traceback.print_exc() return 1 - - return 0 if __name__ == "__main__": - exit(main()) \ No newline at end of file + sys.exit(main()) \ No newline at end of file