# Requires python3

import re
import sqlite3
import subprocess
import shutil
import os
import codecs
import datetime
import sys
from typing import Dict, List
import psycopg2
import psycopg2.extras
import socket
import csv


class TskDbDiff(object):
    """Compares two TSK/Autopsy SQLite databases.

    Attributes:
        gold_artifacts:
        autopsy_artifacts:
        gold_attributes:
        autopsy_attributes:
        gold_objects:
        autopsy_objects:
        artifact_comparison:
        attribute_comparison:
        report_errors: a listof_listof_String, the error messages that will be
            printed to screen in the run_diff method
        passed: a boolean, did the diff pass?
        autopsy_db_file:
        gold_db_file:
    """

    def __init__(self, output_db, gold_db, output_dir=None, gold_bb_dump=None,
                 gold_dump=None, verbose=False, isMultiUser=False, pgSettings=None):
        """Constructor for TskDbDiff.

        Args:
            output_db_path: path to the output database (non-gold standard)
            gold_db_path: path to the gold database
            output_dir: (optional) path to the folder where generated files will be put
            gold_bb_dump: (optional) path to the file where the gold blackboard dump is located
            gold_dump: (optional) path to the file where the gold non-blackboard dump is located
            verbose: (optional) a boolean; if true, diff results are sent to stdout
        """
        self.output_db_file = output_db
        self.gold_db_file = gold_db
        self.output_dir = output_dir
        self.gold_bb_dump = gold_bb_dump
        self.gold_dump = gold_dump
        self._generate_gold_dump = False
        self._generate_gold_bb_dump = False
        self._bb_dump_diff = ""
        self._dump_diff = ""
        self._bb_dump = ""
        self._dump = ""
        self.verbose = verbose
        self.isMultiUser = isMultiUser
        self.pgSettings = pgSettings

        if self.isMultiUser and not self.pgSettings:
            print("Missing PostgreSQL database connection settings data.")
            sys.exit(1)

        if self.gold_bb_dump is None:
            self._generate_gold_bb_dump = True
        if self.gold_dump is None:
            self._generate_gold_dump = True

    def run_diff(self):
        """Compare the databases.

        Raises:
            TskDbDiffException: if an error occurs while diffing or dumping the database
        """
        self._init_diff()
        id_obj_path_table = -1

        # generate the gold database dumps if necessary
        if self._generate_gold_dump:
            id_obj_path_table = TskDbDiff._dump_output_db_nonbb(self.gold_db_file, self.gold_dump, self.isMultiUser, self.pgSettings)
        if self._generate_gold_bb_dump:
            TskDbDiff._dump_output_db_bb(self.gold_db_file, self.gold_bb_dump, self.isMultiUser, self.pgSettings, id_obj_path_table)

        # generate the output database dumps (both DB and BB)
        id_obj_path_table = TskDbDiff._dump_output_db_nonbb(self.output_db_file, self._dump, self.isMultiUser, self.pgSettings)
        TskDbDiff._dump_output_db_bb(self.output_db_file, self._bb_dump, self.isMultiUser, self.pgSettings, id_obj_path_table)

        # Compare non-BB
        dump_diff_pass = self._diff(self._dump, self.gold_dump, self._dump_diff)

        # Compare BB
        bb_dump_diff_pass = self._diff(self._bb_dump, self.gold_bb_dump, self._bb_dump_diff)

        self._cleanup_diff()
        return dump_diff_pass, bb_dump_diff_pass

    def _init_diff(self):
        """Set up the necessary files based on the arguments given at construction"""
        if self.output_dir is None:
            # No stored files
            self._bb_dump = TskDbDiff._get_tmp_file("BlackboardDump", ".txt")
            self._bb_dump_diff = TskDbDiff._get_tmp_file("BlackboardDump-Diff", ".txt")
            self._dump = TskDbDiff._get_tmp_file("DBDump", ".txt")
            self._dump_diff = TskDbDiff._get_tmp_file("DBDump-Diff", ".txt")
        else:
            self._bb_dump = os.path.join(self.output_dir, "BlackboardDump.txt")
            self._bb_dump_diff = os.path.join(self.output_dir, "BlackboardDump-Diff.txt")
            self._dump = os.path.join(self.output_dir, "DBDump.txt")
            self._dump_diff = os.path.join(self.output_dir, "DBDump-Diff.txt")

        # Sorting gold before comparing (sort behaves differently in different environments)
        new_bb = TskDbDiff._get_tmp_file("GoldBlackboardDump", ".txt")
        new_db = TskDbDiff._get_tmp_file("GoldDBDump", ".txt")
        if self.gold_bb_dump is not None:
            srtcmdlst = ["sort", self.gold_bb_dump, "-o", new_bb]
            subprocess.call(srtcmdlst)
            srtcmdlst = ["sort", self.gold_dump, "-o", new_db]
            subprocess.call(srtcmdlst)
        self.gold_bb_dump = new_bb
        self.gold_dump = new_db

    def _cleanup_diff(self):
        if self.output_dir is None:
            # cleanup temp files
            os.remove(self._dump)
            os.remove(self._bb_dump)
            if os.path.isfile(self._dump_diff):
                os.remove(self._dump_diff)
            if os.path.isfile(self._bb_dump_diff):
                os.remove(self._bb_dump_diff)

        if self.gold_bb_dump is None:
            os.remove(self.gold_bb_dump)
            os.remove(self.gold_dump)

    def _diff(self, output_file, gold_file, diff_path):
        """Compare two text files.

        Args:
            output_file: a pathto_File, the latest text file
            gold_file: a pathto_File, the gold text file
            diff_path: the file to write the differences to
        Returns False if different
        """
        if (not os.path.isfile(output_file)):
            return False
        if (not os.path.isfile(gold_file)):
            return False

        # It is faster to read the contents in and directly compare
        output_data = codecs.open(output_file, "r", "utf_8").read()
        gold_data = codecs.open(gold_file, "r", "utf_8").read()
        if (gold_data == output_data):
            return True

        # If they are different, invoke 'diff'
        diff_file = codecs.open(diff_path, "wb", "utf_8")
        # Gold needs to be passed in as 1st arg and output as 2nd
        dffcmdlst = ["diff", gold_file, output_file]
        subprocess.call(dffcmdlst, stdout=diff_file)

        # create file path for gold files inside output folder. In case of diff, both gold and current run files
        # are available in the report output folder. Prefix Gold- is added to the filename.
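        # For example (illustrative names only): when the output dump is "<output_dir>/DBDump.txt",
        # the gold file is copied next to it as "<output_dir>/Gold-DBDump.txt" by the lines below.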
        gold_file_in_output_dir = output_file[:output_file.rfind("/")] + "/Gold-" + output_file[output_file.rfind("/") + 1:]
        shutil.copy(gold_file, gold_file_in_output_dir)

        return False

    def _dump_output_db_bb(db_file, bb_dump_file, isMultiUser, pgSettings, id_obj_path_table):
        """Dumps sorted text results to the given output location.

        Smart method that deals with a blackboard comparison to avoid issues
        with different IDs based on when artifacts were created.

        Args:
            db_file: a pathto_File, the output database.
            bb_dump_file: a pathto_File, the sorted dump file to write to
        """
        unsorted_dump = TskDbDiff._get_tmp_file("dump_data", ".txt")
        if isMultiUser:
            conn, unused_db = db_connect(db_file, isMultiUser, pgSettings)
            artifact_cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        else:  # Use Sqlite
            conn = sqlite3.connect(db_file)
            conn.text_factory = lambda x: x.decode("utf-8", "ignore")
            conn.row_factory = sqlite3.Row
            artifact_cursor = conn.cursor()

        # Get the list of all artifacts (along with type and associated file)
        # @@@ Could add a SORT by parent_path in here since that is how we are going to later sort it.
        artifact_cursor.execute("SELECT tsk_files.parent_path, tsk_files.name, blackboard_artifact_types.display_name, blackboard_artifacts.artifact_id FROM blackboard_artifact_types INNER JOIN blackboard_artifacts ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id INNER JOIN tsk_files ON tsk_files.obj_id = blackboard_artifacts.obj_id")
        database_log = codecs.open(unsorted_dump, "wb", "utf_8")
        row = artifact_cursor.fetchone()
        appnd = False
        counter = 0
        artifact_count = 0
        artifact_fail = 0

        # Cycle through artifacts
        try:
            while (row != None):

                # File Name and artifact type
                # Remove parent object ID from Unalloc file name
                normalizedName = re.sub('^Unalloc_[0-9]+_', 'Unalloc_', row["name"])
                if (row["parent_path"] != None):
                    database_log.write(row["parent_path"] + normalizedName + ' ')
                else:
                    database_log.write(normalizedName + ' ')

                if isMultiUser:
                    attribute_cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
                else:
                    attribute_cursor = conn.cursor()
                looptry = True
                artifact_count += 1
                try:
                    art_id = ""
                    art_id = str(row["artifact_id"])

                    # Get attributes for this artifact
                    if isMultiUser:
                        attribute_cursor.execute("SELECT blackboard_attributes.source, blackboard_attributes.attribute_type_id, blackboard_attribute_types.display_name, blackboard_attributes.value_type, blackboard_attributes.value_text, blackboard_attributes.value_int32, blackboard_attributes.value_int64, blackboard_attributes.value_double FROM blackboard_attributes INNER JOIN blackboard_attribute_types ON blackboard_attributes.attribute_type_id = blackboard_attribute_types.attribute_type_id WHERE artifact_id = %s ORDER BY blackboard_attributes.source, blackboard_attribute_types.display_name, blackboard_attributes.value_type, blackboard_attributes.value_text, blackboard_attributes.value_int32, blackboard_attributes.value_int64, blackboard_attributes.value_double", [art_id])
                    else:
                        attribute_cursor.execute("SELECT blackboard_attributes.source, blackboard_attributes.attribute_type_id, blackboard_attribute_types.display_name, blackboard_attributes.value_type, blackboard_attributes.value_text, blackboard_attributes.value_int32, blackboard_attributes.value_int64, blackboard_attributes.value_double FROM blackboard_attributes INNER JOIN blackboard_attribute_types ON blackboard_attributes.attribute_type_id = blackboard_attribute_types.attribute_type_id WHERE artifact_id = ? ORDER BY blackboard_attributes.source, blackboard_attribute_types.display_name, blackboard_attributes.value_type, blackboard_attributes.value_text, blackboard_attributes.value_int32, blackboard_attributes.value_int64, blackboard_attributes.value_double", [art_id])

                    attributes = attribute_cursor.fetchall()

                    # Print attributes
                    if (len(attributes) == 0):
                        # @@@@ This should be
                        database_log.write(' \n')
                        row = artifact_cursor.fetchone()
                        continue

                    src = attributes[0][0]
                    for attr in attributes:
                        numvals = 0
                        for x in range(3, 6):
                            if (attr[x] != None):
                                numvals += 1
                        if (numvals > 1):
                            msg = "There were too many values for attribute type: " + attr["display_name"] + " for artifact with id #" + str(row["artifact_id"]) + ".\n"
                        if (not attr["source"] == src):
                            msg = "There were inconsistent sources for artifact with id #" + str(row["artifact_id"]) + ".\n"

                        try:
                            if attr["value_type"] == 0:
                                attr_value_as_string = str(attr["value_text"])
                            elif attr["value_type"] == 1:
                                attr_value_as_string = str(attr["value_int32"])
                            elif attr["value_type"] == 2:
                                attr_value_as_string = str(attr["value_int64"])
                                if attr["attribute_type_id"] == 36 and id_obj_path_table != -1 and int(attr_value_as_string) > 0:
                                    # normalize positive TSK_PATH_IDs from being an object id to a path if the obj_id_path_table was generated
                                    attr_value_as_string = id_obj_path_table[int(attr_value_as_string)]
                            elif attr["value_type"] == 3:
                                attr_value_as_string = "%20.10f" % float((attr["value_double"]))  # use exact format from db schema to avoid python auto formatting the double value to (0E-10) scientific style
                            elif attr["value_type"] == 4:
                                attr_value_as_string = "bytes"
                            elif attr["value_type"] == 5:
                                attr_value_as_string = str(attr["value_int64"])
                            if attr["display_name"] == "Associated Artifact":
                                attr_value_as_string = getAssociatedArtifactType(attribute_cursor, attr_value_as_string, isMultiUser)
                            patrn = re.compile("[\n\0\a\b\r\f]")
                            attr_value_as_string = re.sub(patrn, ' ', attr_value_as_string)
                            if attr["source"] == "Keyword Search" and attr["display_name"] == "Keyword Preview":
                                attr_value_as_string = ""
                            database_log.write('')
                        except IOError as e:
                            print("IO error")
                            raise TskDbDiffException("Unexpected IO error while writing to database log." + str(e))

                except sqlite3.Error as e:
                    msg = "Attributes in artifact id (in output DB)# " + str(row["artifact_id"]) + " encountered an error: " + str(e) + " .\n"
                    print("Attributes in artifact id (in output DB)# ", str(row["artifact_id"]), " encountered an error: ", str(e))
                    print()
                    looptry = False
                    artifact_fail += 1
                    database_log.write('Error Extracting Attributes')
                    database_log.close()
                    raise TskDbDiffException(msg)
                finally:
                    attribute_cursor.close()

                # @@@@ This should be
                database_log.write(' \n')
                row = artifact_cursor.fetchone()

            if (artifact_fail > 0):
                msg = "There were " + str(artifact_count) + " artifacts and " + str(artifact_fail) + " threw an exception while loading.\n"
        except Exception as e:
            raise TskDbDiffException("Unexpected error while dumping blackboard database: " + str(e))
        finally:
            database_log.close()
            artifact_cursor.close()
            conn.close()

        # Now sort the file
        srtcmdlst = ["sort", unsorted_dump, "-o", bb_dump_file]
        subprocess.call(srtcmdlst)

    # for key, val in get_pg_table_columns(psycopg2.connect(dbname="jythontest1_20200414_124128", user="postgres", password="password12345")).items():
    # for key, val in get_sqlite_table_columns(sqlite3.connect(r"C:\Users\gregd\Documents\cases\7500-take4\autopsy.db")).items():
    #     print(f"{key}: {val}")

    def _dump_output_db_nonbb(db_file, dump_file, isMultiUser, pgSettings):
        """Dumps a database to a text file.

        Does not dump the artifacts and attributes.

        Args:
            db_file: a pathto_File, the database file to dump
            dump_file: a pathto_File, the location to dump the non-blackboard database items
        """
        conn, backup_db_file = db_connect(db_file, isMultiUser, pgSettings)
        id_files_table = build_id_files_table(conn.cursor(), isMultiUser)
        id_vs_parts_table = build_id_vs_parts_table(conn.cursor(), isMultiUser)
        id_vs_info_table = build_id_vs_info_table(conn.cursor(), isMultiUser)
        id_fs_info_table = build_id_fs_info_table(conn.cursor(), isMultiUser)
        id_objects_table = build_id_objects_table(conn.cursor(), isMultiUser)
        id_artifact_types_table = build_id_artifact_types_table(conn.cursor(), isMultiUser)
        id_legacy_artifact_types = build_id_legacy_artifact_types_table(conn.cursor(), isMultiUser)
        id_reports_table = build_id_reports_table(conn.cursor(), isMultiUser)
        id_images_table = build_id_image_names_table(conn.cursor(), isMultiUser)
        id_accounts_table = build_id_accounts_table(conn.cursor(), isMultiUser)
        id_obj_path_table = build_id_obj_path_table(id_files_table, id_objects_table, id_artifact_types_table, id_reports_table, id_images_table, id_accounts_table)

        if isMultiUser:  # Use PostgreSQL
            os.environ['PGPASSWORD'] = pgSettings.password
            pgDump = ["pg_dump", "--inserts", "-U", pgSettings.username, "-h", pgSettings.pgHost, "-p", pgSettings.pgPort,
                      "-d", db_file, "-E", "utf-8", "-T", "blackboard_artifacts", "-T", "blackboard_attributes", "-f", "postgreSQLDump.sql"]
            subprocess.call(pgDump)
            postgreSQL_db = codecs.open("postgreSQLDump.sql", "r", "utf-8")
            # Write to the database dump
            with codecs.open(dump_file, "wb", "utf_8") as db_log:
                dump_line = ''
                for line in postgreSQL_db:
                    line = line.strip('\r\n ')
                    # Deal with the pg_dump result file
                    if (line.startswith('--') or line.lower().startswith('alter') or "pg_catalog" in line or "idle_in_transaction_session_timeout" in line or not line):
                        # It's a comment, alter statement, catalog entry, set idle entry, or empty line
                        continue
                    elif not line.endswith(';'):
                        # Statement not finished
                        dump_line += line
                        continue
                    else:
                        dump_line += line
                    if 'INSERT INTO image_gallery_groups_seen' in dump_line:
                        dump_line = ''
                        continue
                    dump_line = normalize_db_entry(dump_line, id_obj_path_table, id_vs_parts_table, id_vs_info_table, id_fs_info_table, id_objects_table, id_reports_table, id_images_table, id_legacy_artifact_types, id_accounts_table)
                    db_log.write('%s\n' % dump_line)
                    dump_line = ''
            postgreSQL_db.close()
        else:  # use Sqlite
            # Delete the blackboard tables
            conn.text_factory = lambda x: x.decode("utf-8", "ignore")
            conn.execute("DROP TABLE blackboard_artifacts")
            conn.execute("DROP TABLE blackboard_attributes")

            # Write to the database dump
            with codecs.open(dump_file, "wb", "utf_8") as db_log:
                for line in conn.iterdump():
                    if 'INSERT INTO "image_gallery_groups_seen"' in line:
                        continue
                    line = normalize_db_entry(line, id_obj_path_table, id_vs_parts_table, id_vs_info_table, id_fs_info_table, id_objects_table, id_reports_table, id_images_table, id_legacy_artifact_types, id_accounts_table)
                    db_log.write('%s\n' % line)

        # Now sort the file
        srtcmdlst = ["sort", dump_file, "-o", dump_file]
        subprocess.call(srtcmdlst)

        conn.close()
        # cleanup the backup
        if backup_db_file:
            os.remove(backup_db_file)
        return id_obj_path_table

    def dump_output_db(db_file, dump_file, bb_dump_file, isMultiUser, pgSettings):
        """Dumps the given database to text files for later comparison.

        Args:
            db_file: a pathto_File, the database file to dump
            dump_file: a pathto_File, the location to dump the non-blackboard database items
            bb_dump_file: a pathto_File, the location to dump the blackboard database items
        """
        id_obj_path_table = TskDbDiff._dump_output_db_nonbb(db_file, dump_file, isMultiUser, pgSettings)
        TskDbDiff._dump_output_db_bb(db_file, bb_dump_file, isMultiUser, pgSettings, id_obj_path_table)

    def _get_tmp_file(base, ext):
        time = datetime.datetime.now().time().strftime("%H%M%f")
        return os.path.join(os.environ['TMP'], base + time + ext)


class TskDbDiffException(Exception):
    pass


class PGSettings(object):
    def __init__(self, pgHost=None, pgPort=5432, user=None, password=None):
        self.pgHost = pgHost
        self.pgPort = pgPort
        self.username = user
        self.password = password

    def get_pgHost(self):
        return self.pgHost

    def get_pgPort(self):
        return self.pgPort

    def get_username(self):
        return self.username

    def get_password(self):
        return self.password


def get_sqlite_table_columns(conn) -> Dict[str, List[str]]:
    """
    Retrieves the sqlite public tables and columns from a sqlite connection.
    Args:
        conn: The sqlite connection.

    Returns: The mapping of table names to a list of the column names in that table, in ordinal order.
    """
    cur = conn.cursor()
    cur.execute("SELECT name FROM sqlite_master tables WHERE tables.type='table'")
    tables = list([table[0] for table in cur.fetchall()])
    cur.close()

    to_ret = {}
    for table in tables:
        cur = conn.cursor()
        cur.execute('SELECT name FROM pragma_table_info(?) ORDER BY cid', [table])
        to_ret[table] = list([col[0] for col in cur.fetchall()])
        cur.close()

    return to_ret


def get_pg_table_columns(conn) -> Dict[str, List[str]]:
    """
    Retrieves the postgres public tables and columns from a pg connection.
    Args:
        conn: The pg connection.

    Returns: The mapping of table names to a list of the column names in that table, in ordinal order.
    """
    cursor = conn.cursor()
    cursor.execute("""
    SELECT cols.table_name, cols.column_name
    FROM information_schema.columns cols
    WHERE cols.column_name IS NOT NULL
    AND cols.table_name IS NOT NULL
    AND cols.table_name IN (
        SELECT tables.tablename FROM pg_catalog.pg_tables tables
        WHERE LOWER(schemaname) = 'public'
    )
    ORDER by cols.table_name, cols.ordinal_position;
    """)
    mapping = {}
    for row in cursor:
        mapping.setdefault(row[0], []).append(row[1])
    cursor.close()
    return mapping


def normalize_db_entry(line, files_table, vs_parts_table, vs_info_table, fs_info_table, objects_table, reports_table, images_table, artifact_table, accounts_table):
    """Make testing more consistent and reasonable by doctoring certain db entries.

    Args:
        line: a String, the line to remove the object id from.
        files_table: a map from object ids to file paths.
    """
    # SQLite statements wrap the table name in double quotes; PostgreSQL doesn't.
    # We check for both forms when normalizing the results.
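    # Illustrative (assumed) shapes of the two dump flavors matched below:
    #   SQLite iterdump:   INSERT INTO "tsk_files" VALUES(3,1,'...');
    #   pg_dump --inserts: INSERT INTO tsk_files VALUES (3, 1, '...');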
    files_index = line.find('INSERT INTO "tsk_files"') > -1 or line.find('INSERT INTO tsk_files ') > -1
    path_index = line.find('INSERT INTO "tsk_files_path"') > -1 or line.find('INSERT INTO tsk_files_path ') > -1
    object_index = line.find('INSERT INTO "tsk_objects"') > -1 or line.find('INSERT INTO tsk_objects ') > -1
    vs_parts_index = line.find('INSERT INTO "tsk_vs_parts"') > -1 or line.find('INSERT INTO tsk_vs_parts ') > -1
    report_index = line.find('INSERT INTO "reports"') > -1 or line.find('INSERT INTO reports ') > -1
    layout_index = line.find('INSERT INTO "tsk_file_layout"') > -1 or line.find('INSERT INTO tsk_file_layout ') > -1
    data_source_info_index = line.find('INSERT INTO "data_source_info"') > -1 or line.find('INSERT INTO data_source_info ') > -1
    event_description_index = line.find('INSERT INTO "tsk_event_descriptions"') > -1 or line.find('INSERT INTO tsk_event_descriptions ') > -1
    events_index = line.find('INSERT INTO "tsk_events"') > -1 or line.find('INSERT INTO tsk_events ') > -1
    ingest_job_index = line.find('INSERT INTO "ingest_jobs"') > -1 or line.find('INSERT INTO ingest_jobs ') > -1
    examiners_index = line.find('INSERT INTO "tsk_examiners"') > -1 or line.find('INSERT INTO tsk_examiners ') > -1
    ig_groups_index = line.find('INSERT INTO "image_gallery_groups"') > -1 or line.find('INSERT INTO image_gallery_groups ') > -1
    ig_groups_seen_index = line.find('INSERT INTO "image_gallery_groups_seen"') > -1 or line.find('INSERT INTO image_gallery_groups_seen ') > -1
    os_account_index = line.find('INSERT INTO "tsk_os_accounts"') > -1 or line.find('INSERT INTO tsk_os_accounts') > -1
    os_account_attr_index = line.find('INSERT INTO "tsk_os_account_attributes"') > -1 or line.find('INSERT INTO tsk_os_account_attributes') > -1
    os_account_instances_index = line.find('INSERT INTO "tsk_os_account_instances"') > -1 or line.find('INSERT INTO tsk_os_account_instances') > -1
    data_artifacts_index = line.find('INSERT INTO "tsk_data_artifacts"') > -1 or line.find('INSERT INTO tsk_data_artifacts') > -1

    parens = line[line.find('(') + 1 : line.rfind(')')]
    no_space_parens = parens.replace(" ", "")
    fields_list = list(csv.reader([no_space_parens], quotechar="'"))[0]

    # Add back in the quotechar for values that were originally wrapped (csv reader consumes this character)
    fields_list_with_quotes = []
    ptr = 0
    for field in fields_list:
        if (len(field) == 0):
            field = "'" + field + "'"
        else:
            start = no_space_parens.find(field, ptr)
            if ((start - 1) >= 0 and no_space_parens[start - 1] == '\''):
                if ((start + len(field)) < len(no_space_parens) and no_space_parens[start + len(field)] == '\''):
                    field = "'" + field + "'"
        fields_list_with_quotes.append(field)
        if (ptr > 0):
            # Add one for each comma that is used to separate values in the original string
            ptr += 1
        ptr += len(field)

    fields_list = fields_list_with_quotes

    # remove object ID
    if files_index:

        # Ignore TIFF size and hash if extracted from PDFs.
        # See JIRA-6951 for more details.
        # index -3 = 3rd from the end, which is extension
        # index -5 = 5th from the end, which is the parent path.
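        # Hypothetical example: for a row whose extension field (index -3) is "'tif'" and whose
        # parent path (index -5) ends in ".pdf/'", the size (column 15) and MD5/SHA256 hash
        # columns (23/24) are masked below so they do not show up in the diff.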
        if fields_list[-3] == "'tif'" and fields_list[-5].endswith(".pdf/'"):
            fields_list[15] = "'SIZE_IGNORED'"
            fields_list[23] = "'MD5_IGNORED'"
            fields_list[24] = "'SHA256_IGNORED'"
        newLine = ('INSERT INTO "tsk_files" VALUES(' + ', '.join(fields_list[1:-1]) + ');')  # leave off the first field (object id) and the last field (os_account_id)
        # Remove object ID from Unalloc file name
        newLine = re.sub('Unalloc_[0-9]+_', 'Unalloc_', newLine)
        return newLine
    # remove object ID
    elif vs_parts_index:
        newLine = ('INSERT INTO "tsk_vs_parts" VALUES(' + ', '.join(fields_list[1:]) + ');')
        return newLine
    # remove group ID
    elif ig_groups_index:
        newLine = ('INSERT INTO "image_gallery_groups" VALUES(' + ', '.join(fields_list[1:]) + ');')
        return newLine
    # remove id field
    elif ig_groups_seen_index:
        # Only removing the id and group_id fields for now. May need to care about examiner_id and seen fields in future.
        newLine = ('INSERT INTO "image_gallery_groups_seen" VALUES(' + ', '.join(fields_list[2:]) + ');')
        return newLine
    # remove object ID
    elif path_index:
        obj_id = int(fields_list[0])
        objValue = files_table[obj_id]
        # remove the obj_id from the ModuleOutput/EmbeddedFileExtractor directory
        idx_pre = fields_list[1].find('EmbeddedFileExtractor') + len('EmbeddedFileExtractor')
        if idx_pre > -1:
            idx_pos = fields_list[1].find('\\', idx_pre + 2)
            dir_to_replace = fields_list[1][idx_pre + 1 : idx_pos]  # +1 to skip the file separator
            dir_to_replace = dir_to_replace[0:dir_to_replace.rfind('_')]
            pathValue = fields_list[1][:idx_pre + 1] + dir_to_replace + fields_list[1][idx_pos:]
        else:
            pathValue = fields_list[1]
        # remove localhost from postgres par_obj_name
        multiOutput_idx = pathValue.find('ModuleOutput')
        if multiOutput_idx > -1:
            pathValue = "'" + pathValue[pathValue.find('ModuleOutput'):]  # postgres par_obj_name includes localhost
        newLine = ('INSERT INTO "tsk_files_path" VALUES(' + objValue + ', ' + pathValue + ', ' + ', '.join(fields_list[2:]) + ');')
        return newLine
    # remove object ID
    elif layout_index:
        obj_id = fields_list[0]
        path = files_table[int(obj_id)]
        newLine = ('INSERT INTO "tsk_file_layout" VALUES(' + path + ', ' + ', '.join(fields_list[1:]) + ');')
        # Remove object ID from Unalloc file name
        newLine = re.sub('Unalloc_[0-9]+_', 'Unalloc_', newLine)
        return newLine
    # remove object ID
    elif object_index:
        obj_id = fields_list[0]
        parent_id = fields_list[1]
        newLine = 'INSERT INTO "tsk_objects" VALUES('
        path = None
        parent_path = None

        # if obj_id or parent_id is an invalid literal, we simply return the line as it is
        try:
            obj_id = int(obj_id)
            if parent_id != 'NULL':
                parent_id = int(parent_id)
        except Exception as e:
            print(obj_id, parent_id)
            return line

        if obj_id in files_table.keys():
            path = files_table[obj_id]
        elif obj_id in vs_parts_table.keys():
            path = vs_parts_table[obj_id]
        elif obj_id in vs_info_table.keys():
            path = vs_info_table[obj_id]
        elif obj_id in fs_info_table.keys():
            path = fs_info_table[obj_id]
        elif obj_id in reports_table.keys():
            path = reports_table[obj_id]

        # remove host name (for multi-user) and dates/times from path for reports
        if path is not None:
            if 'ModuleOutput' in path:
                # skip past the host name (if any)
                path = path[path.find('ModuleOutput'):]
                if 'BulkExtractor' in path or 'Smirk' in path:
                    # chop off the last folder (which contains a date/time)
                    path = path[:path.rfind('\\')]
            if 'Reports\\AutopsyTestCase HTML Report' in path:
                path = 'Reports\\AutopsyTestCase HTML Report'

        if parent_id in files_table.keys():
            parent_path = files_table[parent_id]
        elif parent_id in vs_parts_table.keys():
            parent_path = vs_parts_table[parent_id]
        elif parent_id in vs_info_table.keys():
            parent_path = vs_info_table[parent_id]
        elif parent_id in fs_info_table.keys():
            parent_path = fs_info_table[parent_id]
        elif parent_id in images_table.keys():
            parent_path = images_table[parent_id]
        elif parent_id in accounts_table.keys():
            parent_path = accounts_table[parent_id]
        elif parent_id == 'NULL':
            parent_path = "NULL"

        # Remove host name (for multi-user) from parent_path
        if parent_path is not None:
            if 'ModuleOutput' in parent_path:
                # skip past the host name (if any)
                parent_path = parent_path[parent_path.find('ModuleOutput'):]

        if path and parent_path:
            # Remove object ID from Unalloc file names and regripper output
            path = re.sub('Unalloc_[0-9]+_', 'Unalloc_', path)
            path = re.sub('regripper\-[0-9]+\-full', 'regripper-full', path)
            parent_path = re.sub('Unalloc_[0-9]+_', 'Unalloc_', parent_path)
            parent_path = re.sub('regripper\-[0-9]+\-full', 'regripper-full', parent_path)
            return newLine + path + ', ' + parent_path + ', ' + ', '.join(fields_list[2:]) + ');'
        else:
            # omit the parent object id and object id when we can't anonymize them
            return newLine + '"OBJECT IDS OMITTED", ' + ', '.join(fields_list[2:]) + ');'
    # remove time-based information, ie Test_6/11/14 -> Test
    elif report_index:
        fields_list[1] = "AutopsyTestCase"
        fields_list[2] = "0"
        newLine = ('INSERT INTO "reports" VALUES(' + ','.join(fields_list[1:]) + ');')  # remove report_id
        return newLine
    elif data_source_info_index:
        fields_list[1] = "{device id}"
        fields_list[4] = "{dateTime}"
        newLine = ('INSERT INTO "data_source_info" VALUES(' + ','.join(fields_list) + ');')
        return newLine
    elif ingest_job_index:
        fields_list[2] = "{host_name}"
        start_time = int(fields_list[3])
        end_time = int(fields_list[4])
        if (start_time <= end_time):
            fields_list[3] = "0"
            fields_list[4] = "0"
        newLine = ('INSERT INTO "ingest_jobs" VALUES(' + ','.join(fields_list) + ');')
        return newLine
    elif examiners_index:
        fields_list[1] = "{examiner_name}"
        newLine = ('INSERT INTO "tsk_examiners" VALUES(' + ','.join(fields_list) + ');')
        return newLine
    # remove all timing dependent columns from the events table
    elif events_index:
        newLine = ('INSERT INTO "tsk_events" VALUES(' + ','.join(fields_list[1:2]) + ');')
        return newLine
    # remove object ids from the event description table
    elif event_description_index:
        # replace object ids with information that is deterministic
        file_obj_id = int(fields_list[5])
        object_id = int(fields_list[4])
        legacy_artifact_id = 'NULL'
        if (fields_list[6] != 'NULL'):
            legacy_artifact_id = int(fields_list[6])
        if file_obj_id != 'NULL' and file_obj_id in files_table.keys():
            fields_list[5] = files_table[file_obj_id]
        if object_id != 'NULL' and object_id in files_table.keys():
            fields_list[4] = files_table[object_id]
        if legacy_artifact_id != 'NULL' and legacy_artifact_id in artifact_table.keys():
            fields_list[6] = artifact_table[legacy_artifact_id]
        if fields_list[1] == fields_list[2] and fields_list[1] == fields_list[3]:
            fields_list[1] = cleanupEventDescription(fields_list[1])
            fields_list[2] = cleanupEventDescription(fields_list[2])
            fields_list[3] = cleanupEventDescription(fields_list[3])
        newLine = ('INSERT INTO "tsk_event_descriptions" VALUES(' + ','.join(fields_list[1:]) + ');')  # remove report_id
        return newLine
    elif os_account_index:
        newLine = ('INSERT INTO "tsk_os_accounts" VALUES(' + ','.join(fields_list[1:]) + ');')  # remove id since the value that would be substituted is in the diff line already
        return newLine
    elif os_account_attr_index:
        # substitute the account object id for a non-changing value
        os_account_id = int(fields_list[1])
        fields_list[1] = accounts_table[os_account_id]
        # substitute the source object id for a non-changing value
        source_obj_id = int(fields_list[3])
        if source_obj_id in files_table.keys():
            fields_list[3] = files_table[source_obj_id]
        elif source_obj_id in vs_parts_table.keys():
            fields_list[3] = vs_parts_table[source_obj_id]
        elif source_obj_id in vs_info_table.keys():
            fields_list[3] = vs_info_table[source_obj_id]
        elif source_obj_id in fs_info_table.keys():
            fields_list[3] = fs_info_table[source_obj_id]
        elif source_obj_id in images_table.keys():
            fields_list[3] = images_table[source_obj_id]
        elif source_obj_id in accounts_table.keys():
            fields_list[3] = accounts_table[source_obj_id]
        elif source_obj_id == 'NULL':
            fields_list[3] = "NULL"
        newLine = ('INSERT INTO "tsk_os_account_attributes" VALUES(' + ','.join(fields_list[1:]) + ');')  # remove id
        return newLine
    elif os_account_instances_index:
        os_account_id = int(fields_list[1])
        fields_list[1] = accounts_table[os_account_id]
        newLine = ('INSERT INTO "tsk_os_account_instances" VALUES(' + ','.join(fields_list[1:]) + ');')  # remove id
        return newLine
    elif data_artifacts_index:
        art_obj_id = int(fields_list[0])
        if art_obj_id in files_table.keys():
            fields_list[0] = files_table[art_obj_id]
        else:
            fields_list[0] = 'Artifact Object ID Omitted'
        account_obj_id = int(fields_list[1])
        if account_obj_id in files_table.keys():
            fields_list[1] = files_table[account_obj_id]
        else:
            fields_list[1] = 'Account Object ID Omitted'
        newLine = ('INSERT INTO "tsk_data_artifacts" VALUES(' + ','.join(fields_list[:]) + ');')  # remove ids
        return newLine
    else:
        return line


def cleanupEventDescription(description):
    test = re.search("^'\D+:\d+'$", description)
    if test is not None:
        return re.sub(":\d+", ":", description)
    else:
        return description


def getAssociatedArtifactType(cur, artifact_id, isMultiUser):
    if isMultiUser:
        cur.execute("SELECT tsk_files.parent_path, blackboard_artifact_types.display_name FROM blackboard_artifact_types INNER JOIN blackboard_artifacts ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id INNER JOIN tsk_files ON tsk_files.obj_id = blackboard_artifacts.obj_id WHERE artifact_id=%s", [artifact_id])
    else:
        cur.execute("SELECT tsk_files.parent_path, blackboard_artifact_types.display_name FROM blackboard_artifact_types INNER JOIN blackboard_artifacts ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id INNER JOIN tsk_files ON tsk_files.obj_id = blackboard_artifacts.obj_id WHERE artifact_id=?", [artifact_id])

    info = cur.fetchone()
    return "File path: " + info[0] + " Artifact Type: " + info[1]


def build_id_files_table(db_cursor, isPostgreSQL):
    """Build the map of object ids to file paths.

    Args:
        db_cursor: the database cursor
    """
    # for each row in the db, take the object id, parent path, and name, then create a tuple in the dictionary
    # with the object id as the key and the full file path (parent + name) as the value
    mapping = dict([(row[0], str(row[1]) + str(row[2])) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, parent_path, name FROM tsk_files")])
    return mapping


def build_id_vs_parts_table(db_cursor, isPostgreSQL):
    """Build the map of object ids to vs_parts.

    Args:
        db_cursor: the database cursor
    """
    # for each row in the db, take the object id, addr, and start, then create a tuple in the dictionary
    # with the object id as the key and (addr + start) as the value
    mapping = dict([(row[0], str(row[1]) + '_' + str(row[2])) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, addr, start FROM tsk_vs_parts")])
    return mapping


def build_id_vs_info_table(db_cursor, isPostgreSQL):
    """Build the map of object ids to vs_info.

    Args:
        db_cursor: the database cursor
    """
    # for each row in the db, take the object id, vs_type, and img_offset, then create a tuple in the dictionary
    # with the object id as the key and (vs_type + img_offset) as the value
    mapping = dict([(row[0], str(row[1]) + '_' + str(row[2])) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, vs_type, img_offset FROM tsk_vs_info")])
    return mapping


def build_id_fs_info_table(db_cursor, isPostgreSQL):
    """Build the map of object ids to fs_info.

    Args:
        db_cursor: the database cursor
    """
    # for each row in the db, take the object id, img_offset, and fs_type, then create a tuple in the dictionary
    # with the object id as the key and (img_offset + fs_type) as the value
    mapping = dict([(row[0], str(row[1]) + '_' + str(row[2])) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, img_offset, fs_type FROM tsk_fs_info")])
    return mapping


def build_id_objects_table(db_cursor, isPostgreSQL):
    """Build the map of object ids to par_id.

    Args:
        db_cursor: the database cursor
    """
    # for each row in the db, take the object id and par_obj_id, then create a tuple in the dictionary
    # with the object id as the key and [par_obj_id, type] as the value
    mapping = dict([(row[0], [row[1], row[2]]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT * FROM tsk_objects")])
    return mapping


def build_id_image_names_table(db_cursor, isPostgreSQL):
    """Build the map of object ids to name.

    Args:
        db_cursor: the database cursor
    """
    # for each row in the db, take the object id and name, then create a tuple in the dictionary
    # with the object id as the key and the name as the value
    mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, name FROM tsk_image_names WHERE sequence=0")])  # data_sources which are logical file sets will be found in the files table
    return mapping


def build_id_artifact_types_table(db_cursor, isPostgreSQL):
    """Build the map of artifact object ids to artifact types.

    Args:
        db_cursor: the database cursor
    """
    # for each row in the db, take the artifact object id, then create a tuple in the dictionary
    # with the object id as the key and the artifact type as the value
    mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT blackboard_artifacts.artifact_obj_id, blackboard_artifact_types.type_name FROM blackboard_artifacts INNER JOIN blackboard_artifact_types ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id ")])
    return mapping


def build_id_legacy_artifact_types_table(db_cursor, isPostgreSQL):
    """Build the map of legacy artifact ids to artifact type.

    Args:
        db_cursor: the database cursor
    """
    # for each row in the db, take the legacy artifact id, then create a tuple in the dictionary
    # with the artifact id as the key and the artifact type as the value
    mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT blackboard_artifacts.artifact_id, blackboard_artifact_types.type_name FROM blackboard_artifacts INNER JOIN blackboard_artifact_types ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id ")])
    return mapping


def build_id_reports_table(db_cursor, isPostgreSQL):
    """Build the map of report object ids to report path.

    Args:
        db_cursor: the database cursor
    """
    # for each row in the reports table in the db, create an obj_id -> path map
    mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, path FROM reports")])
    return mapping


def build_id_accounts_table(db_cursor, isPostgreSQL):
    """Build the map of object ids to OS account SIDs.

    Args:
        db_cursor: the database cursor
    """
    # for each row in the db, take the object id and account SID, then create a tuple in the dictionary
    # with the object id as the key and the OS account's SID as the value
    mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT os_account_obj_id, addr FROM tsk_os_accounts")])
    return mapping


def build_id_obj_path_table(files_table, objects_table, artifacts_table, reports_table, images_table, accounts_table):
    """Build the map of object ids to paths.

    Args:
        files_table: obj_id, path
        objects_table: obj_id, par_obj_id, type
        artifacts_table: obj_id, artifact_type_name
        reports_table: obj_id, path
        images_table: obj_id, name
        accounts_table: obj_id, addr
    """
    # make a copy of files_table and update it with new data from artifacts_table and reports_table
    mapping = files_table.copy()
    for k, v in objects_table.items():
        path = ""
        if k not in mapping.keys():  # If the mapping table doesn't have data for obj_id
            if k in reports_table.keys():  # For a report we use the report path
                par_obj_id = v[0]
                if par_obj_id is not None:
                    mapping[k] = reports_table[k]
            elif k in artifacts_table.keys():  # For an artifact we use its par_obj_id's path+name plus its artifact_type name
                par_obj_id = v[0]  # The parent of an artifact can be a file or a report
                if par_obj_id in mapping.keys():
                    path = mapping[par_obj_id]
                elif par_obj_id in reports_table.keys():
                    path = reports_table[par_obj_id]
                elif par_obj_id in images_table.keys():
                    path = images_table[par_obj_id]
                mapping[k] = path + "/" + artifacts_table[k]
            elif k in accounts_table.keys():  # For an OS account object id we use its addr field, which is the account SID
                mapping[k] = accounts_table[k]
        elif v[0] not in mapping.keys():
            if v[0] in artifacts_table.keys():
                par_obj_id = objects_table[v[0]]
                path = mapping[par_obj_id]
                mapping[k] = path + "/" + artifacts_table[v[0]]
    return mapping


def db_connect(db_file, isMultiUser, pgSettings=None):
    if isMultiUser:  # use PostgreSQL
        try:
            return psycopg2.connect("dbname=" + db_file + " user=" + pgSettings.username + " host=" + pgSettings.pgHost + " password=" + pgSettings.password), None
        except:
            print("Failed to connect to the database: " + db_file)
    else:  # Sqlite
        # Make a copy that we can modify
        backup_db_file = TskDbDiff._get_tmp_file("tsk_backup_db", ".db")
        shutil.copy(db_file, backup_db_file)
        # We sometimes get situations with messed up permissions
        os.chmod(backup_db_file, 0o777)
        return sqlite3.connect(backup_db_file), backup_db_file


def sql_select_execute(cursor, isPostgreSQL, sql_stmt):
    if isPostgreSQL:
        cursor.execute(sql_stmt)
        return cursor.fetchall()
    else:
        return cursor.execute(sql_stmt)


def main():
    try:
        sys.argv.pop(0)
        output_db = sys.argv.pop(0)
        gold_db = sys.argv.pop(0)
    except:
        print("usage: tskdbdiff [OUTPUT DB PATH] [GOLD DB PATH]")
        sys.exit(1)

    db_diff = TskDbDiff(output_db, gold_db, output_dir=".")
    dump_passed, bb_dump_passed = db_diff.run_diff()

    if dump_passed and bb_dump_passed:
        print("Database comparison passed.")
    if not dump_passed:
        print("Non blackboard database comparison failed.")
    if not bb_dump_passed:
        print("Blackboard database comparison failed.")

    sys.exit(0)


if __name__ == "__main__":
    if sys.hexversion < 0x03000000:
        print("Python 3 required")
        sys.exit(1)
    main()
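
# Example usage sketch (commented out; the module name, paths, and settings below are
# hypothetical and assume this script is saved as tskdbdiff.py). Single-user (SQLite) case:
#
#     from tskdbdiff import TskDbDiff
#     diff = TskDbDiff("/path/to/output/autopsy.db", "/path/to/gold/autopsy.db", output_dir=".")
#     dump_passed, bb_dump_passed = diff.run_diff()
#
# Multi-user (PostgreSQL) case, passing connection settings through PGSettings:
#
#     from tskdbdiff import TskDbDiff, PGSettings
#     pg = PGSettings(pgHost="localhost", pgPort="5432", user="postgres", password="password")
#     diff = TskDbDiff("output_case_db_name", "gold_case_db_name", output_dir=".",
#                      isMultiUser=True, pgSettings=pg)
#     dump_passed, bb_dump_passed = diff.run_diff()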