# Requires python3
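#
# Compares an output Autopsy/TSK case database against a gold standard database by dumping both
# to sorted text files (one for blackboard artifacts/attributes, one for everything else),
# normalizing run-specific values (object ids, host names, dates/times), and diffing the results.
# Command-line usage: tskdbdiff [OUTPUT DB PATH] [GOLD DB PATH]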
import re
import sqlite3
import subprocess
import shutil
import os
import codecs
import datetime
import tempfile
import sys
from typing import Dict, List
import psycopg2
import psycopg2.extras
import socket
import csv
class TskDbDiff(object):
"""Compares two TSK/Autospy SQLite databases.
Attributes:
gold_artifacts:
autopsy_artifacts:
gold_attributes:
autopsy_attributes:
gold_objects:
autopsy_objects:
artifact_comparison:
        attribute_comparison:
report_errors: a listof_listof_String, the error messages that will be
printed to screen in the run_diff method
passed: a boolean, did the diff pass?
autopsy_db_file:
gold_db_file:
"""
def __init__(self, output_db, gold_db, output_dir=None, gold_bb_dump=None, gold_dump=None, verbose=False, isMultiUser=False, pgSettings=None):
"""Constructor for TskDbDiff.
Args:
            output_db: path to the output database (non-gold standard)
            gold_db: path to the gold database
            output_dir: (optional) path to the folder where generated files will be put
            gold_bb_dump: (optional) path to the file where the gold blackboard dump is located
            gold_dump: (optional) path to the file where the gold non-blackboard dump is located
            verbose: (optional) a boolean; if True, diff results are sent to stdout
            isMultiUser: (optional) a boolean; if True, the databases are PostgreSQL (multi-user) case databases
            pgSettings: (optional) a PGSettings object with the PostgreSQL connection settings; required when isMultiUser is True
        """
self.output_db_file = output_db
self.gold_db_file = gold_db
self.output_dir = output_dir
self.gold_bb_dump = gold_bb_dump
self.gold_dump = gold_dump
self._generate_gold_dump = False
self._generate_gold_bb_dump = False
self._bb_dump_diff = ""
self._dump_diff = ""
self._bb_dump = ""
self._dump = ""
self.verbose = verbose
self.isMultiUser = isMultiUser
self.pgSettings = pgSettings
if self.isMultiUser and not self.pgSettings:
print("Missing PostgreSQL database connection settings data.")
sys.exit(1)
if self.gold_bb_dump is None:
self._generate_gold_bb_dump = True
if self.gold_dump is None:
self._generate_gold_dump = True
def run_diff(self):
"""Compare the databases.
Raises:
TskDbDiffException: if an error occurs while diffing or dumping the database
"""
self._init_diff()
id_obj_path_table = -1
# generate the gold database dumps if necessary
if self._generate_gold_dump:
id_obj_path_table = TskDbDiff._dump_output_db_nonbb(self.gold_db_file, self.gold_dump, self.isMultiUser, self.pgSettings)
if self._generate_gold_bb_dump:
TskDbDiff._dump_output_db_bb(self.gold_db_file, self.gold_bb_dump, self.isMultiUser, self.pgSettings, id_obj_path_table)
# generate the output database dumps (both DB and BB)
id_obj_path_table = TskDbDiff._dump_output_db_nonbb(self.output_db_file, self._dump, self.isMultiUser, self.pgSettings)
TskDbDiff._dump_output_db_bb(self.output_db_file, self._bb_dump, self.isMultiUser, self.pgSettings, id_obj_path_table)
# Compare non-BB
dump_diff_pass = self._diff(self._dump, self.gold_dump, self._dump_diff)
# Compare BB
bb_dump_diff_pass = self._diff(self._bb_dump, self.gold_bb_dump, self._bb_dump_diff)
self._cleanup_diff()
return dump_diff_pass, bb_dump_diff_pass
def _init_diff(self):
"""Set up the necessary files based on the arguments given at construction"""
if self.output_dir is None:
# No stored files
self._bb_dump = TskDbDiff._get_tmp_file("BlackboardDump", ".txt")
self._bb_dump_diff = TskDbDiff._get_tmp_file("BlackboardDump-Diff", ".txt")
self._dump = TskDbDiff._get_tmp_file("DBDump", ".txt")
self._dump_diff = TskDbDiff._get_tmp_file("DBDump-Diff", ".txt")
else:
self._bb_dump = os.path.join(self.output_dir, "BlackboardDump.txt")
self._bb_dump_diff = os.path.join(self.output_dir, "BlackboardDump-Diff.txt")
self._dump = os.path.join(self.output_dir, "DBDump.txt")
self._dump_diff = os.path.join(self.output_dir, "DBDump-Diff.txt")
# Sorting gold before comparing (sort behaves differently in different environments)
new_bb = TskDbDiff._get_tmp_file("GoldBlackboardDump", ".txt")
new_db = TskDbDiff._get_tmp_file("GoldDBDump", ".txt")
if self.gold_bb_dump is not None:
srtcmdlst = ["sort", self.gold_bb_dump, "-o", new_bb]
subprocess.call(srtcmdlst)
srtcmdlst = ["sort", self.gold_dump, "-o", new_db]
subprocess.call(srtcmdlst)
self.gold_bb_dump = new_bb
self.gold_dump = new_db
def _cleanup_diff(self):
if self.output_dir is None:
#cleanup temp files
os.remove(self._dump)
os.remove(self._bb_dump)
if os.path.isfile(self._dump_diff):
os.remove(self._dump_diff)
if os.path.isfile(self._bb_dump_diff):
os.remove(self._bb_dump_diff)
        # remove the generated gold dumps if they were not supplied by the caller
        if self._generate_gold_bb_dump and os.path.isfile(self.gold_bb_dump):
            os.remove(self.gold_bb_dump)
        if self._generate_gold_dump and os.path.isfile(self.gold_dump):
            os.remove(self.gold_dump)
def _diff(self, output_file, gold_file, diff_path):
"""Compare two text files.
Args:
output_file: a pathto_File, the latest text file
gold_file: a pathto_File, the gold text file
diff_path: The file to write the differences to
        Returns:
            True if the files match; False if they differ or if either file is missing
"""
if (not os.path.isfile(output_file)):
return False
if (not os.path.isfile(gold_file)):
return False
        # It is faster to read the contents in and directly compare
        with codecs.open(output_file, "r", "utf_8") as f:
            output_data = f.read()
        with codecs.open(gold_file, "r", "utf_8") as f:
            gold_data = f.read()
if (gold_data == output_data):
return True
        # If they are different, invoke 'diff'
        # Gold needs to be passed in as 1st arg and output as 2nd
        dffcmdlst = ["diff", gold_file, output_file]
        with codecs.open(diff_path, "wb", "utf_8") as diff_file:
            subprocess.call(dffcmdlst, stdout=diff_file)
        # Create a file path for the gold file inside the output folder. In case of a diff, both the gold
        # and current-run files are then available in the report output folder; the prefix Gold- is added to the filename.
        gold_file_in_output_dir = os.path.join(os.path.dirname(output_file), "Gold-" + os.path.basename(output_file))
shutil.copy(gold_file, gold_file_in_output_dir)
return False
    @staticmethod
    def _dump_output_db_bb(db_file, bb_dump_file, isMultiUser, pgSettings, id_obj_path_table):
        """Dumps sorted blackboard artifact/attribute results to the given output location.

        Smart method that deals with a blackboard comparison to avoid issues
        with different IDs based on when artifacts were created.

        Args:
            db_file: a pathto_File, the output database.
            bb_dump_file: a pathto_File, the sorted dump file to write to
            isMultiUser: a boolean, True for PostgreSQL (multi-user) case databases
            pgSettings: a PGSettings object with the PostgreSQL connection settings (multi-user only)
            id_obj_path_table: a map from object ids to paths used to normalize TSK_PATH_ID values, or -1 if unavailable
        """
unsorted_dump = TskDbDiff._get_tmp_file("dump_data", ".txt")
if isMultiUser:
conn, unused_db = db_connect(db_file, isMultiUser, pgSettings)
artifact_cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
else: # Use Sqlite
conn = sqlite3.connect(db_file)
conn.text_factory = lambda x: x.decode("utf-8", "ignore")
conn.row_factory = sqlite3.Row
artifact_cursor = conn.cursor()
# Get the list of all artifacts (along with type and associated file)
# @@@ Could add a SORT by parent_path in here since that is how we are going to later sort it.
artifact_cursor.execute("SELECT tsk_files.parent_path, tsk_files.name, blackboard_artifact_types.display_name, blackboard_artifacts.artifact_id FROM blackboard_artifact_types INNER JOIN blackboard_artifacts ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id INNER JOIN tsk_files ON tsk_files.obj_id = blackboard_artifacts.obj_id")
database_log = codecs.open(unsorted_dump, "wb", "utf_8")
row = artifact_cursor.fetchone()
appnd = False
counter = 0
artifact_count = 0
artifact_fail = 0
# Cycle through artifacts
try:
while (row != None):
# File Name and artifact type
# Remove parent object ID from Unalloc file name
normalizedName = re.sub('^Unalloc_[0-9]+_', 'Unalloc_', row["name"])
if(row["parent_path"] != None):
database_log.write(row["parent_path"] + normalizedName + ' ')
else:
database_log.write(normalizedName + ' ')
if isMultiUser:
attribute_cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
else:
attribute_cursor = conn.cursor()
looptry = True
artifact_count += 1
try:
art_id = ""
art_id = str(row["artifact_id"])
# Get attributes for this artifact
if isMultiUser:
attribute_cursor.execute("SELECT blackboard_attributes.source, blackboard_attributes.attribute_type_id, blackboard_attribute_types.display_name, blackboard_attributes.value_type, blackboard_attributes.value_text, blackboard_attributes.value_int32, blackboard_attributes.value_int64, blackboard_attributes.value_double FROM blackboard_attributes INNER JOIN blackboard_attribute_types ON blackboard_attributes.attribute_type_id = blackboard_attribute_types.attribute_type_id WHERE artifact_id = %s ORDER BY blackboard_attributes.source, blackboard_attribute_types.display_name, blackboard_attributes.value_type, blackboard_attributes.value_text, blackboard_attributes.value_int32, blackboard_attributes.value_int64, blackboard_attributes.value_double", [art_id])
else:
attribute_cursor.execute("SELECT blackboard_attributes.source, blackboard_attributes.attribute_type_id, blackboard_attribute_types.display_name, blackboard_attributes.value_type, blackboard_attributes.value_text, blackboard_attributes.value_int32, blackboard_attributes.value_int64, blackboard_attributes.value_double FROM blackboard_attributes INNER JOIN blackboard_attribute_types ON blackboard_attributes.attribute_type_id = blackboard_attribute_types.attribute_type_id WHERE artifact_id =? ORDER BY blackboard_attributes.source, blackboard_attribute_types.display_name, blackboard_attributes.value_type, blackboard_attributes.value_text, blackboard_attributes.value_int32, blackboard_attributes.value_int64, blackboard_attributes.value_double", [art_id])
attributes = attribute_cursor.fetchall()
# Print attributes
if (len(attributes) == 0):
# @@@@ This should be
database_log.write(' \n')
row = artifact_cursor.fetchone()
continue
src = attributes[0][0]
for attr in attributes:
numvals = 0
for x in range(3, 6):
if(attr[x] != None):
numvals += 1
if(numvals > 1):
msg = "There were too many values for attribute type: " + attr["display_name"] + " for artifact with id #" + str(row["artifact_id"]) + ".\n"
if(not attr["source"] == src):
msg = "There were inconsistent sources for artifact with id #" + str(row["artifact_id"]) + ".\n"
try:
if attr["value_type"] == 0:
attr_value_as_string = str(attr["value_text"])
elif attr["value_type"] == 1:
attr_value_as_string = str(attr["value_int32"])
elif attr["value_type"] == 2:
attr_value_as_string = str(attr["value_int64"])
if attr["attribute_type_id"] == 36 and id_obj_path_table != -1 and int(attr_value_as_string) > 0: #normalize positive TSK_PATH_IDs from being object id to a path if the obj_id_path_table was generated
attr_value_as_string = id_obj_path_table[int(attr_value_as_string)]
elif attr["value_type"] == 3:
attr_value_as_string = "%20.10f" % float((attr["value_double"])) #use exact format from db schema to avoid python auto format double value to (0E-10) scientific style
elif attr["value_type"] == 4:
attr_value_as_string = "bytes"
elif attr["value_type"] == 5:
attr_value_as_string = str(attr["value_int64"])
if attr["display_name"] == "Associated Artifact":
attr_value_as_string = getAssociatedArtifactType(attribute_cursor, attr_value_as_string, isMultiUser)
patrn = re.compile("[\n\0\a\b\r\f]")
attr_value_as_string = re.sub(patrn, ' ', attr_value_as_string)
if attr["source"] == "Keyword Search" and attr["display_name"] == "Keyword Preview":
attr_value_as_string = ""
                            # Record the attribute on this artifact's line; the exact format only needs to be
                            # deterministic and consistent between the gold and output dumps.
                            database_log.write('<attr source="' + str(attr["source"]) + '" type="' + attr["display_name"] + '" value="' + attr_value_as_string + '" />')
except IOError as e:
print("IO error")
raise TskDbDiffException("Unexpected IO error while writing to database log." + str(e))
except sqlite3.Error as e:
msg = "Attributes in artifact id (in output DB)# " + str(row["artifact_id"]) + " encountered an error: " + str(e) +" .\n"
print("Attributes in artifact id (in output DB)# ", str(row["artifact_id"]), " encountered an error: ", str(e))
print()
looptry = False
artifact_fail += 1
database_log.write('Error Extracting Attributes')
database_log.close()
raise TskDbDiffException(msg)
finally:
attribute_cursor.close()
# @@@@ This should be
database_log.write(' \n')
row = artifact_cursor.fetchone()
if(artifact_fail > 0):
msg ="There were " + str(artifact_count) + " artifacts and " + str(artifact_fail) + " threw an exception while loading.\n"
except Exception as e:
raise TskDbDiffException("Unexpected error while dumping blackboard database: " + str(e))
finally:
database_log.close()
artifact_cursor.close()
conn.close()
# Now sort the file
srtcmdlst = ["sort", unsorted_dump, "-o", bb_dump_file]
subprocess.call(srtcmdlst)
    @staticmethod
    def _dump_output_db_nonbb(db_file, dump_file, isMultiUser, pgSettings):
        """Dumps a database to a text file.

        Does not dump the artifacts and attributes.

        Args:
            db_file: a pathto_File, the database file to dump
            dump_file: a pathto_File, the location to dump the non-blackboard database items
            isMultiUser: a boolean, True for PostgreSQL (multi-user) case databases
            pgSettings: a PGSettings object with the PostgreSQL connection settings (multi-user only)

        Returns:
            a map from object ids to paths, used later to normalize TSK_PATH_ID attribute values
        """
conn, backup_db_file = db_connect(db_file, isMultiUser, pgSettings)
id_files_table = build_id_files_table(conn.cursor(), isMultiUser)
id_vs_parts_table = build_id_vs_parts_table(conn.cursor(), isMultiUser)
id_vs_info_table = build_id_vs_info_table(conn.cursor(), isMultiUser)
id_fs_info_table = build_id_fs_info_table(conn.cursor(), isMultiUser)
id_objects_table = build_id_objects_table(conn.cursor(), isMultiUser)
id_artifact_types_table = build_id_artifact_types_table(conn.cursor(), isMultiUser)
id_legacy_artifact_types = build_id_legacy_artifact_types_table(conn.cursor(), isMultiUser)
id_reports_table = build_id_reports_table(conn.cursor(), isMultiUser)
id_images_table = build_id_image_names_table(conn.cursor(), isMultiUser)
id_accounts_table = build_id_accounts_table(conn.cursor(), isMultiUser)
id_obj_path_table = build_id_obj_path_table(id_files_table, id_objects_table, id_artifact_types_table, id_reports_table, id_images_table, id_accounts_table)
if isMultiUser: # Use PostgreSQL
os.environ['PGPASSWORD']=pgSettings.password
            pgDump = ["pg_dump", "--inserts", "-U", pgSettings.username, "-h", pgSettings.pgHost, "-p", str(pgSettings.pgPort), "-d", db_file, "-E", "utf-8", "-T", "blackboard_artifacts", "-T", "blackboard_attributes", "-f", "postgreSQLDump.sql"]
subprocess.call(pgDump)
postgreSQL_db = codecs.open("postgreSQLDump.sql", "r", "utf-8")
# Write to the database dump
with codecs.open(dump_file, "wb", "utf_8") as db_log:
dump_line = ''
for line in postgreSQL_db:
line = line.strip('\r\n ')
# Deal with pg_dump result file
if (line.startswith('--') or line.lower().startswith('alter') or "pg_catalog" in line or "idle_in_transaction_session_timeout" in line or not line): # It's comment or alter statement or catalog entry or set idle entry or empty line
continue
elif not line.endswith(';'): # Statement not finished
dump_line += line
continue
else:
dump_line += line
if 'INSERT INTO image_gallery_groups_seen' in dump_line:
dump_line = ''
continue;
dump_line = normalize_db_entry(dump_line, id_obj_path_table, id_vs_parts_table, id_vs_info_table, id_fs_info_table, id_objects_table, id_reports_table, id_images_table, id_legacy_artifact_types, id_accounts_table)
db_log.write('%s\n' % dump_line)
dump_line = ''
postgreSQL_db.close()
else: # use Sqlite
# Delete the blackboard tables
conn.text_factory = lambda x: x.decode("utf-8", "ignore")
conn.execute("DROP TABLE blackboard_artifacts")
conn.execute("DROP TABLE blackboard_attributes")
# Write to the database dump
with codecs.open(dump_file, "wb", "utf_8") as db_log:
for line in conn.iterdump():
if 'INSERT INTO "image_gallery_groups_seen"' in line:
continue
line = normalize_db_entry(line, id_obj_path_table, id_vs_parts_table, id_vs_info_table, id_fs_info_table, id_objects_table, id_reports_table, id_images_table, id_legacy_artifact_types, id_accounts_table)
db_log.write('%s\n' % line)
# Now sort the file
srtcmdlst = ["sort", dump_file, "-o", dump_file]
subprocess.call(srtcmdlst)
conn.close()
# cleanup the backup
if backup_db_file:
os.remove(backup_db_file)
return id_obj_path_table
    @staticmethod
    def dump_output_db(db_file, dump_file, bb_dump_file, isMultiUser, pgSettings):
        """Dumps the given database to text files for later comparison.

        Args:
            db_file: a pathto_File, the database file to dump
            dump_file: a pathto_File, the location to dump the non-blackboard database items
            bb_dump_file: a pathto_File, the location to dump the blackboard database items
            isMultiUser: a boolean, True for PostgreSQL (multi-user) case databases
            pgSettings: a PGSettings object with the PostgreSQL connection settings (multi-user only)
        """
id_obj_path_table = TskDbDiff._dump_output_db_nonbb(db_file, dump_file, isMultiUser, pgSettings)
TskDbDiff._dump_output_db_bb(db_file, bb_dump_file, isMultiUser, pgSettings, id_obj_path_table)
    @staticmethod
    def _get_tmp_file(base, ext):
        time = datetime.datetime.now().time().strftime("%H%M%f")
        # Prefer the TMP environment variable as before, but fall back to the platform temp dir if it is unset
        return os.path.join(os.environ.get('TMP', tempfile.gettempdir()), base + time + ext)
class TskDbDiffException(Exception):
pass
class PGSettings(object):
def __init__(self, pgHost=None, pgPort=5432, user=None, password=None):
self.pgHost = pgHost
self.pgPort = pgPort
self.username = user
self.password = password
def get_pgHost(self):
return self.pgHost
def get_pgPort(self):
return self.pgPort
def get_username(self):
return self.username
def get_password(self):
return self.password
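
# An illustrative multi-user sketch: for PostgreSQL cases the databases are referenced by name rather
# than file path, and the connection settings are wrapped in a PGSettings object (the host and
# credentials below are placeholders, not real values):
#
#   pg = PGSettings(pgHost="localhost", pgPort="5432", user="postgres", password="...")
#   diff = TskDbDiff("output_case_db_name", "gold_case_db_name", output_dir="diff_out",
#                    isMultiUser=True, pgSettings=pg)
#   dump_passed, bb_dump_passed = diff.run_diff()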
def get_sqlite_table_columns(conn) -> Dict[str, List[str]]:
"""
Retrieves the sqlite public tables and columns from a sqlite connection.
Args:
conn: The sqlite connection.
    Returns: A mapping of each table name to the list of its column names, in ordinal (column) order.
"""
cur = conn.cursor()
cur.execute("SELECT name FROM sqlite_master tables WHERE tables.type='table'")
tables = list([table[0] for table in cur.fetchall()])
cur.close()
to_ret = {}
for table in tables:
cur = conn.cursor()
cur.execute('SELECT name FROM pragma_table_info(?) ORDER BY cid', [table])
to_ret[table] = list([col[0] for col in cur.fetchall()])
cur.close()
return to_ret
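
# For example (hypothetical database path), a case database schema could be printed with:
#
#   conn = sqlite3.connect("autopsy.db")
#   for table, cols in get_sqlite_table_columns(conn).items():
#       print(table, cols)
#   conn.close()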
def get_pg_table_columns(conn) -> Dict[str, List[str]]:
"""
Retrieves the postgres public tables and columns from a pg connection.
Args:
conn: The pg connection.
    Returns: A mapping of each table name to the list of its column names, in ordinal (column) order.
"""
cursor = conn.cursor()
cursor.execute("""
SELECT cols.table_name, cols.column_name
FROM information_schema.columns cols
WHERE cols.column_name IS NOT NULL
AND cols.table_name IS NOT NULL
AND cols.table_name IN (
SELECT tables.tablename FROM pg_catalog.pg_tables tables
WHERE LOWER(schemaname) = 'public'
)
ORDER by cols.table_name, cols.ordinal_position;
""")
mapping = {}
for row in cursor:
mapping.setdefault(row[0], []).append(row[1])
cursor.close()
return mapping
def normalize_db_entry(line, files_table, vs_parts_table, vs_info_table, fs_info_table, objects_table, reports_table, images_table, artifact_table, accounts_table):
""" Make testing more consistent and reasonable by doctoring certain db entries.
Args:
line: a String, the line to remove the object id from.
files_table: a map from object ids to file paths.
"""
    # SQLite INSERT statements quote the table name with double quotes, PostgreSQL's do not. We check both forms when normalizing.
files_index = line.find('INSERT INTO "tsk_files"') > -1 or line.find('INSERT INTO tsk_files ') > -1
path_index = line.find('INSERT INTO "tsk_files_path"') > -1 or line.find('INSERT INTO tsk_files_path ') > -1
object_index = line.find('INSERT INTO "tsk_objects"') > -1 or line.find('INSERT INTO tsk_objects ') > -1
vs_parts_index = line.find('INSERT INTO "tsk_vs_parts"') > -1 or line.find('INSERT INTO tsk_vs_parts ') > -1
report_index = line.find('INSERT INTO "reports"') > -1 or line.find('INSERT INTO reports ') > -1
layout_index = line.find('INSERT INTO "tsk_file_layout"') > -1 or line.find('INSERT INTO tsk_file_layout ') > -1
data_source_info_index = line.find('INSERT INTO "data_source_info"') > -1 or line.find('INSERT INTO data_source_info ') > -1
event_description_index = line.find('INSERT INTO "tsk_event_descriptions"') > -1 or line.find('INSERT INTO tsk_event_descriptions ') > -1
events_index = line.find('INSERT INTO "tsk_events"') > -1 or line.find('INSERT INTO tsk_events ') > -1
ingest_job_index = line.find('INSERT INTO "ingest_jobs"') > -1 or line.find('INSERT INTO ingest_jobs ') > -1
examiners_index = line.find('INSERT INTO "tsk_examiners"') > -1 or line.find('INSERT INTO tsk_examiners ') > -1
ig_groups_index = line.find('INSERT INTO "image_gallery_groups"') > -1 or line.find('INSERT INTO image_gallery_groups ') > -1
ig_groups_seen_index = line.find('INSERT INTO "image_gallery_groups_seen"') > -1 or line.find('INSERT INTO image_gallery_groups_seen ') > -1
os_account_index = line.find('INSERT INTO "tsk_os_accounts"') > -1 or line.find('INSERT INTO tsk_os_accounts') > -1
os_account_attr_index = line.find('INSERT INTO "tsk_os_account_attributes"') > -1 or line.find('INSERT INTO tsk_os_account_attributes') > -1
os_account_instances_index = line.find('INSERT INTO "tsk_os_account_instances"') > -1 or line.find('INSERT INTO tsk_os_account_instances') > -1
data_artifacts_index = line.find('INSERT INTO "tsk_data_artifacts"') > -1 or line.find('INSERT INTO tsk_data_artifacts') > -1
parens = line[line.find('(') + 1 : line.rfind(')')]
no_space_parens = parens.replace(" ", "")
fields_list = list(csv.reader([no_space_parens], quotechar="'"))[0]
#Add back in the quotechar for values that were originally wrapped (csv reader consumes this character)
fields_list_with_quotes = []
ptr = 0
for field in fields_list:
if(len(field) == 0):
field = "'" + field + "'"
else:
start = no_space_parens.find(field, ptr)
if((start - 1) >= 0 and no_space_parens[start - 1] == '\''):
if((start + len(field)) < len(no_space_parens) and no_space_parens[start + len(field)] == '\''):
field = "'" + field + "'"
fields_list_with_quotes.append(field)
if(ptr > 0):
#Add one for each comma that is used to separate values in the original string
ptr+=1
ptr += len(field)
fields_list = fields_list_with_quotes
# remove object ID
if files_index:
# Ignore TIFF size and hash if extracted from PDFs.
# See JIRA-6951 for more details.
# index -3 = 3rd from the end, which is extension
# index -5 = 5th from the end, which is the parent path.
if fields_list[-3] == "'tif'" and fields_list[-5].endswith(".pdf/'"):
fields_list[15] = "'SIZE_IGNORED'"
fields_list[23] = "'MD5_IGNORED'"
fields_list[24] = "'SHA256_IGNORED'"
newLine = ('INSERT INTO "tsk_files" VALUES(' + ', '.join(fields_list[1:-1]) + ');') #leave off first (object id) and last (os_account_id) field
# Remove object ID from Unalloc file name
newLine = re.sub('Unalloc_[0-9]+_', 'Unalloc_', newLine)
return newLine
# remove object ID
elif vs_parts_index:
newLine = ('INSERT INTO "tsk_vs_parts" VALUES(' + ', '.join(fields_list[1:]) + ');')
return newLine
# remove group ID
elif ig_groups_index:
newLine = ('INSERT INTO "image_gallery_groups" VALUES(' + ', '.join(fields_list[1:]) + ');')
return newLine
#remove id field
elif ig_groups_seen_index:
# Only removing the id and group_id fields for now. May need to care about examiner_id and seen fields in future.
newLine = ('INSERT INTO "image_gallery_groups_seen" VALUES(' + ', '.join(fields_list[2:]) + ');')
return newLine
# remove object ID
elif path_index:
obj_id = int(fields_list[0])
objValue = files_table[obj_id]
# remove the obj_id from ModuleOutput/EmbeddedFileExtractor directory
        extractor_idx = fields_list[1].find('EmbeddedFileExtractor')
        if extractor_idx > -1:
            idx_pre = extractor_idx + len('EmbeddedFileExtractor')
            idx_pos = fields_list[1].find('\\', idx_pre + 2)
            dir_to_replace = fields_list[1][idx_pre + 1 : idx_pos] # +1 to skip the file separator
            dir_to_replace = dir_to_replace[0:dir_to_replace.rfind('_')]
            pathValue = fields_list[1][:idx_pre + 1] + dir_to_replace + fields_list[1][idx_pos:]
        else:
            pathValue = fields_list[1]
        # remove the host name prefix (multi-user/PostgreSQL cases) from the path
        module_output_idx = pathValue.find('ModuleOutput')
        if module_output_idx > -1:
            pathValue = "'" + pathValue[module_output_idx:] # postgres paths include the hostname before ModuleOutput
newLine = ('INSERT INTO "tsk_files_path" VALUES(' + objValue + ', ' + pathValue + ', ' + ', '.join(fields_list[2:]) + ');')
return newLine
# remove object ID
elif layout_index:
obj_id = fields_list[0]
path= files_table[int(obj_id)]
newLine = ('INSERT INTO "tsk_file_layout" VALUES(' + path + ', ' + ', '.join(fields_list[1:]) + ');')
# Remove object ID from Unalloc file name
newLine = re.sub('Unalloc_[0-9]+_', 'Unalloc_', newLine)
return newLine
# remove object ID
elif object_index:
obj_id = fields_list[0]
parent_id = fields_list[1]
newLine = 'INSERT INTO "tsk_objects" VALUES('
path = None
parent_path = None
        # if obj_id or parent_id is an invalid literal, simply return the line as is
try:
obj_id = int(obj_id)
if parent_id != 'NULL':
parent_id = int(parent_id)
except Exception as e:
print(obj_id, parent_id)
return line
if obj_id in files_table.keys():
path = files_table[obj_id]
elif obj_id in vs_parts_table.keys():
path = vs_parts_table[obj_id]
elif obj_id in vs_info_table.keys():
path = vs_info_table[obj_id]
elif obj_id in fs_info_table.keys():
path = fs_info_table[obj_id]
elif obj_id in reports_table.keys():
path = reports_table[obj_id]
# remove host name (for multi-user) and dates/times from path for reports
if path is not None:
if 'ModuleOutput' in path:
# skip past the host name (if any)
path = path[path.find('ModuleOutput'):]
if 'BulkExtractor' in path or 'Smirk' in path:
# chop off the last folder (which contains a date/time)
path = path[:path.rfind('\\')]
if 'Reports\\AutopsyTestCase HTML Report' in path:
path = 'Reports\\AutopsyTestCase HTML Report'
if parent_id in files_table.keys():
parent_path = files_table[parent_id]
elif parent_id in vs_parts_table.keys():
parent_path = vs_parts_table[parent_id]
elif parent_id in vs_info_table.keys():
parent_path = vs_info_table[parent_id]
elif parent_id in fs_info_table.keys():
parent_path = fs_info_table[parent_id]
elif parent_id in images_table.keys():
parent_path = images_table[parent_id]
elif parent_id in accounts_table.keys():
parent_path = accounts_table[parent_id]
elif parent_id == 'NULL':
parent_path = "NULL"
# Remove host name (for multi-user) from parent_path
if parent_path is not None:
if 'ModuleOutput' in parent_path:
# skip past the host name (if any)
parent_path = parent_path[parent_path.find('ModuleOutput'):]
if path and parent_path:
# Remove object ID from Unalloc file names and regripper output
            path = re.sub('Unalloc_[0-9]+_', 'Unalloc_', path)
            path = re.sub(r'regripper-[0-9]+-full', 'regripper-full', path)
            parent_path = re.sub('Unalloc_[0-9]+_', 'Unalloc_', parent_path)
            parent_path = re.sub(r'regripper-[0-9]+-full', 'regripper-full', parent_path)
return newLine + path + ', ' + parent_path + ', ' + ', '.join(fields_list[2:]) + ');'
else:
return newLine + '"OBJECT IDS OMITTED", ' + ', '.join(fields_list[2:]) + ');' #omit parent object id and object id when we cant annonymize them
# remove time-based information, ie Test_6/11/14 -> Test
elif report_index:
fields_list[1] = "AutopsyTestCase"
fields_list[2] = "0"
newLine = ('INSERT INTO "reports" VALUES(' + ','.join(fields_list[1:]) + ');') # remove report_id
return newLine
elif data_source_info_index:
fields_list[1] = "{device id}"
fields_list[4] = "{dateTime}"
newLine = ('INSERT INTO "data_source_info" VALUES(' + ','.join(fields_list) + ');')
return newLine
elif ingest_job_index:
fields_list[2] = "{host_name}"
start_time = int(fields_list[3])
end_time = int(fields_list[4])
if (start_time <= end_time):
fields_list[3] = "0"
fields_list[4] = "0"
newLine = ('INSERT INTO "ingest_jobs" VALUES(' + ','.join(fields_list) + ');')
return newLine
elif examiners_index:
fields_list[1] = "{examiner_name}"
newLine = ('INSERT INTO "tsk_examiners" VALUES(' + ','.join(fields_list) + ');')
return newLine
# remove all timing dependent columns from events table
elif events_index:
newLine = ('INSERT INTO "tsk_events" VALUES(' + ','.join(fields_list[1:2]) + ');')
return newLine
# remove object ids from event description table
elif event_description_index:
# replace object ids with information that is deterministic
file_obj_id = int(fields_list[5])
object_id = int(fields_list[4])
legacy_artifact_id = 'NULL'
if (fields_list[6] != 'NULL'):
legacy_artifact_id = int(fields_list[6])
if file_obj_id != 'NULL' and file_obj_id in files_table.keys():
fields_list[5] = files_table[file_obj_id]
if object_id != 'NULL' and object_id in files_table.keys():
fields_list[4] = files_table[object_id]
if legacy_artifact_id != 'NULL' and legacy_artifact_id in artifact_table.keys():
fields_list[6] = artifact_table[legacy_artifact_id]
if fields_list[1] == fields_list[2] and fields_list[1] == fields_list[3]:
fields_list[1] = cleanupEventDescription(fields_list[1])
fields_list[2] = cleanupEventDescription(fields_list[2])
fields_list[3] = cleanupEventDescription(fields_list[3])
newLine = ('INSERT INTO "tsk_event_descriptions" VALUES(' + ','.join(fields_list[1:]) + ');') # remove report_id
return newLine
elif os_account_index:
newLine = ('INSERT INTO "tsk_os_accounts" VALUES(' + ','.join(fields_list[1:]) + ');') # remove id since value that would be substituted is in diff line already
return newLine
elif os_account_attr_index:
        # substitute the account object id with a non-changing value
os_account_id = int(fields_list[1])
fields_list[1] = accounts_table[os_account_id]
        # substitute the source object id with a non-changing value
source_obj_id = int(fields_list[3])
if source_obj_id in files_table.keys():
fields_list[3] = files_table[source_obj_id]
elif source_obj_id in vs_parts_table.keys():
fields_list[3] = vs_parts_table[source_obj_id]
elif source_obj_id in vs_info_table.keys():
fields_list[3] = vs_info_table[source_obj_id]
elif source_obj_id in fs_info_table.keys():
fields_list[3] = fs_info_table[source_obj_id]
elif source_obj_id in images_table.keys():
fields_list[3] = images_table[source_obj_id]
elif source_obj_id in accounts_table.keys():
fields_list[3] = accounts_table[source_obj_id]
elif source_obj_id == 'NULL':
fields_list[3] = "NULL"
newLine = ('INSERT INTO "tsk_os_account_attributes" VALUES(' + ','.join(fields_list[1:]) + ');') # remove id
return newLine
elif os_account_instances_index:
os_account_id = int(fields_list[1])
fields_list[1] = accounts_table[os_account_id]
newLine = ('INSERT INTO "tsk_os_account_instances" VALUES(' + ','.join(fields_list[1:]) + ');') # remove id
return newLine
elif data_artifacts_index:
art_obj_id = int(fields_list[0])
if art_obj_id in files_table.keys():
fields_list[0] = files_table[art_obj_id]
else:
fields_list[0] = 'Artifact Object ID Omitted'
account_obj_id = int(fields_list[1])
if account_obj_id in files_table.keys():
fields_list[1] = files_table[account_obj_id]
else:
fields_list[1] = 'Account Object ID Omitted'
newLine = ('INSERT INTO "tsk_data_artifacts" VALUES(' + ','.join(fields_list[:]) + ');') # remove ids
return newLine
else:
return line
def cleanupEventDescription(description):
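    """Strip the trailing numeric id from an event description of the form '<text>:<number>' so that
    run-specific ids (e.g. a hypothetical "'Shell Bags:123'" becoming "'Shell Bags:'") do not show up in the diff."""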
    test = re.search(r"^'\D+:\d+'$", description)
    if test is not None:
        return re.sub(r":\d+", ":", description)
else:
return description
def getAssociatedArtifactType(cur, artifact_id, isMultiUser):
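    """Look up the file path and artifact type for the given artifact id. Used to replace
    "Associated Artifact" attribute values (which are run-specific ids) with stable text."""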
if isMultiUser:
cur.execute("SELECT tsk_files.parent_path, blackboard_artifact_types.display_name FROM blackboard_artifact_types INNER JOIN blackboard_artifacts ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id INNER JOIN tsk_files ON tsk_files.obj_id = blackboard_artifacts.obj_id WHERE artifact_id=%s",[artifact_id])
else:
cur.execute("SELECT tsk_files.parent_path, blackboard_artifact_types.display_name FROM blackboard_artifact_types INNER JOIN blackboard_artifacts ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id INNER JOIN tsk_files ON tsk_files.obj_id = blackboard_artifacts.obj_id WHERE artifact_id=?",[artifact_id])
info = cur.fetchone()
return "File path: " + info[0] + " Artifact Type: " + info[1]
def build_id_files_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to file paths.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id, parent path, and name, then create a tuple in the dictionary
# with the object id as the key and the full file path (parent + name) as the value
mapping = dict([(row[0], str(row[1]) + str(row[2])) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, parent_path, name FROM tsk_files")])
return mapping
def build_id_vs_parts_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to vs_parts.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id, addr, and start, then create a tuple in the dictionary
# with the object id as the key and (addr + start) as the value
mapping = dict([(row[0], str(row[1]) + '_' + str(row[2])) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, addr, start FROM tsk_vs_parts")])
return mapping
def build_id_vs_info_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to vs_info.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id, vs_type, and img_offset, then create a tuple in the dictionary
# with the object id as the key and (vs_type + img_offset) as the value
mapping = dict([(row[0], str(row[1]) + '_' + str(row[2])) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, vs_type, img_offset FROM tsk_vs_info")])
return mapping
def build_id_fs_info_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to fs_info.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id, img_offset, and fs_type, then create a tuple in the dictionary
# with the object id as the key and (img_offset + fs_type) as the value
mapping = dict([(row[0], str(row[1]) + '_' + str(row[2])) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, img_offset, fs_type FROM tsk_fs_info")])
return mapping
def build_id_objects_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to par_id.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id, par_obj_id, then create a tuple in the dictionary
# with the object id as the key and par_obj_id, type as the value
mapping = dict([(row[0], [row[1], row[2]]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT * FROM tsk_objects")])
return mapping
def build_id_image_names_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to name.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id and name then create a tuple in the dictionary
# with the object id as the key and name, type as the value
mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, name FROM tsk_image_names WHERE sequence=0")])
#data_sources which are logical file sets will be found in the files table
return mapping
def build_id_artifact_types_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to artifact ids.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id, par_obj_id, then create a tuple in the dictionary
# with the object id as the key and artifact type as the value
mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT blackboard_artifacts.artifact_obj_id, blackboard_artifact_types.type_name FROM blackboard_artifacts INNER JOIN blackboard_artifact_types ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id ")])
return mapping
def build_id_legacy_artifact_types_table(db_cursor, isPostgreSQL):
"""Build the map of legacy artifact ids to artifact type.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the legacy artifact id then create a tuple in the dictionary
# with the artifact id as the key and artifact type as the value
mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT blackboard_artifacts.artifact_id, blackboard_artifact_types.type_name FROM blackboard_artifacts INNER JOIN blackboard_artifact_types ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id ")])
return mapping
def build_id_reports_table(db_cursor, isPostgreSQL):
"""Build the map of report object ids to report path.
Args:
db_cursor: the database cursor
"""
# for each row in the reports table in the db, create an obj_id -> path map
mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, path FROM reports")])
return mapping
def build_id_accounts_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to OS account SIDs.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id and account SID then creates a tuple in the dictionary
# with the object id as the key and the OS Account's SID as the value
mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT os_account_obj_id, addr FROM tsk_os_accounts")])
return mapping
def build_id_obj_path_table(files_table, objects_table, artifacts_table, reports_table, images_table, accounts_table):
"""Build the map of object ids to artifact ids.
Args:
files_table: obj_id, path
objects_table: obj_id, par_obj_id, type
artifacts_table: obj_id, artifact_type_name
reports_table: obj_id, path
images_table: obj_id, name
accounts_table: obj_id, addr
"""
# make a copy of files_table and update it with new data from artifacts_table and reports_table
mapping = files_table.copy()
for k, v in objects_table.items():
path = ""
if k not in mapping.keys(): # If the mapping table doesn't have data for obj_id
if k in reports_table.keys(): # For a report we use the report path
par_obj_id = v[0]
if par_obj_id is not None:
mapping[k] = reports_table[k]
            elif k in artifacts_table.keys(): # For an artifact we use its par_obj_id's path+name plus its artifact type name
par_obj_id = v[0] # The parent of an artifact can be a file or a report
if par_obj_id in mapping.keys():
path = mapping[par_obj_id]
elif par_obj_id in reports_table.keys():
path = reports_table[par_obj_id]
elif par_obj_id in images_table.keys():
path = images_table[par_obj_id]
mapping[k] = path + "/" + artifacts_table[k]
elif k in accounts_table.keys(): # For an OS Account object ID we use its addr field which is the account SID
mapping[k] = accounts_table[k]
        elif v[0] not in mapping.keys():
            if v[0] in artifacts_table.keys(): # The parent is an artifact: use the grandparent's path plus the parent's artifact type name
                par_obj_id = objects_table[v[0]][0]
                path = mapping[par_obj_id]
                mapping[k] = path + "/" + artifacts_table[v[0]]
return mapping
def db_connect(db_file, isMultiUser, pgSettings=None):
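    """Open a connection to the case database.

    Returns a (connection, backup_db_file) pair. For SQLite, the connection is to a temporary copy of
    the database and backup_db_file is the path of that copy (the caller is expected to delete it);
    for PostgreSQL, backup_db_file is None.
    """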
if isMultiUser: # use PostgreSQL
try:
return psycopg2.connect("dbname=" + db_file + " user=" + pgSettings.username + " host=" + pgSettings.pgHost + " password=" + pgSettings.password), None
        except Exception as e:
            print("Failed to connect to the database: " + db_file)
            raise TskDbDiffException("Failed to connect to the database: " + db_file + " (" + str(e) + ")")
else: # Sqlite
# Make a copy that we can modify
backup_db_file = TskDbDiff._get_tmp_file("tsk_backup_db", ".db")
shutil.copy(db_file, backup_db_file)
# We sometimes get situations with messed up permissions
os.chmod (backup_db_file, 0o777)
return sqlite3.connect(backup_db_file), backup_db_file
def sql_select_execute(cursor, isPostgreSQL, sql_stmt):
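    """Execute a SELECT statement and return an iterable of result rows: a fetched list for PostgreSQL,
    or the cursor itself (which iterates over rows) for SQLite."""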
if isPostgreSQL:
cursor.execute(sql_stmt)
return cursor.fetchall()
else:
return cursor.execute(sql_stmt)
def main():
try:
sys.argv.pop(0)
output_db = sys.argv.pop(0)
gold_db = sys.argv.pop(0)
    except IndexError:
print("usage: tskdbdiff [OUTPUT DB PATH] [GOLD DB PATH]")
sys.exit(1)
db_diff = TskDbDiff(output_db, gold_db, output_dir=".")
dump_passed, bb_dump_passed = db_diff.run_diff()
if dump_passed and bb_dump_passed:
print("Database comparison passed.")
if not dump_passed:
print("Non blackboard database comparison failed.")
if not bb_dump_passed:
print("Blackboard database comparison failed.")
    sys.exit(0 if (dump_passed and bb_dump_passed) else 1)
if __name__ == "__main__":
if sys.hexversion < 0x03000000:
print("Python 3 required")
sys.exit(1)
main()