# Requires python3
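#
# Compares an output Autopsy/TSK case database against a gold standard database by dumping both
# to sorted text files (one for blackboard artifacts/attributes, one for everything else),
# normalizing run-specific values (object ids, host names, dates/times), and diffing the results.
# Command-line usage: tskdbdiff [OUTPUT DB PATH] [GOLD DB PATH]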
import re
import sqlite3
import subprocess
import shutil
import os
import codecs
import datetime
import tempfile
import sys
from typing import Dict, List
import psycopg2
import psycopg2.extras
import socket
import csv
class TskDbDiff(object):
"""Compares two TSK/Autospy SQLite databases.
Attributes:
gold_artifacts:
autopsy_artifacts:
gold_attributes:
autopsy_attributes:
gold_objects:
autopsy_objects:
artifact_comparison:
        attribute_comparison:
report_errors: a listof_listof_String, the error messages that will be
printed to screen in the run_diff method
passed: a boolean, did the diff pass?
autopsy_db_file:
gold_db_file:
"""
def __init__(self, output_db, gold_db, output_dir=None, gold_bb_dump=None, gold_dump=None, verbose=False, isMultiUser=False, pgSettings=None):
"""Constructor for TskDbDiff.
Args:
            output_db: path to the output database (non-gold standard)
            gold_db: path to the gold database
            output_dir: (optional) path to the folder where generated files will be put
            gold_bb_dump: (optional) path to the file where the gold blackboard dump is located
            gold_dump: (optional) path to the file where the gold non-blackboard dump is located
            verbose: (optional) a boolean; if True, diff results are sent to stdout
            isMultiUser: (optional) a boolean; if True, the databases are PostgreSQL (multi-user) case databases
            pgSettings: (optional) a PGSettings object with the PostgreSQL connection settings; required when isMultiUser is True
        """
self.output_db_file = output_db
self.gold_db_file = gold_db
self.output_dir = output_dir
self.gold_bb_dump = gold_bb_dump
self.gold_dump = gold_dump
self._generate_gold_dump = False
self._generate_gold_bb_dump = False
self._bb_dump_diff = ""
self._dump_diff = ""
self._bb_dump = ""
self._dump = ""
self.verbose = verbose
self.isMultiUser = isMultiUser
self.pgSettings = pgSettings
if self.isMultiUser and not self.pgSettings:
print("Missing PostgreSQL database connection settings data.")
sys.exit(1)
if self.gold_bb_dump is None:
self._generate_gold_bb_dump = True
if self.gold_dump is None:
self._generate_gold_dump = True
def run_diff(self):
"""Compare the databases.
Raises:
TskDbDiffException: if an error occurs while diffing or dumping the database
"""
self._init_diff()
id_obj_path_table = -1
# generate the gold database dumps if necessary
if self._generate_gold_dump:
id_obj_path_table = TskDbDiff._dump_output_db_nonbb(self.gold_db_file, self.gold_dump, self.isMultiUser, self.pgSettings)
if self._generate_gold_bb_dump:
TskDbDiff._dump_output_db_bb(self.gold_db_file, self.gold_bb_dump, self.isMultiUser, self.pgSettings, id_obj_path_table)
# generate the output database dumps (both DB and BB)
id_obj_path_table = TskDbDiff._dump_output_db_nonbb(self.output_db_file, self._dump, self.isMultiUser, self.pgSettings)
TskDbDiff._dump_output_db_bb(self.output_db_file, self._bb_dump, self.isMultiUser, self.pgSettings, id_obj_path_table)
# Compare non-BB
dump_diff_pass = self._diff(self._dump, self.gold_dump, self._dump_diff)
# Compare BB
bb_dump_diff_pass = self._diff(self._bb_dump, self.gold_bb_dump, self._bb_dump_diff)
self._cleanup_diff()
return dump_diff_pass, bb_dump_diff_pass
def _init_diff(self):
"""Set up the necessary files based on the arguments given at construction"""
if self.output_dir is None:
# No stored files
self._bb_dump = TskDbDiff._get_tmp_file("BlackboardDump", ".txt")
self._bb_dump_diff = TskDbDiff._get_tmp_file("BlackboardDump-Diff", ".txt")
self._dump = TskDbDiff._get_tmp_file("DBDump", ".txt")
self._dump_diff = TskDbDiff._get_tmp_file("DBDump-Diff", ".txt")
else:
self._bb_dump = os.path.join(self.output_dir, "BlackboardDump.txt")
self._bb_dump_diff = os.path.join(self.output_dir, "BlackboardDump-Diff.txt")
self._dump = os.path.join(self.output_dir, "DBDump.txt")
self._dump_diff = os.path.join(self.output_dir, "DBDump-Diff.txt")
# Sorting gold before comparing (sort behaves differently in different environments)
new_bb = TskDbDiff._get_tmp_file("GoldBlackboardDump", ".txt")
new_db = TskDbDiff._get_tmp_file("GoldDBDump", ".txt")
if self.gold_bb_dump is not None:
srtcmdlst = ["sort", self.gold_bb_dump, "-o", new_bb]
subprocess.call(srtcmdlst)
srtcmdlst = ["sort", self.gold_dump, "-o", new_db]
subprocess.call(srtcmdlst)
self.gold_bb_dump = new_bb
self.gold_dump = new_db
def _cleanup_diff(self):
if self.output_dir is None:
#cleanup temp files
os.remove(self._dump)
os.remove(self._bb_dump)
if os.path.isfile(self._dump_diff):
os.remove(self._dump_diff)
if os.path.isfile(self._bb_dump_diff):
os.remove(self._bb_dump_diff)
        # remove the generated gold dumps if they were not supplied by the caller
        if self._generate_gold_bb_dump and os.path.isfile(self.gold_bb_dump):
            os.remove(self.gold_bb_dump)
        if self._generate_gold_dump and os.path.isfile(self.gold_dump):
            os.remove(self.gold_dump)
def _diff(self, output_file, gold_file, diff_path):
"""Compare two text files.
Args:
output_file: a pathto_File, the latest text file
gold_file: a pathto_File, the gold text file
diff_path: The file to write the differences to
        Returns:
            True if the files match; False if they differ or if either file is missing
"""
if (not os.path.isfile(output_file)):
return False
if (not os.path.isfile(gold_file)):
return False
        # It is faster to read the contents in and directly compare
        with codecs.open(output_file, "r", "utf_8") as f:
            output_data = f.read()
        with codecs.open(gold_file, "r", "utf_8") as f:
            gold_data = f.read()
if (gold_data == output_data):
return True
        # If they are different, invoke 'diff'
        # Gold needs to be passed in as 1st arg and output as 2nd
        dffcmdlst = ["diff", gold_file, output_file]
        with codecs.open(diff_path, "wb", "utf_8") as diff_file:
            subprocess.call(dffcmdlst, stdout=diff_file)
        # Create a file path for the gold file inside the output folder. In case of a diff, both the gold
        # and current-run files are then available in the report output folder; the prefix Gold- is added to the filename.
        gold_file_in_output_dir = os.path.join(os.path.dirname(output_file), "Gold-" + os.path.basename(output_file))
shutil.copy(gold_file, gold_file_in_output_dir)
return False
    @staticmethod
    def _dump_output_db_bb(db_file, bb_dump_file, isMultiUser, pgSettings, id_obj_path_table):
        """Dumps sorted blackboard artifact/attribute results to the given output location.

        Smart method that deals with a blackboard comparison to avoid issues
        with different IDs based on when artifacts were created.

        Args:
            db_file: a pathto_File, the output database.
            bb_dump_file: a pathto_File, the sorted dump file to write to
            isMultiUser: a boolean, True for PostgreSQL (multi-user) case databases
            pgSettings: a PGSettings object with the PostgreSQL connection settings (multi-user only)
            id_obj_path_table: a map from object ids to paths used to normalize TSK_PATH_ID values, or -1 if unavailable
        """
unsorted_dump = TskDbDiff._get_tmp_file("dump_data", ".txt")
if isMultiUser:
conn, unused_db = db_connect(db_file, isMultiUser, pgSettings)
artifact_cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
else: # Use Sqlite
conn = sqlite3.connect(db_file)
conn.text_factory = lambda x: x.decode("utf-8", "ignore")
conn.row_factory = sqlite3.Row
artifact_cursor = conn.cursor()
# Get the list of all artifacts (along with type and associated file)
# @@@ Could add a SORT by parent_path in here since that is how we are going to later sort it.
artifact_cursor.execute("SELECT tsk_files.parent_path, tsk_files.name, blackboard_artifact_types.display_name, blackboard_artifacts.artifact_id FROM blackboard_artifact_types INNER JOIN blackboard_artifacts ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id INNER JOIN tsk_files ON tsk_files.obj_id = blackboard_artifacts.obj_id")
database_log = codecs.open(unsorted_dump, "wb", "utf_8")
row = artifact_cursor.fetchone()
appnd = False
counter = 0
artifact_count = 0
artifact_fail = 0
# Cycle through artifacts
try:
while (row != None):
# File Name and artifact type
# Remove parent object ID from Unalloc file name
normalizedName = re.sub('^Unalloc_[0-9]+_', 'Unalloc_', row["name"])
if(row["parent_path"] != None):
database_log.write(row["parent_path"] + normalizedName + ' ')
else:
database_log.write(normalizedName + ' ')
if isMultiUser:
attribute_cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
else:
attribute_cursor = conn.cursor()
looptry = True
artifact_count += 1
try:
art_id = ""
art_id = str(row["artifact_id"])
# Get attributes for this artifact
if isMultiUser:
attribute_cursor.execute("SELECT blackboard_attributes.source, blackboard_attributes.attribute_type_id, blackboard_attribute_types.display_name, blackboard_attributes.value_type, blackboard_attributes.value_text, blackboard_attributes.value_int32, blackboard_attributes.value_int64, blackboard_attributes.value_double FROM blackboard_attributes INNER JOIN blackboard_attribute_types ON blackboard_attributes.attribute_type_id = blackboard_attribute_types.attribute_type_id WHERE artifact_id = %s ORDER BY blackboard_attributes.source, blackboard_attribute_types.display_name, blackboard_attributes.value_type, blackboard_attributes.value_text, blackboard_attributes.value_int32, blackboard_attributes.value_int64, blackboard_attributes.value_double", [art_id])
else:
attribute_cursor.execute("SELECT blackboard_attributes.source, blackboard_attributes.attribute_type_id, blackboard_attribute_types.display_name, blackboard_attributes.value_type, blackboard_attributes.value_text, blackboard_attributes.value_int32, blackboard_attributes.value_int64, blackboard_attributes.value_double FROM blackboard_attributes INNER JOIN blackboard_attribute_types ON blackboard_attributes.attribute_type_id = blackboard_attribute_types.attribute_type_id WHERE artifact_id =? ORDER BY blackboard_attributes.source, blackboard_attribute_types.display_name, blackboard_attributes.value_type, blackboard_attributes.value_text, blackboard_attributes.value_int32, blackboard_attributes.value_int64, blackboard_attributes.value_double", [art_id])
attributes = attribute_cursor.fetchall()
# Print attributes
if (len(attributes) == 0):
# @@@@ This should be
database_log.write(' \n')
row = artifact_cursor.fetchone()
continue
src = attributes[0][0]
for attr in attributes:
numvals = 0
for x in range(3, 6):
if(attr[x] != None):
numvals += 1
if(numvals > 1):
msg = "There were too many values for attribute type: " + attr["display_name"] + " for artifact with id #" + str(row["artifact_id"]) + ".\n"
if(not attr["source"] == src):
msg = "There were inconsistent sources for artifact with id #" + str(row["artifact_id"]) + ".\n"
try:
if attr["value_type"] == 0:
attr_value_as_string = str(attr["value_text"])
elif attr["value_type"] == 1:
attr_value_as_string = str(attr["value_int32"])
elif attr["value_type"] == 2:
attr_value_as_string = str(attr["value_int64"])
if attr["attribute_type_id"] == 36 and id_obj_path_table != -1 and int(attr_value_as_string) > 0: #normalize positive TSK_PATH_IDs from being object id to a path if the obj_id_path_table was generated
attr_value_as_string = id_obj_path_table[int(attr_value_as_string)]
elif attr["value_type"] == 3:
attr_value_as_string = "%20.10f" % float((attr["value_double"])) #use exact format from db schema to avoid python auto format double value to (0E-10) scientific style
elif attr["value_type"] == 4:
attr_value_as_string = "bytes"
elif attr["value_type"] == 5:
attr_value_as_string = str(attr["value_int64"])
if attr["display_name"] == "Associated Artifact":
attr_value_as_string = getAssociatedArtifactType(attribute_cursor, attr_value_as_string, isMultiUser)
patrn = re.compile("[\n\0\a\b\r\f]")
attr_value_as_string = re.sub(patrn, ' ', attr_value_as_string)
if attr["source"] == "Keyword Search" and attr["display_name"] == "Keyword Preview":
attr_value_as_string = ""
                            # Record the attribute on this artifact's line; the exact format only needs to be
                            # deterministic and consistent between the gold and output dumps.
                            database_log.write('<attr source="' + str(attr["source"]) + '" type="' + attr["display_name"] + '" value="' + attr_value_as_string + '" />')
except IOError as e:
print("IO error")
raise TskDbDiffException("Unexpected IO error while writing to database log." + str(e))
except sqlite3.Error as e:
msg = "Attributes in artifact id (in output DB)# " + str(row["artifact_id"]) + " encountered an error: " + str(e) +" .\n"
print("Attributes in artifact id (in output DB)# ", str(row["artifact_id"]), " encountered an error: ", str(e))
print()
looptry = False
artifact_fail += 1
database_log.write('Error Extracting Attributes')
database_log.close()
raise TskDbDiffException(msg)
finally:
attribute_cursor.close()
# @@@@ This should be
database_log.write(' \n')
row = artifact_cursor.fetchone()
if(artifact_fail > 0):
msg ="There were " + str(artifact_count) + " artifacts and " + str(artifact_fail) + " threw an exception while loading.\n"
except Exception as e:
raise TskDbDiffException("Unexpected error while dumping blackboard database: " + str(e))
finally:
database_log.close()
artifact_cursor.close()
conn.close()
# Now sort the file
srtcmdlst = ["sort", unsorted_dump, "-o", bb_dump_file]
subprocess.call(srtcmdlst)
    @staticmethod
    def _dump_output_db_nonbb(db_file, dump_file, isMultiUser, pgSettings):
        """Dumps a database to a text file.

        Does not dump the artifacts and attributes.

        Args:
            db_file: a pathto_File, the database file to dump
            dump_file: a pathto_File, the location to dump the non-blackboard database items
            isMultiUser: a boolean, True for PostgreSQL (multi-user) case databases
            pgSettings: a PGSettings object with the PostgreSQL connection settings (multi-user only)

        Returns:
            a map from object ids to paths, used later to normalize TSK_PATH_ID attribute values
        """
conn, backup_db_file = db_connect(db_file, isMultiUser, pgSettings)
id_files_table = build_id_files_table(conn.cursor(), isMultiUser)
id_vs_parts_table = build_id_vs_parts_table(conn.cursor(), isMultiUser)
id_vs_info_table = build_id_vs_info_table(conn.cursor(), isMultiUser)
id_fs_info_table = build_id_fs_info_table(conn.cursor(), isMultiUser)
id_objects_table = build_id_objects_table(conn.cursor(), isMultiUser)
id_artifact_types_table = build_id_artifact_types_table(conn.cursor(), isMultiUser)
id_legacy_artifact_types = build_id_legacy_artifact_types_table(conn.cursor(), isMultiUser)
id_reports_table = build_id_reports_table(conn.cursor(), isMultiUser)
id_images_table = build_id_image_names_table(conn.cursor(), isMultiUser)
id_accounts_table = build_id_accounts_table(conn.cursor(), isMultiUser)
id_obj_path_table = build_id_obj_path_table(id_files_table, id_objects_table, id_artifact_types_table, id_reports_table, id_images_table, id_accounts_table)
if isMultiUser: # Use PostgreSQL
os.environ['PGPASSWORD']=pgSettings.password
            pgDump = ["pg_dump", "--inserts", "-U", pgSettings.username, "-h", pgSettings.pgHost, "-p", str(pgSettings.pgPort), "-d", db_file, "-E", "utf-8", "-T", "blackboard_artifacts", "-T", "blackboard_attributes", "-f", "postgreSQLDump.sql"]
subprocess.call(pgDump)
postgreSQL_db = codecs.open("postgreSQLDump.sql", "r", "utf-8")
# Write to the database dump
with codecs.open(dump_file, "wb", "utf_8") as db_log:
dump_line = ''
for line in postgreSQL_db:
line = line.strip('\r\n ')
# Deal with pg_dump result file
if (line.startswith('--') or line.lower().startswith('alter') or "pg_catalog" in line or "idle_in_transaction_session_timeout" in line or not line): # It's comment or alter statement or catalog entry or set idle entry or empty line
continue
elif not line.endswith(';'): # Statement not finished
dump_line += line
continue
else:
dump_line += line
if 'INSERT INTO image_gallery_groups_seen' in dump_line:
dump_line = ''
continue;
dump_line = normalize_db_entry(dump_line, id_obj_path_table, id_vs_parts_table, id_vs_info_table, id_fs_info_table, id_objects_table, id_reports_table, id_images_table, id_legacy_artifact_types, id_accounts_table)
db_log.write('%s\n' % dump_line)
dump_line = ''
postgreSQL_db.close()
else: # use Sqlite
# Delete the blackboard tables
conn.text_factory = lambda x: x.decode("utf-8", "ignore")
conn.execute("DROP TABLE blackboard_artifacts")
conn.execute("DROP TABLE blackboard_attributes")
# Write to the database dump
with codecs.open(dump_file, "wb", "utf_8") as db_log:
for line in conn.iterdump():
if 'INSERT INTO "image_gallery_groups_seen"' in line:
continue
line = normalize_db_entry(line, id_obj_path_table, id_vs_parts_table, id_vs_info_table, id_fs_info_table, id_objects_table, id_reports_table, id_images_table, id_legacy_artifact_types, id_accounts_table)
db_log.write('%s\n' % line)
# Now sort the file
srtcmdlst = ["sort", dump_file, "-o", dump_file]
subprocess.call(srtcmdlst)
conn.close()
# cleanup the backup
if backup_db_file:
os.remove(backup_db_file)
return id_obj_path_table
    @staticmethod
    def dump_output_db(db_file, dump_file, bb_dump_file, isMultiUser, pgSettings):
        """Dumps the given database to text files for later comparison.

        Args:
            db_file: a pathto_File, the database file to dump
            dump_file: a pathto_File, the location to dump the non-blackboard database items
            bb_dump_file: a pathto_File, the location to dump the blackboard database items
            isMultiUser: a boolean, True for PostgreSQL (multi-user) case databases
            pgSettings: a PGSettings object with the PostgreSQL connection settings (multi-user only)
        """
id_obj_path_table = TskDbDiff._dump_output_db_nonbb(db_file, dump_file, isMultiUser, pgSettings)
TskDbDiff._dump_output_db_bb(db_file, bb_dump_file, isMultiUser, pgSettings, id_obj_path_table)
    @staticmethod
    def _get_tmp_file(base, ext):
        time = datetime.datetime.now().time().strftime("%H%M%f")
        # Prefer the TMP environment variable as before, but fall back to the platform temp dir if it is unset
        return os.path.join(os.environ.get('TMP', tempfile.gettempdir()), base + time + ext)
class TskDbDiffException(Exception):
pass
class PGSettings(object):
def __init__(self, pgHost=None, pgPort=5432, user=None, password=None):
self.pgHost = pgHost
self.pgPort = pgPort
self.username = user
self.password = password
def get_pgHost(self):
return self.pgHost
def get_pgPort(self):
return self.pgPort
def get_username(self):
return self.username
def get_password(self):
return self.password
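
# An illustrative multi-user sketch: for PostgreSQL cases the databases are referenced by name rather
# than file path, and the connection settings are wrapped in a PGSettings object (the host and
# credentials below are placeholders, not real values):
#
#   pg = PGSettings(pgHost="localhost", pgPort="5432", user="postgres", password="...")
#   diff = TskDbDiff("output_case_db_name", "gold_case_db_name", output_dir="diff_out",
#                    isMultiUser=True, pgSettings=pg)
#   dump_passed, bb_dump_passed = diff.run_diff()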
def get_sqlite_table_columns(conn) -> Dict[str, List[str]]:
"""
Retrieves the sqlite public tables and columns from a sqlite connection.
Args:
conn: The sqlite connection.
    Returns: A mapping of each table name to the list of its column names, in ordinal (column) order.
"""
cur = conn.cursor()
cur.execute("SELECT name FROM sqlite_master tables WHERE tables.type='table'")
tables = list([table[0] for table in cur.fetchall()])
cur.close()
to_ret = {}
for table in tables:
cur = conn.cursor()
cur.execute('SELECT name FROM pragma_table_info(?) ORDER BY cid', [table])
to_ret[table] = list([col[0] for col in cur.fetchall()])
cur.close()
return to_ret
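
# For example (hypothetical database path), a case database schema could be printed with:
#
#   conn = sqlite3.connect("autopsy.db")
#   for table, cols in get_sqlite_table_columns(conn).items():
#       print(table, cols)
#   conn.close()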
def get_pg_table_columns(conn) -> Dict[str, List[str]]:
"""
Retrieves the postgres public tables and columns from a pg connection.
Args:
conn: The pg connection.
    Returns: A mapping of each table name to the list of its column names, in ordinal (column) order.
"""
cursor = conn.cursor()
cursor.execute("""
SELECT cols.table_name, cols.column_name
FROM information_schema.columns cols
WHERE cols.column_name IS NOT NULL
AND cols.table_name IS NOT NULL
AND cols.table_name IN (
SELECT tables.tablename FROM pg_catalog.pg_tables tables
WHERE LOWER(schemaname) = 'public'
)
ORDER by cols.table_name, cols.ordinal_position;
""")
mapping = {}
for row in cursor:
mapping.setdefault(row[0], []).append(row[1])
cursor.close()
return mapping
def normalize_db_entry(line, files_table, vs_parts_table, vs_info_table, fs_info_table, objects_table, reports_table, images_table, artifact_table, accounts_table):
""" Make testing more consistent and reasonable by doctoring certain db entries.
Args:
line: a String, the line to remove the object id from.
files_table: a map from object ids to file paths.
"""
    # SQLite INSERT statements quote the table name with double quotes, PostgreSQL's do not. We check both forms when normalizing.
files_index = line.find('INSERT INTO "tsk_files"') > -1 or line.find('INSERT INTO tsk_files ') > -1
path_index = line.find('INSERT INTO "tsk_files_path"') > -1 or line.find('INSERT INTO tsk_files_path ') > -1
object_index = line.find('INSERT INTO "tsk_objects"') > -1 or line.find('INSERT INTO tsk_objects ') > -1
vs_parts_index = line.find('INSERT INTO "tsk_vs_parts"') > -1 or line.find('INSERT INTO tsk_vs_parts ') > -1
report_index = line.find('INSERT INTO "reports"') > -1 or line.find('INSERT INTO reports ') > -1
layout_index = line.find('INSERT INTO "tsk_file_layout"') > -1 or line.find('INSERT INTO tsk_file_layout ') > -1
data_source_info_index = line.find('INSERT INTO "data_source_info"') > -1 or line.find('INSERT INTO data_source_info ') > -1
event_description_index = line.find('INSERT INTO "tsk_event_descriptions"') > -1 or line.find('INSERT INTO tsk_event_descriptions ') > -1
events_index = line.find('INSERT INTO "tsk_events"') > -1 or line.find('INSERT INTO tsk_events ') > -1
ingest_job_index = line.find('INSERT INTO "ingest_jobs"') > -1 or line.find('INSERT INTO ingest_jobs ') > -1
examiners_index = line.find('INSERT INTO "tsk_examiners"') > -1 or line.find('INSERT INTO tsk_examiners ') > -1
ig_groups_index = line.find('INSERT INTO "image_gallery_groups"') > -1 or line.find('INSERT INTO image_gallery_groups ') > -1
ig_groups_seen_index = line.find('INSERT INTO "image_gallery_groups_seen"') > -1 or line.find('INSERT INTO image_gallery_groups_seen ') > -1
os_account_index = line.find('INSERT INTO "tsk_os_accounts"') > -1 or line.find('INSERT INTO tsk_os_accounts') > -1
os_account_attr_index = line.find('INSERT INTO "tsk_os_account_attributes"') > -1 or line.find('INSERT INTO tsk_os_account_attributes') > -1
os_account_instances_index = line.find('INSERT INTO "tsk_os_account_instances"') > -1 or line.find('INSERT INTO tsk_os_account_instances') > -1
data_artifacts_index = line.find('INSERT INTO "tsk_data_artifacts"') > -1 or line.find('INSERT INTO tsk_data_artifacts') > -1
parens = line[line.find('(') + 1 : line.rfind(')')]
no_space_parens = parens.replace(" ", "")
fields_list = list(csv.reader([no_space_parens], quotechar="'"))[0]
#Add back in the quotechar for values that were originally wrapped (csv reader consumes this character)
fields_list_with_quotes = []
ptr = 0
for field in fields_list:
if(len(field) == 0):
field = "'" + field + "'"
else:
start = no_space_parens.find(field, ptr)
if((start - 1) >= 0 and no_space_parens[start - 1] == '\''):
if((start + len(field)) < len(no_space_parens) and no_space_parens[start + len(field)] == '\''):
field = "'" + field + "'"
fields_list_with_quotes.append(field)
if(ptr > 0):
#Add one for each comma that is used to separate values in the original string
ptr+=1
ptr += len(field)
fields_list = fields_list_with_quotes
# remove object ID
if files_index:
# Ignore TIFF size and hash if extracted from PDFs.
# See JIRA-6951 for more details.
# index -3 = 3rd from the end, which is extension
# index -5 = 5th from the end, which is the parent path.
if fields_list[-3] == "'tif'" and fields_list[-5].endswith(".pdf/'"):
fields_list[15] = "'SIZE_IGNORED'"
fields_list[23] = "'MD5_IGNORED'"
fields_list[24] = "'SHA256_IGNORED'"
newLine = ('INSERT INTO "tsk_files" VALUES(' + ', '.join(fields_list[1:-1]) + ');') #leave off first (object id) and last (os_account_id) field
# Remove object ID from Unalloc file name
newLine = re.sub('Unalloc_[0-9]+_', 'Unalloc_', newLine)
return newLine
# remove object ID
elif vs_parts_index:
newLine = ('INSERT INTO "tsk_vs_parts" VALUES(' + ', '.join(fields_list[1:]) + ');')
return newLine
# remove group ID
elif ig_groups_index:
newLine = ('INSERT INTO "image_gallery_groups" VALUES(' + ', '.join(fields_list[1:]) + ');')
return newLine
#remove id field
elif ig_groups_seen_index:
# Only removing the id and group_id fields for now. May need to care about examiner_id and seen fields in future.
newLine = ('INSERT INTO "image_gallery_groups_seen" VALUES(' + ', '.join(fields_list[2:]) + ');')
return newLine
# remove object ID
elif path_index:
obj_id = int(fields_list[0])
objValue = files_table[obj_id]
# remove the obj_id from ModuleOutput/EmbeddedFileExtractor directory
        extractor_idx = fields_list[1].find('EmbeddedFileExtractor')
        if extractor_idx > -1:
            idx_pre = extractor_idx + len('EmbeddedFileExtractor')
            idx_pos = fields_list[1].find('\\', idx_pre + 2)
            dir_to_replace = fields_list[1][idx_pre + 1 : idx_pos] # +1 to skip the file separator
            dir_to_replace = dir_to_replace[0:dir_to_replace.rfind('_')]
            pathValue = fields_list[1][:idx_pre + 1] + dir_to_replace + fields_list[1][idx_pos:]
        else:
            pathValue = fields_list[1]
        # remove the host name prefix (multi-user/PostgreSQL cases) from the path
        module_output_idx = pathValue.find('ModuleOutput')
        if module_output_idx > -1:
            pathValue = "'" + pathValue[module_output_idx:] # postgres paths include the hostname before ModuleOutput
newLine = ('INSERT INTO "tsk_files_path" VALUES(' + objValue + ', ' + pathValue + ', ' + ', '.join(fields_list[2:]) + ');')
return newLine
# remove object ID
elif layout_index:
obj_id = fields_list[0]
path= files_table[int(obj_id)]
newLine = ('INSERT INTO "tsk_file_layout" VALUES(' + path + ', ' + ', '.join(fields_list[1:]) + ');')
# Remove object ID from Unalloc file name
newLine = re.sub('Unalloc_[0-9]+_', 'Unalloc_', newLine)
return newLine
# remove object ID
elif object_index:
obj_id = fields_list[0]
parent_id = fields_list[1]
newLine = 'INSERT INTO "tsk_objects" VALUES('
path = None
parent_path = None
        # if obj_id or parent_id is an invalid literal, simply return the line as is
try:
obj_id = int(obj_id)
if parent_id != 'NULL':
parent_id = int(parent_id)
except Exception as e:
print(obj_id, parent_id)
return line
if obj_id in files_table.keys():
path = files_table[obj_id]
elif obj_id in vs_parts_table.keys():
path = vs_parts_table[obj_id]
elif obj_id in vs_info_table.keys():
path = vs_info_table[obj_id]
elif obj_id in fs_info_table.keys():
path = fs_info_table[obj_id]
elif obj_id in reports_table.keys():
path = reports_table[obj_id]
# remove host name (for multi-user) and dates/times from path for reports
if path is not None:
if 'ModuleOutput' in path:
# skip past the host name (if any)
path = path[path.find('ModuleOutput'):]
if 'BulkExtractor' in path or 'Smirk' in path:
# chop off the last folder (which contains a date/time)
path = path[:path.rfind('\\')]
if 'Reports\\AutopsyTestCase HTML Report' in path:
path = 'Reports\\AutopsyTestCase HTML Report'
if parent_id in files_table.keys():
parent_path = files_table[parent_id]
elif parent_id in vs_parts_table.keys():
parent_path = vs_parts_table[parent_id]
elif parent_id in vs_info_table.keys():
parent_path = vs_info_table[parent_id]
elif parent_id in fs_info_table.keys():
parent_path = fs_info_table[parent_id]
elif parent_id in images_table.keys():
parent_path = images_table[parent_id]
elif parent_id in accounts_table.keys():
parent_path = accounts_table[parent_id]
elif parent_id == 'NULL':
parent_path = "NULL"
# Remove host name (for multi-user) from parent_path
if parent_path is not None:
if 'ModuleOutput' in parent_path:
# skip past the host name (if any)
parent_path = parent_path[parent_path.find('ModuleOutput'):]
if path and parent_path:
# Remove object ID from Unalloc file names and regripper output
            path = re.sub('Unalloc_[0-9]+_', 'Unalloc_', path)
            path = re.sub(r'regripper-[0-9]+-full', 'regripper-full', path)
            parent_path = re.sub('Unalloc_[0-9]+_', 'Unalloc_', parent_path)
            parent_path = re.sub(r'regripper-[0-9]+-full', 'regripper-full', parent_path)
return newLine + path + ', ' + parent_path + ', ' + ', '.join(fields_list[2:]) + ');'
else:
return newLine + '"OBJECT IDS OMITTED", ' + ', '.join(fields_list[2:]) + ');' #omit parent object id and object id when we cant annonymize them
# remove time-based information, ie Test_6/11/14 -> Test
elif report_index:
fields_list[1] = "AutopsyTestCase"
fields_list[2] = "0"
newLine = ('INSERT INTO "reports" VALUES(' + ','.join(fields_list[1:]) + ');') # remove report_id
return newLine
elif data_source_info_index:
fields_list[1] = "{device id}"
fields_list[4] = "{dateTime}"
newLine = ('INSERT INTO "data_source_info" VALUES(' + ','.join(fields_list) + ');')
return newLine
elif ingest_job_index:
fields_list[2] = "{host_name}"
start_time = int(fields_list[3])
end_time = int(fields_list[4])
if (start_time <= end_time):
fields_list[3] = "0"
fields_list[4] = "0"
newLine = ('INSERT INTO "ingest_jobs" VALUES(' + ','.join(fields_list) + ');')
return newLine
elif examiners_index:
fields_list[1] = "{examiner_name}"
newLine = ('INSERT INTO "tsk_examiners" VALUES(' + ','.join(fields_list) + ');')
return newLine
# remove all timing dependent columns from events table
elif events_index:
newLine = ('INSERT INTO "tsk_events" VALUES(' + ','.join(fields_list[1:2]) + ');')
return newLine
# remove object ids from event description table
elif event_description_index:
# replace object ids with information that is deterministic
file_obj_id = int(fields_list[5])
object_id = int(fields_list[4])
legacy_artifact_id = 'NULL'
if (fields_list[6] != 'NULL'):
legacy_artifact_id = int(fields_list[6])
if file_obj_id != 'NULL' and file_obj_id in files_table.keys():
fields_list[5] = files_table[file_obj_id]
if object_id != 'NULL' and object_id in files_table.keys():
fields_list[4] = files_table[object_id]
if legacy_artifact_id != 'NULL' and legacy_artifact_id in artifact_table.keys():
fields_list[6] = artifact_table[legacy_artifact_id]
if fields_list[1] == fields_list[2] and fields_list[1] == fields_list[3]:
fields_list[1] = cleanupEventDescription(fields_list[1])
fields_list[2] = cleanupEventDescription(fields_list[2])
fields_list[3] = cleanupEventDescription(fields_list[3])
newLine = ('INSERT INTO "tsk_event_descriptions" VALUES(' + ','.join(fields_list[1:]) + ');') # remove report_id
return newLine
elif os_account_index:
newLine = ('INSERT INTO "tsk_os_accounts" VALUES(' + ','.join(fields_list[1:]) + ');') # remove id since value that would be substituted is in diff line already
return newLine
elif os_account_attr_index:
        # substitute the account object id with a non-changing value
os_account_id = int(fields_list[1])
fields_list[1] = accounts_table[os_account_id]
        # substitute the source object id with a non-changing value
source_obj_id = int(fields_list[3])
if source_obj_id in files_table.keys():
fields_list[3] = files_table[source_obj_id]
elif source_obj_id in vs_parts_table.keys():
fields_list[3] = vs_parts_table[source_obj_id]
elif source_obj_id in vs_info_table.keys():
fields_list[3] = vs_info_table[source_obj_id]
elif source_obj_id in fs_info_table.keys():
fields_list[3] = fs_info_table[source_obj_id]
elif source_obj_id in images_table.keys():
fields_list[3] = images_table[source_obj_id]
elif source_obj_id in accounts_table.keys():
fields_list[3] = accounts_table[source_obj_id]
elif source_obj_id == 'NULL':
fields_list[3] = "NULL"
newLine = ('INSERT INTO "tsk_os_account_attributes" VALUES(' + ','.join(fields_list[1:]) + ');') # remove id
return newLine
elif os_account_instances_index:
os_account_id = int(fields_list[1])
fields_list[1] = accounts_table[os_account_id]
newLine = ('INSERT INTO "tsk_os_account_instances" VALUES(' + ','.join(fields_list[1:]) + ');') # remove id
return newLine
elif data_artifacts_index:
art_obj_id = int(fields_list[0])
if art_obj_id in files_table.keys():
fields_list[0] = files_table[art_obj_id]
else:
fields_list[0] = 'Artifact Object ID Omitted'
account_obj_id = int(fields_list[1])
if account_obj_id in files_table.keys():
fields_list[1] = files_table[account_obj_id]
else:
fields_list[1] = 'Account Object ID Omitted'
newLine = ('INSERT INTO "tsk_data_artifacts" VALUES(' + ','.join(fields_list[:]) + ');') # remove ids
return newLine
else:
return line
def cleanupEventDescription(description):
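    """Strip the trailing numeric id from an event description of the form '<text>:<number>' so that
    run-specific ids (e.g. a hypothetical "'Shell Bags:123'" becoming "'Shell Bags:'") do not show up in the diff."""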
    test = re.search(r"^'\D+:\d+'$", description)
    if test is not None:
        return re.sub(r":\d+", ":", description)
else:
return description
def getAssociatedArtifactType(cur, artifact_id, isMultiUser):
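    """Look up the file path and artifact type for the given artifact id. Used to replace
    "Associated Artifact" attribute values (which are run-specific ids) with stable text."""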
if isMultiUser:
cur.execute("SELECT tsk_files.parent_path, blackboard_artifact_types.display_name FROM blackboard_artifact_types INNER JOIN blackboard_artifacts ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id INNER JOIN tsk_files ON tsk_files.obj_id = blackboard_artifacts.obj_id WHERE artifact_id=%s",[artifact_id])
else:
cur.execute("SELECT tsk_files.parent_path, blackboard_artifact_types.display_name FROM blackboard_artifact_types INNER JOIN blackboard_artifacts ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id INNER JOIN tsk_files ON tsk_files.obj_id = blackboard_artifacts.obj_id WHERE artifact_id=?",[artifact_id])
info = cur.fetchone()
return "File path: " + info[0] + " Artifact Type: " + info[1]
def build_id_files_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to file paths.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id, parent path, and name, then create a tuple in the dictionary
# with the object id as the key and the full file path (parent + name) as the value
mapping = dict([(row[0], str(row[1]) + str(row[2])) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, parent_path, name FROM tsk_files")])
return mapping
def build_id_vs_parts_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to vs_parts.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id, addr, and start, then create a tuple in the dictionary
# with the object id as the key and (addr + start) as the value
mapping = dict([(row[0], str(row[1]) + '_' + str(row[2])) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, addr, start FROM tsk_vs_parts")])
return mapping
def build_id_vs_info_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to vs_info.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id, vs_type, and img_offset, then create a tuple in the dictionary
# with the object id as the key and (vs_type + img_offset) as the value
mapping = dict([(row[0], str(row[1]) + '_' + str(row[2])) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, vs_type, img_offset FROM tsk_vs_info")])
return mapping
def build_id_fs_info_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to fs_info.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id, img_offset, and fs_type, then create a tuple in the dictionary
# with the object id as the key and (img_offset + fs_type) as the value
mapping = dict([(row[0], str(row[1]) + '_' + str(row[2])) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, img_offset, fs_type FROM tsk_fs_info")])
return mapping
def build_id_objects_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to par_id.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id, par_obj_id, then create a tuple in the dictionary
# with the object id as the key and par_obj_id, type as the value
mapping = dict([(row[0], [row[1], row[2]]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT * FROM tsk_objects")])
return mapping
def build_id_image_names_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to name.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id and name then create a tuple in the dictionary
# with the object id as the key and name, type as the value
mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, name FROM tsk_image_names WHERE sequence=0")])
#data_sources which are logical file sets will be found in the files table
return mapping
def build_id_artifact_types_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to artifact ids.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id, par_obj_id, then create a tuple in the dictionary
# with the object id as the key and artifact type as the value
mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT blackboard_artifacts.artifact_obj_id, blackboard_artifact_types.type_name FROM blackboard_artifacts INNER JOIN blackboard_artifact_types ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id ")])
return mapping
def build_id_legacy_artifact_types_table(db_cursor, isPostgreSQL):
"""Build the map of legacy artifact ids to artifact type.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the legacy artifact id then create a tuple in the dictionary
# with the artifact id as the key and artifact type as the value
mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT blackboard_artifacts.artifact_id, blackboard_artifact_types.type_name FROM blackboard_artifacts INNER JOIN blackboard_artifact_types ON blackboard_artifact_types.artifact_type_id = blackboard_artifacts.artifact_type_id ")])
return mapping
def build_id_reports_table(db_cursor, isPostgreSQL):
"""Build the map of report object ids to report path.
Args:
db_cursor: the database cursor
"""
# for each row in the reports table in the db, create an obj_id -> path map
mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT obj_id, path FROM reports")])
return mapping
def build_id_accounts_table(db_cursor, isPostgreSQL):
"""Build the map of object ids to OS account SIDs.
Args:
db_cursor: the database cursor
"""
# for each row in the db, take the object id and account SID then creates a tuple in the dictionary
# with the object id as the key and the OS Account's SID as the value
mapping = dict([(row[0], row[1]) for row in sql_select_execute(db_cursor, isPostgreSQL, "SELECT os_account_obj_id, addr FROM tsk_os_accounts")])
return mapping
def build_id_obj_path_table(files_table, objects_table, artifacts_table, reports_table, images_table, accounts_table):
"""Build the map of object ids to artifact ids.
Args:
files_table: obj_id, path
objects_table: obj_id, par_obj_id, type
artifacts_table: obj_id, artifact_type_name
reports_table: obj_id, path
images_table: obj_id, name
accounts_table: obj_id, addr
"""
# make a copy of files_table and update it with new data from artifacts_table and reports_table
mapping = files_table.copy()
for k, v in objects_table.items():
path = ""
if k not in mapping.keys(): # If the mapping table doesn't have data for obj_id
if k in reports_table.keys(): # For a report we use the report path
par_obj_id = v[0]
if par_obj_id is not None:
mapping[k] = reports_table[k]
            elif k in artifacts_table.keys(): # For an artifact we use its par_obj_id's path+name plus its artifact type name
par_obj_id = v[0] # The parent of an artifact can be a file or a report
if par_obj_id in mapping.keys():
path = mapping[par_obj_id]
elif par_obj_id in reports_table.keys():
path = reports_table[par_obj_id]
elif par_obj_id in images_table.keys():
path = images_table[par_obj_id]
mapping[k] = path + "/" + artifacts_table[k]
elif k in accounts_table.keys(): # For an OS Account object ID we use its addr field which is the account SID
mapping[k] = accounts_table[k]
        elif v[0] not in mapping.keys():
            if v[0] in artifacts_table.keys(): # The parent is an artifact: use the grandparent's path plus the parent's artifact type name
                par_obj_id = objects_table[v[0]][0]
                path = mapping[par_obj_id]
                mapping[k] = path + "/" + artifacts_table[v[0]]
return mapping
def db_connect(db_file, isMultiUser, pgSettings=None):
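    """Open a connection to the case database.

    Returns a (connection, backup_db_file) pair. For SQLite, the connection is to a temporary copy of
    the database and backup_db_file is the path of that copy (the caller is expected to delete it);
    for PostgreSQL, backup_db_file is None.
    """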
if isMultiUser: # use PostgreSQL
try:
return psycopg2.connect("dbname=" + db_file + " user=" + pgSettings.username + " host=" + pgSettings.pgHost + " password=" + pgSettings.password), None
        except Exception as e:
            print("Failed to connect to the database: " + db_file)
            raise TskDbDiffException("Failed to connect to the database: " + db_file + " (" + str(e) + ")")
else: # Sqlite
# Make a copy that we can modify
backup_db_file = TskDbDiff._get_tmp_file("tsk_backup_db", ".db")
shutil.copy(db_file, backup_db_file)
# We sometimes get situations with messed up permissions
os.chmod (backup_db_file, 0o777)
return sqlite3.connect(backup_db_file), backup_db_file
def sql_select_execute(cursor, isPostgreSQL, sql_stmt):
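    """Execute a SELECT statement and return an iterable of result rows: a fetched list for PostgreSQL,
    or the cursor itself (which iterates over rows) for SQLite."""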
if isPostgreSQL:
cursor.execute(sql_stmt)
return cursor.fetchall()
else:
return cursor.execute(sql_stmt)
def main():
try:
sys.argv.pop(0)
output_db = sys.argv.pop(0)
gold_db = sys.argv.pop(0)
    except IndexError:
print("usage: tskdbdiff [OUTPUT DB PATH] [GOLD DB PATH]")
sys.exit(1)
db_diff = TskDbDiff(output_db, gold_db, output_dir=".")
dump_passed, bb_dump_passed = db_diff.run_diff()
if dump_passed and bb_dump_passed:
print("Database comparison passed.")
if not dump_passed:
print("Non blackboard database comparison failed.")
if not bb_dump_passed:
print("Blackboard database comparison failed.")
    sys.exit(0 if (dump_passed and bb_dump_passed) else 1)
if __name__ == "__main__":
if sys.hexversion < 0x03000000:
print("Python 3 required")
sys.exit(1)
main()