autopsy-flatpak/Testing/script/regression.py

#!/usr/bin/python
#en_US.UTF-8
import sys
import sqlite3
import re
import subprocess
import os.path
import shutil
import time
import xml
#import thread
#import wmi
# import win32gui
from xml.dom.minidom import parse, parseString


#  Last modified 7/25/12 @2:30pm
#  Usage: ./regression.py [-f FILE] OR [-l CONFIG]  [OPTIONS]
#  Run the RegressionTest.java file, and compare the result with a gold standard
#  When the -f flag is set, this script only tests the image given by FILE.
#  An indexed NSRL database is expected at ./input/nsrl.txt-md5.idx,
#  and an indexed notable hash database at ./input/notablehashes.txt-md5.idx
#  In addition, any keywords to search for must be in ./input/notablekeywords.xml
#  When the -l flag is set, the script looks for a config.xml file of the given name
#  where images are stored. The above input files can be outsourced to different locations
#  from the config.xml. For usage notes please see the example "config.xml" in
#  the /script folder.
#    Options:
#    -r, --rebuild      Rebuild the gold standards from the test results for each image
#    -i, --ignore       Ignores the ./input directory when searching for files
#    -u, --unallocated  Ignores unallocated space when ingesting. Faster, but less accurate results.
#    -d, --delete       Disables the deletion of Solr indexing directory generated by Ingest. Uses more disk space.
#    -v, --verbose      Prints logged warnings after each ingest
#    -e, --exception    When followed by a string, will only print out the exceptions that occurred that contain the string. Case sensitive.

# ---- Module-level state shared by the functions below ----

# Flipped to True by markError() as soon as any check fails.
hadErrors = False
# Maps the key handed to markError() to the list of error strings recorded for it.
results = dict()
# Gold standards are read from ./gold/{imgname}/standard.db
goldDir = "gold"
# Image files, hash dbs, and keywords.
inDir = "input"
# Results of this run go to ./output/{datetime}/{imgname}/
outDir = os.path.join("output", time.strftime("%Y.%m.%d-%H.%M"))

# Placeholder; testAddImageIngest() rebinds this to the open CommonLog.txt file.
CommonLog = ""


# def AutopsyCrash(image, ignoreUnalloc, list):
    # cwd = wgetcwd()
    # x = 20  #seconds to wait between passes
    # name = imageName(image, ignoreUnalloc, list)
    # TestFolder = os.path.join(cwd, outDir, name, "AutopsyTestCase")
    # y = True #default return of 'Crashed'
    # i = 0 #number of passes to run
    # while(i < 2):
      # print "Sleeping background process for %s seconds" %(str(x))
      # time.sleep(x)
      # if(os.path.exists(TestFolder)):
        # y = False # 'Did not Crash'
        # break
      # else:
        # i+=1
    # if y:
        # print "Autopsy failed to initialize properly, restarting from last image..."
        # c = wmi.WMI()
        # for proc in c.Win32_Process():
            # if proc.name == "NetBeans Platform 7.2":
                # proc.kill()
                # break
        # testAddImageIngest(image, ignoreUnalloc, list)
    # return 1


# Run ingest on all the images in 'input', using notablekeywords.xml and notablehashes.txt-md5.idx
def testAddImageIngest(inFile, ignoreUnalloc, list):
  """Run the ant "regression-test" target against a single image.

  Recreates the output directory for the image, opens CommonLog.txt inside it
  (stored in the module-level CommonLog), assembles the ant command line and
  waits for ant to finish.

  inFile        -- path of the image to ingest
  ignoreUnalloc -- forwarded to ant as -Dignore_unalloc; also part of the case name
  list          -- truthy when running from a config file; the hash/keyword
                   inputs are then taken relative to inDir instead of being
                   prefixed with the working directory
  """
  global CommonLog

  def toBackslashes(path):
    # NEEDS windows path (backslashes) for .E00 images to work
    return path.replace("/", "\\")

  print("================================================")
  print("Ingesting Image: " + inFile)

  # Always start from an empty case directory
  testCaseName = imageName(inFile, ignoreUnalloc, list)
  caseDir = os.path.join(outDir, testCaseName)
  if os.path.exists(caseDir):
    shutil.rmtree(caseDir)
  os.makedirs(caseDir)
  if not os.path.exists(inDir):
    markError("input dir does not exist", inFile)

  cwd = wgetcwd()
  testInFile = wabspath(inFile)

  # Opened only now because the case directory has to exist first
  CommonLog = open(os.path.join(cwd, outDir, testCaseName, "CommonLog.txt"), "w")
  testInFile = toBackslashes(testInFile)

  # Config-file runs reference the inputs through inDir as given
  inputRoot = inDir if list else os.path.join(cwd, inDir)
  knownBadPath = toBackslashes(os.path.join(inputRoot, "notablehashes.txt-md5.idx"))
  keywordPath = toBackslashes(os.path.join(inputRoot, "notablekeywords.xml"))
  nsrlPath = toBackslashes(os.path.join(inputRoot, "nsrl.txt-md5.idx"))

  antlog = toBackslashes(os.path.join(cwd, outDir, testCaseName, "antlog.txt"))

  # Timeout scales with the image size: one second (in ms) per MB,
  # padded by 1.5 and then by a further factor of 25 to be safe
  size = getImageSize(inFile, ignoreUnalloc, list)      # size in bytes
  megabytes = (size / 1000) / 1000
  timeout = megabytes * 1000 * 1.5 * 25

  # set up ant target
  args = [
    "ant",
    "-q",
    "-f",
    os.path.join("..", "build.xml"),
    "regression-test",
    "-l",
    antlog,
    "-Dimg_path=" + testInFile,
    "-Dknown_bad_path=" + knownBadPath,
    "-Dkeyword_path=" + keywordPath,
    "-Dnsrl_path=" + nsrlPath,
    "-Dgold_path=" + toBackslashes(os.path.join(cwd, goldDir)),
    "-Dout_path=" + toBackslashes(os.path.join(cwd, outDir, testCaseName)),
    "-Dignore_unalloc=" + "%s" % ignoreUnalloc,
    "-Dtest.timeout=" + str(timeout),
  ]

  # print the ant testing command
  print("CMD: " + " ".join(args))

  print("Starting test...")
  subprocess.call(args)


def getImageSize(inFile, ignoreUnalloc, list):
  """Return the combined size in bytes of the image's files.

  When list is truthy only inFile itself is measured. Otherwise every other
  file in inDir whose name without extension equals the image's base name
  (for example the remaining segments of a split image) is added as well.

  ignoreUnalloc is accepted for signature compatibility only. The "-u" suffix
  that imageName() appends for it belongs to the case name, not to the file
  names on disk, so it must not take part in the sibling lookup.
  """
  size = os.path.getsize(inFile)
  if not list:
    # Compare against the bare image name; with the "-u" suffix no sibling
    # file would ever match and split images would be under-counted.
    name = imageName(inFile, False, list)
    path = os.path.join(".", inDir)
    for entry in os.listdir(path):
      if os.path.splitext(entry)[0] != name:
        continue
      filepath = os.path.join(path, entry)
      # inFile itself has already been counted above
      if not os.path.samefile(filepath, inFile):
        size += os.path.getsize(filepath)
  return size

def _compareDbCounts(goldC, testC, inFile):
  """Compare artifact, attribute and object row counts on two open cursors.

  Mismatches are written to CommonLog and recorded through markError()
  under inFile; matches are printed.
  """
  CommonLog.write("Comparing Artifacts: \n\r")
  print("Comparing Artifacts: ")

  # Keep range in sync with number of items in ARTIFACT_TYPE enum
  artifactQuery = "select count(*) from blackboard_artifacts where artifact_type_id=?"
  for type_id in range(1, 13):
    goldC.execute(artifactQuery, (type_id,))
    goldArtifacts = goldC.fetchone()[0]
    testC.execute(artifactQuery, (type_id,))
    testArtifacts = testC.fetchone()[0]
    if goldArtifacts != testArtifacts:
      errString = str("Artifact counts do not match for type id %d!: " % type_id)
      errString += str("Gold: %d, Test: %d" % (goldArtifacts, testArtifacts))
      CommonLog.write(errString + "\n\r")
      markError(errString, inFile)
    else:
      CommonLog.write("Artifact counts for artifact type id %d match!" % type_id + "\n\r")
      print("Artifact counts for artifact type id %d match!" % type_id)

  CommonLog.write("Comparing Attributes: \n\r")
  print("Comparing Attributes: ")
  goldC.execute("select count(*) from blackboard_attributes")
  goldAttributes = goldC.fetchone()[0]
  testC.execute("select count(*) from blackboard_attributes")
  testAttributes = testC.fetchone()[0]
  if goldAttributes != testAttributes:
    errString = "Attribute counts do not match!: "
    errString += str("Gold: %d, Test: %d" % (goldAttributes, testAttributes))
    CommonLog.write(errString + "\n\r")
    markError(errString, inFile)
  else:
    print("Attribute counts match!")

  print("Comparing TSK Objects: ")
  goldC.execute("select count(*) from tsk_objects")
  goldObjects = goldC.fetchone()[0]
  testC.execute("select count(*) from tsk_objects")
  testObjects = testC.fetchone()[0]
  if goldObjects != testObjects:
    errString = "TSK Object counts do not match!: "
    errString += str("Gold: %d, Test: %d" % (goldObjects, testObjects))
    CommonLog.write(errString + "\n\r")
    markError(errString, inFile)
  else:
    CommonLog.write("Object counts match!" + "\n\r")
    print("Object counts match!")


def testCompareToGold(inFile, ignoreUnalloc, list):
  """Compare the generated autopsy.db with the image's gold standard.db.

  For now only row counts are compared: blackboard_artifacts (per artifact
  type id), blackboard_attributes and tsk_objects. A missing gold or test
  database is recorded through markError() and the comparison is skipped.

  inFile        -- image path; also the key errors are recorded under
  ignoreUnalloc -- forwarded to imageName() to locate the case directory
  list          -- forwarded to imageName()
  """
  global CommonLog

  print("-----------------------------------------------")
  print("Comparing results for " + inFile + " with gold.")

  name = imageName(inFile, ignoreUnalloc, list)

  goldFile = os.path.join("./", goldDir, name, "standard.db")
  testFile = os.path.join("./", outDir, name, "AutopsyTestCase", "autopsy.db")
  if not os.path.isfile(goldFile):
    markError("No gold standard exists", inFile)
    return
  if not os.path.isfile(testFile):
    markError("No database exists", inFile)
    return

  # Close both connections even if a query fails, so the database files
  # are not left open for the rest of the run.
  goldConn = sqlite3.connect(goldFile)
  try:
    testConn = sqlite3.connect(testFile)
    try:
      _compareDbCounts(goldConn.cursor(), testConn.cursor(), inFile)
    finally:
      testConn.close()
  finally:
    goldConn.close()

def clearGoldDir(inFile, ignoreUnalloc, list):
  """Replace the image's gold directory with a fresh, empty one.

  The directory is <cwd>/<goldDir>/<case name>; an existing one is removed
  together with its contents before being recreated.
  """
  target = os.path.join(wgetcwd(), goldDir, imageName(inFile, ignoreUnalloc, list))
  if os.path.exists(target):
    shutil.rmtree(target)
  os.makedirs(target)
  print("Clearing gold directory: " + target)

def copyTestToGold(inFile, ignoreUnalloc, list):
  """Copy the generated autopsy.db over the image's gold standard.db.

  If the test run produced no database the problem is recorded through
  markError() under the image path (the same message testCompareToGold()
  uses) and nothing is copied.
  """
  print("------------------------------------------------")
  print("Recreating gold standard from results.")
  name = imageName(inFile, ignoreUnalloc, list)
  goldFile = os.path.join("./", goldDir, name, "standard.db")
  testFile = os.path.join("./", outDir, name, "AutopsyTestCase", "autopsy.db")
  if not os.path.isfile(testFile):
    # Report instead of letting shutil.copy abort the whole run
    markError("No database exists", inFile)
    return
  shutil.copy(testFile, goldFile)
  print("Recreated gold standards")

def copyReportToGold(inFile, ignoreUnalloc, list):
  """Copy the generated HTML report to the image's gold report.html.

  A missing Reports directory or a directory without an .html file is
  recorded through markError() under the image path, and nothing is copied.
  """
  print("------------------------------------------------")
  print("Recreating gold report from results.")
  name = imageName(inFile, ignoreUnalloc, list)
  goldReport = os.path.join("./", goldDir, name, "report.html")
  testReportPath = os.path.join("./", outDir, name, "AutopsyTestCase", "Reports")
  # Because Java adds a timestamp to the report file, one can't call it
  # directly, so one must get a list of files in the dir, which are only
  # reports, then filter for the .html report
  testReport = None
  if os.path.isdir(testReportPath):
    for entry in os.listdir(testReportPath):
      if entry.endswith(".html"): # Get the HTML one
        testReport = os.path.join(testReportPath, entry)
  if testReport is None:
    # Keyed by the image path, like every other error for this image
    markError("No test report exists", inFile)
    return
  shutil.copy(testReport, goldReport)
  print("Report copied")

def deleteKeywordFiles(inFile, ignoreUnalloc, list):
    """Remove the KeywordSearch directory from the image's test case.

    Nothing is done when the directory does not exist, so a run that never
    produced one does not abort the remaining images.
    """
    print("------------------------------------------------")
    print("Deleting Keyword Search files")
    name = imageName(inFile, ignoreUnalloc, list)
    keywordDir = os.path.join("./", outDir, name, "AutopsyTestCase", "KeywordSearch")
    if os.path.isdir(keywordDir):
        shutil.rmtree(keywordDir)

def testCompareReports(inFile, ignoreUnalloc, list):
  """Compare the generated HTML report with the image's gold report.html.

  Everything before the first "<ul>" is ignored in both files. A missing
  gold report, a missing test report or any difference in the remaining
  text is recorded through markError() under inFile.
  """
  print("------------------------------------------------")
  print("Comparing report to golden report.")
  name = imageName(inFile, ignoreUnalloc, list)
  goldReport = os.path.join("./", goldDir, name, "report.html")
  testReportPath = os.path.join("./", outDir, name, "AutopsyTestCase", "Reports")
  # Because Java adds a timestamp to the report file, one can't call it
  # directly, so one must get a list of files in the dir, which are only
  # reports, then filter for the .html report
  testReport = None
  if os.path.isdir(testReportPath):
    for entry in os.listdir(testReportPath):
      if entry.endswith(".html"): # Get the HTML one
        testReport = os.path.join(testReportPath, entry)
  if not os.path.isfile(goldReport):
    markError("No gold report exists", inFile)
    return
  if testReport is None:
    markError("No test report exists", inFile)
    return

  def readBody(path):
    # Search for <ul> because it is first seen in the report
    # immediately after the unnecessary metadata, styles, and timestamp
    handle = open(path)
    try:
      content = handle.read()
    finally:
      handle.close()
    start = content.find("<ul>")
    # Without a <ul> keep the whole text; slicing from -1 would leave
    # only the last character.
    if start == -1:
      return content
    return content[start:]

  # Compare the full bodies: a chunk-wise zip() stops at the shorter
  # report and can miss content that exists in only one of them.
  if readBody(goldReport) != readBody(testReport):
    errString = "Reports do not match."
    markError(errString, inFile)
  else:
    print("Reports match.\n\n")

def reportErrors(image, verbose, search):
    """Scan the image's logs for exceptions (and optionally warnings).

    Reads antlog.txt plus autopsy.log.0, messages.log and solr.log.error.0
    from the case's logs directory, writes every matching line to CommonLog
    and returns the tuple (numWarnings, numExceptions).

    image   -- case name, i.e. the directory name below outDir
    verbose -- when truthy and search is None, lines mentioning a warning
               are collected as well
    search  -- when not None, only exception lines containing this string
               are collected and warnings are skipped. Case sensitive.

    A log file that cannot be opened raises the error from open().
    """
    global CommonLog
    caseDir = os.path.join(wgetcwd(), outDir, image)
    logPaths = [
        os.path.join(caseDir, "antlog.txt"),
        os.path.join(caseDir, "logs", "autopsy.log.0"),
        os.path.join(caseDir, "logs", "messages.log"),
        os.path.join(caseDir, "logs", "solr.log.error.0"),
    ]
    exceptions = []
    numWarnings = 0
    numExceptions = 0
    for logPath in logPaths:
        source = os.path.basename(logPath)
        # One handle at a time, closed even if reading fails
        log = open(logPath, "r")
        try:
            for line in log:
                isException = ("exception" in line) or ("EXCEPTION" in line) or ("Exception" in line)
                entry = "From " + source + ":\n\n" + line + "\n\n"
                if search is not None:
                    if isException and search in line:
                        exceptions.append(entry)
                        numExceptions += 1
                else:
                    if verbose:
                        if ("warning" in line) or ("WARNING" in line) or ("Warning" in line):
                            exceptions.append(entry)
                            numWarnings += 1
                    if isException:
                        exceptions.append(entry)
                        numExceptions += 1
        finally:
            log.close()
    if verbose:
        CommonLog.write("Included warnings\n\r")
    if search is not None:
        CommonLog.write("Looked for errors with  \"" + search + "\"\n\r")
    CommonLog.write("Total entries: " + str(len(exceptions)) + "\n\r")
    for error in exceptions:
        CommonLog.write(error + "\n\r")
    return (numWarnings, numExceptions)


def split(input, size):
  """Cut input into consecutive pieces of at most size items and return them as a list.

  Only the final piece may be shorter than size; an empty input gives [].
  """
  pieces = []
  for offset in range(0, len(input), size):
    pieces.append(input[offset:offset + size])
  return pieces

class ImgType:
  # Integer tags for the kinds of image file imageType() distinguishes
  RAW = 0
  ENCASE = 1
  SPLIT = 2
  UNKNOWN = 3

def imageType(inFile):
  """Classify an image by the text after the last '.' in inFile.

  The comparison is case-insensitive. Returns ImgType.RAW for img/dd,
  ImgType.ENCASE for e01, ImgType.SPLIT for aa/001 and ImgType.UNKNOWN for
  anything else, including names without a '.'.
  """
  _, dot, suffix = inFile.rpartition(".")
  if not dot:
    return ImgType.UNKNOWN
  knownSuffixes = {
    "img": ImgType.RAW,
    "dd": ImgType.RAW,
    "e01": ImgType.ENCASE,
    "aa": ImgType.SPLIT,
    "001": ImgType.SPLIT,
  }
  return knownSuffixes.get(suffix.lower(), ImgType.UNKNOWN)

def imageName(inFile, ignoreUnalloc, list):
    """Return the case name for an image: its file name without directory and extension.

    Both "/" and "\\" are treated as path separators. The extension is
    everything from the last "." of the file name itself, so a dot in a
    directory name is never mistaken for one. When ignoreUnalloc is truthy
    "-u" is appended. list is unused and kept for signature compatibility.
    """
    # Whichever separator occurs last ends the directory part (-1 if none)
    lastSep = max(inFile.rfind("/"), inFile.rfind("\\"))
    name = inFile[lastSep + 1:]
    extStart = name.rfind(".")
    if extStart != -1:
        name = name[:extStart]
    if ignoreUnalloc:
        name += "-u"
    return name

def markError(errString, inFile):
    """Record errString under inFile in results, flag the run as failed and print the message."""
    global hadErrors
    hadErrors = True
    results.setdefault(inFile, []).append(errString)
    print(errString)

def wgetcwd():
    """Return the current working directory as converted by `cygpath -m`.

    Trailing whitespace (the newline cygpath prints) is stripped.
    """
    cygpath = subprocess.Popen(["cygpath", "-m", os.getcwd()], stdout=subprocess.PIPE)
    converted = cygpath.communicate()[0]
    return converted.rstrip()

def wabspath(inFile):
    """Return inFile as converted by `cygpath -m`, trailing whitespace stripped.

    A path whose second character is ':' is handed to cygpath unchanged;
    any other path is first made absolute with os.path.abspath().
    """
    if inFile[1:2] == ":":
        target = inFile
    else:
        target = os.path.abspath(inFile)
    cygpath = subprocess.Popen(["cygpath", "-m", target], stdout=subprocess.PIPE)
    converted = cygpath.communicate()[0]
    return converted.rstrip()


def copyLogs(inFile, ignoreUnalloc, list):
  """Copy the test userdir's var/log directory to <outDir>/<case name>/logs."""
  logSource = os.path.join("..", "build", "test", "qa-functional", "work", "userdir0", "var", "log")
  logTarget = os.path.join(outDir, imageName(inFile, ignoreUnalloc, list), "logs")
  shutil.copytree(logSource, logTarget)

def testFile(image, rebuild, ignoreUnalloc, list, delete, verbose, search):
  """Run the full regression cycle for one image.

  Files whose type imageType() does not recognise are skipped silently.
  Otherwise the image is ingested, the logs are copied, the gold standard
  is optionally rebuilt, results and reports are compared with gold, and
  the warning/exception counts found in the logs are printed.

  image         -- path of the image file
  rebuild       -- rebuild the gold database and report from this run first
  ignoreUnalloc -- forwarded to the ingest and used in the case name
  list          -- truthy when the image comes from a config file
  delete        -- remove the KeywordSearch directory after the ingest
  verbose       -- also report warnings found in the logs
  search        -- only report exceptions containing this string (or None)
  """
  if imageType(image) == ImgType.UNKNOWN:
    return

  testAddImageIngest(image, ignoreUnalloc, list)
  copyLogs(image, ignoreUnalloc, list)
  if rebuild:
    clearGoldDir(image, ignoreUnalloc, list)
    copyTestToGold(image, ignoreUnalloc, list)
    copyReportToGold(image, ignoreUnalloc, list)
  if delete:
    deleteKeywordFiles(image, ignoreUnalloc, list)
  testCompareToGold(image, ignoreUnalloc, list)
  testCompareReports(image, ignoreUnalloc, list)

  name = imageName(image, ignoreUnalloc, list)

  # Scan the logs once; a second scan would write every entry to
  # CommonLog twice.
  numWarnings, numExceptions = reportErrors(name, verbose, search)
  if verbose:
    if numWarnings != 0:
      print("Warnings: " + str(numWarnings))
    else:
      print("There were no warnings in the logs")
  if numExceptions != 0:
    print("Exceptions: " + str(numExceptions))
  else:
    print("There were no exceptions in the logs\n\n")


def usage():
  """Return the command-line help text as a single string."""
  text = (
    "  Usage: ./regression.py [-f FILE] OR [-l CONFIG] [OPTIONS] \n\n"
    "  Run the RegressionTest.java file, and compare the result with a gold standard \n\n"
    "  When the -f flag is set, this script only tests the image given by FILE.\n"
    "  By default, it tests every image in ./input/\n\n"
    "  An indexed NSRL database is expected at ./input/nsrl.txt-md5.idx,\n"
    "  and an indexed notable hash database at ./input/notablehashes.txt-md5.idx\n"
    "  In addition, any keywords to search for must be in ./input/notablekeywords.xml\n\n"
    "  When the -l flag is set, the script looks for a config.xml file of the given name\n"
    "  where images are stored. The above input files may be outsourced to a different folder\n"
    "  via the config file. For usage notes please see the example \"config.xml\" in\n"
    # The blank line keeps the option list from running into this paragraph
    "  the /script folder.\n\n"
    "    Options:\n\n"
    "    -r, --rebuild\t\tRebuild the gold standards from the test results for each image.\n\n"
    "    -i, --ignore\t\tIgnores the ./input directory when searching for files. ONLY use in combination with a config file.\n\n"
    "    -u, --unallocated\t\tIgnores unallocated space when ingesting. Faster, but less accurate results.\n\n"
    "    -d, --delete\t\tDisables the deletion of Solr indexing directory generated by Ingest. Uses more disk space.\n\n"
    "    -v, --verbose\t\tPrints logged warnings after each ingest\n\n"
    "    -e, --exception\t\t when followed by a string, will only print out the exceptions that occurred that contain the string. Case sensitive."
  )
  return text

def main():
  """Parse the command line, run the requested tests and print a summary.

  -f FILE tests a single image, -l CONFIG tests the images listed in a
  config file, and unless disabled every file in inDir is tested as well.
  An option that needs a value but is the last argument is treated like an
  unknown argument: the usage text is printed and the input-directory pass
  is turned off. Finally all errors recorded through markError() are
  printed, grouped by the key they were recorded under.
  """
  global CommonLog
  global inDir

  rebuild = False
  single = False
  ignoreInput = False
  ignoreUnalloc = False
  list = False
  test = True
  delete = True
  verbose = False
  search = None
  Config = None   # DOM of the file pointed to by --list
  cwd = wgetcwd()

  def closeCommonLog():
    # CommonLog is still the "" placeholder when testFile() skipped the
    # image (unknown type), so only close a real file object.
    if hasattr(CommonLog, "close"):
      CommonLog.close()

  argi = 1
  while argi < len(sys.argv):
    arg = sys.argv[argi]
    hasValue = argi + 1 < len(sys.argv)
    if arg == "-f" and hasValue: #check for single
      single = True
      test = False
      argi += 1
      image = sys.argv[argi]
      print("Running on single image: " + image)
    elif (arg == "-l" or arg == "--list") and hasValue:    #check for config file
      list = True
      argi += 1
      configArg = sys.argv[argi]
      #check for file in ./
      if os.path.isfile(os.path.join("./", configArg)):
        Config = parse(os.path.join("./", configArg))
      #else check if it is a specified path
      elif os.path.exists(wabspath(configArg)):
        Config = parse(configArg)
      else:
        print(configArg)
        print(wabspath(configArg))
        markError("Ran with " + arg +" but no such file exists", arg)
        # Without a config there is nothing meaningful to run
        test = False
    elif (arg == "--rebuild") or (arg == "-r"):  #check for rebuild flag
      rebuild = True
      print("Running in REBUILD mode")
    elif (arg == "-u") or (arg == "--unallocated"):    #check for ignore unallocated space flag
      ignoreUnalloc = True
      print("Ignoring unallocated space")
    elif (arg == "--ignore") or (arg == "-i"):
      ignoreInput = True
      print("Ignoring /script/input directory")
    elif (arg == "--delete") or (arg == "-d"):
      delete = False
      print("Will not delete keyword search Solr index")
    elif (arg == "--verbose") or (arg == "-v"):
      verbose = True
      print("Will print warnings and exceptions from logs")
    elif ((arg == "--exception") or (arg == "-e")) and hasValue:
      argi += 1
      search = sys.argv[argi]
      print("Searching for exceptions which include \"" + search + "\"")
    else:
      test = False
      print(usage())
    argi += 1

  if single:
    testFile(image, rebuild, ignoreUnalloc, list, delete, verbose, search)
    closeCommonLog()

  # Config is None when the file given with -l could not be found; that
  # error has already been recorded above.
  if list and Config is not None:
    listImages = []
    errors = 0
    #there should only be one indir element in the config
    inDir = Config.getElementsByTagName("indir")[0].getAttribute("value").encode()
    print("=========================================")
    for element in Config.getElementsByTagName("image"):
      elem = element.getAttribute("value").encode()
      proc2 = subprocess.Popen(("cygpath", "-u", elem), stdout=subprocess.PIPE)
      out2 = proc2.communicate()[0].rstrip()
      if os.path.exists(out2) and os.path.isfile(out2):
        listImages.append(elem)
      else:
        print(out2 + " is not a valid path or is not an image\n")
        errors += 1
    print("Illegal files specified: " + str(errors))
    print("Ingesting " + str(len(listImages)) + " images from config")
    print("=========================================")
    for image in listImages:
      testFile(image, rebuild, ignoreUnalloc, list, delete, verbose, search)
      closeCommonLog()
    if not ignoreInput:
      inDir = os.path.join(cwd, "input")
      test = True

  if test:
    for inFile in os.listdir(inDir):
      testFile(os.path.join(inDir, inFile), rebuild, ignoreUnalloc, list, delete, verbose, search)
      closeCommonLog()

  print("**********************************************")
  if hadErrors:
    print("Tests complete: There were errors")
  else:
    print("Tests complete: All tests passed")

  for k, v in results.items():
    print(k)
    for errString in v:
      print("\t%s" % errString)

# Run the regression tests only when executed as a script, not when imported.
if __name__ == "__main__":
  main()