From 5e4ed7d047b81717ef0c121f926e44b0ee55a9d6 Mon Sep 17 00:00:00 2001
From: Brian Carrier
Date: Tue, 19 Aug 2014 10:37:12 -0700
Subject: [PATCH 1/2] refactored search engine, used list to make it more deterministic

---
 .../SearchEngineURLQueryAnalyzer.java         | 230 ++++++++++--------
 1 file changed, 125 insertions(+), 105 deletions(-)

diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/SearchEngineURLQueryAnalyzer.java b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/SearchEngineURLQueryAnalyzer.java
index 582240e371..f322e828c9 100644
--- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/SearchEngineURLQueryAnalyzer.java
+++ b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/SearchEngineURLQueryAnalyzer.java
@@ -24,9 +24,7 @@ import java.io.UnsupportedEncodingException;
 import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
+import java.util.List;
 import java.util.logging.Level;
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
@@ -45,7 +43,7 @@ import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
 import org.sleuthkit.datamodel.BlackboardAttribute;
 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
 import org.sleuthkit.datamodel.Content;
-import org.sleuthkit.datamodel.TskException;
+import org.sleuthkit.datamodel.TskCoreException;
 import org.w3c.dom.Document;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.NodeList;
@@ -65,13 +63,8 @@ class SearchEngineURLQueryAnalyzer extends Extract {
 
     private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
     private static final String XMLFILE = "SEUQAMappings.xml"; //NON-NLS
    private static final String XSDFILE = "SearchEngineSchema.xsd"; //NON-NLS
-    private static String[] searchEngineNames;
     private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
-    private static Document xmlinput;
-    private static final SearchEngineURLQueryAnalyzer.SearchEngine NullEngine = new SearchEngineURLQueryAnalyzer.SearchEngine(
-            NbBundle.getMessage(SearchEngineURLQueryAnalyzer.class, "SearchEngineURLQueryAnalyzer.engineName.none"),
-            NbBundle.getMessage(SearchEngineURLQueryAnalyzer.class, "SearchEngineURLQueryAnalyzer.domainSubStr.none"),
-            new HashMap<String, String>());
+
     private Content dataSource;
     private IngestJobContext context;
 
@@ -79,52 +72,100 @@ class SearchEngineURLQueryAnalyzer extends Extract {
         moduleName = NbBundle.getMessage(ExtractIE.class, "SearchEngineURLQueryAnalyzer.moduleName.text");
     }
 
+    /**
+     * Stores the regular-expression and plain-string forms of a key.
+     * The key in the case of "?q=foo" would be "?q=".
+     */
+    private static class KeyPair {
+        private final String key;
+        private final String keyRegExp;
+
+        KeyPair(String key, String keyRegExp) {
+            this.key = key;
+            this.keyRegExp = keyRegExp;
+        }
+
+        String getKey() {
+            return key;
+        }
+
+        String getKeyRegExp() {
+            return keyRegExp;
+        }
+    }
+
     private static class SearchEngine {
-        private String _engineName;
-        private String _domainSubstring;
-        private Map<String, String> _splits;
-        private int _count;
+        private final String engineName;
+        private final String domainSubstring;
+        private final List<KeyPair> keyPairs;
+        private int count;
 
-        SearchEngine(String engineName, String domainSubstring, Map<String, String> splits) {
-            _engineName = engineName;
-            _domainSubstring = domainSubstring;
-            _splits = splits;
-            _count = 0;
+        SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
+            this.engineName = engineName;
+            this.domainSubstring = domainSubstring;
+            this.keyPairs = keyPairs;
+            count = 0;
         }
 
         void increment() {
-            ++_count;
+            ++count;
         }
 
         String getEngineName() {
-            return _engineName;
+            return engineName;
         }
 
         String getDomainSubstring() {
-            return _domainSubstring;
+            return domainSubstring;
         }
 
         int getTotal() {
-            return _count;
+            return count;
        }
 
-        Set<Map.Entry<String, String>> getSplits() {
-            return this._splits.entrySet();
+        /**
+         * Get the keys used in the URL to denote the search term.
+         * @return the ordered list of key pairs for this engine
+         */
+        List<KeyPair> getKeys() {
+            return this.keyPairs;
         }
 
         @Override
         public String toString() {
             String split = " ";
-            for (Map.Entry<String, String> kvp : getSplits()) {
-                split = split + "[ " + kvp.getKey() + " :: " + kvp.getValue() + " ]" + ", ";
+            for (KeyPair kp : keyPairs) {
+                split = split + "[ " + kp.getKey() + " :: " + kp.getKeyRegExp() + " ]" + ", ";
             }
             return NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.toString",
-                    _engineName, _domainSubstring, _count, split);
+                    engineName, domainSubstring, count, split);
         }
     }
 
-    private void createEngines() {
+    private void loadConfigFile() throws IngestModuleException {
+        Document xmlinput;
+        try {
+            String path = PlatformUtil.getUserConfigDirectory() + File.separator + XMLFILE;
+            File f = new File(path);
+            logger.log(Level.INFO, "Load successful"); //NON-NLS
+            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+            DocumentBuilder db = dbf.newDocumentBuilder();
+            xmlinput = db.parse(f);
+
+            if (!XMLUtil.xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
+                logger.log(Level.WARNING, "Error loading Search Engines: could not validate against [" + XSDFILE + "], results may not be accurate."); //NON-NLS
+            }
+
+        } catch (IOException e) {
+            throw new IngestModuleException("Was not able to load SEUQAMappings.xml: " + e.getLocalizedMessage()); //NON-NLS
+        } catch (ParserConfigurationException pce) {
+            throw new IngestModuleException("Unable to build XML parser: " + pce.getLocalizedMessage()); //NON-NLS
+        } catch (SAXException sxe) {
+            throw new IngestModuleException("Unable to parse XML file: " + sxe.getLocalizedMessage()); //NON-NLS
+        }
+
         NodeList nlist = xmlinput.getElementsByTagName("SearchEngine"); //NON-NLS
         SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines = new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
         for (int i = 0; i < nlist.getLength(); i++) {
@@ -132,16 +173,17 @@ class SearchEngineURLQueryAnalyzer extends Extract {
             String EngineName = nnm.getNamedItem("engine").getNodeValue(); //NON-NLS
             String EnginedomainSubstring = nnm.getNamedItem("domainSubstring").getNodeValue(); //NON-NLS
 
-            Map<String, String> splits = new HashMap<>();
+            List<KeyPair> keys = new ArrayList<>();
+
             NodeList listSplits = xmlinput.getElementsByTagName("splitToken"); //NON-NLS
             for (int k = 0; k < listSplits.getLength(); k++) {
                 if (listSplits.item(k).getParentNode().getAttributes().getNamedItem("engine").getNodeValue().equals(EngineName)) { //NON-NLS
-                    splits.put(listSplits.item(k).getAttributes().getNamedItem("plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem("regexToken").getNodeValue()); //NON-NLS
+                    keys.add(new KeyPair(listSplits.item(k).getAttributes().getNamedItem("plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem("regexToken").getNodeValue())); //NON-NLS
                 }
             }
 
-            SearchEngineURLQueryAnalyzer.SearchEngine Se = new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, splits);
+            SearchEngineURLQueryAnalyzer.SearchEngine Se = new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
             //System.out.println("Search Engine: " + Se.toString());
             listEngines[i] = Se;
         }
@@ -153,28 +195,22 @@ class SearchEngineURLQueryAnalyzer extends Extract {
      * belongs to.
      *
      * @param domain domain as part of the URL
-     * @return supported search engine the domain belongs to, if any
+     * @return supported search engine the domain belongs to, or null if no match is found
      *
      */
-    private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngine(String domain) {
+    private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngineFromUrl(String domain) {
         if (engines == null) {
-            return SearchEngineURLQueryAnalyzer.NullEngine;
+            return null;
         }
-        for (int i = 0; i < engines.length; i++) {
-            if (domain.contains(engines[i].getDomainSubstring())) {
-                return engines[i];
+        for (SearchEngine engine : engines) {
+            if (domain.contains(engine.getDomainSubstring())) {
+                return engine;
             }
         }
-        return SearchEngineURLQueryAnalyzer.NullEngine;
+        return null;
     }
 
-    private void getSearchEngineNames() {
-        String[] listNames = new String[engines.length];
-        for (int i = 0; i < listNames.length; i++) {
-            listNames[i] = engines[i]._engineName;
-        }
-        searchEngineNames = listNames;
-    }
+
 
     /**
      * Attempts to extract the query from a URL.
@@ -182,12 +218,12 @@ class SearchEngineURLQueryAnalyzer extends Extract {
      * @param url The URL string to be dissected.
      * @return The extracted search query.
      */
-    private String extractSearchEngineQuery(String url) {
-        String x = "NoQuery"; //NON-NLS
-        SearchEngineURLQueryAnalyzer.SearchEngine eng = getSearchEngine(url);
-        for (Map.Entry<String, String> kvp : eng.getSplits()) {
-            if (url.contains(kvp.getKey())) {
-                x = split2(url, kvp.getValue());
+    private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
+        String x = ""; //NON-NLS
+
+        for (KeyPair kp : eng.getKeys()) {
+            if (url.contains(kp.getKey())) {
+                x = getValue(url, kp.getKeyRegExp());
                 break;
             }
         }
@@ -204,38 +240,48 @@ class SearchEngineURLQueryAnalyzer extends Extract {
      * Splits URLs based on a delimeter (key). .contains() and .split()
      *
      * @param url The URL to be split
-     * @param value the delimeter value used to split the URL into its search
+     * @param regExpKey the delimiter value used to split the URL into its search
      *            token, extracted from the xml.
      * @return The extracted search query
      *
     */
-    private String split2(String url, String value) {
-        String basereturn = "NoQuery"; //NON-NLS
-        String v = value;
+    private String getValue(String url, String regExpKey) {
+        /* NOTE: This doesn't seem like the most wonderful way to do this, but we have data
+         * that has a bunch of bogus URLs. Such as:
+         * - Multiple google "q=" terms, including one after a "#" tag. Google used the last one.
+         * - Search/query part of the URL starting with a '#'.
+         * Attempts at more formal approaches of splitting on the "?" and then on "&" resulted in missing things.
+         */
+        String value = ""; //NON-NLS
+        String v = regExpKey;
         //Want to determine if string contains a string based on splitkey, but we want to split the string on splitKeyConverted due to regex
-        if (value.contains("\\?")) {
-            v = value.replace("\\?", "?");
+        if (regExpKey.contains("\\?")) {
+            v = regExpKey.replace("\\?", "?");
         }
         String[] sp = url.split(v);
         if (sp.length >= 2) {
             if (sp[sp.length - 1].contains("&")) {
-                basereturn = sp[sp.length - 1].split("&")[0];
+                value = sp[sp.length - 1].split("&")[0];
             } else {
-                basereturn = sp[sp.length - 1];
+                value = sp[sp.length - 1];
             }
         }
-        return basereturn;
+        return value;
     }
 
-    private void getURLs() {
+    private void findSearchQueries() {
         int totalQueries = 0;
         try {
             //from blackboard_artifacts
             Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getMatchingArtifacts("WHERE (`artifact_type_id` = '" + ARTIFACT_TYPE.TSK_WEB_BOOKMARK.getTypeID() //NON-NLS
                     + "' OR `artifact_type_id` = '" + ARTIFACT_TYPE.TSK_WEB_HISTORY.getTypeID() + "') "); //List of every 'web_history' and 'bookmark' artifact NON-NLS
             logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
-            getAll:
+
             for (BlackboardArtifact artifact : listArtifacts) {
+                if (context.isJobCancelled()) {
+                    break;        //User canceled the process.
+                }
+
                 //initializing default attributes
                 String query = "";
                 String searchEngineDomain = "";
@@ -254,25 +300,21 @@ class SearchEngineURLQueryAnalyzer extends Extract {
                     continue;
                 }
 
-                SearchEngineURLQueryAnalyzer.SearchEngine se = NullEngine;
+                SearchEngineURLQueryAnalyzer.SearchEngine se = null;
                 //from blackboard_attributes
                 Collection<BlackboardAttribute> listAttributes = currentCase.getSleuthkitCase().getMatchingAttributes("Where `artifact_id` = " + artifact.getArtifactID()); //NON-NLS
-                getAttributes:
+
                 for (BlackboardAttribute attribute : listAttributes) {
-                    if (context.isJobCancelled()) {
-                        break getAll; //User cancled the process.
-                    }
                     if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL.getTypeID()) {
                         final String urlString = attribute.getValueString();
-                        se = getSearchEngine(urlString);
-                        if (!se.equals(NullEngine)) {
-                            query = extractSearchEngineQuery(attribute.getValueString());
-                            if (query.equals("NoQuery") || query.equals("")) {   //False positive match, artifact was not a query. NON-NLS
-                                break getAttributes;
-                            }
-                        } else if (se.equals(NullEngine)) {
-                            break getAttributes; //could not determine type. Will move onto next artifact
-                        }
+                        se = getSearchEngineFromUrl(urlString);
+                        if (se == null)
+                            break;
+
+                        query = extractSearchEngineQuery(se, attribute.getValueString());
+                        if (query.equals("")) //False positive match, artifact was not a query. NON-NLS
+                            break;
+
                     } else if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID()) {
                         browser = attribute.getValueString();
                     } else if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID()) {
@@ -282,7 +324,7 @@ class SearchEngineURLQueryAnalyzer extends Extract {
                     }
                 }
 
-                if (!se.equals(NullEngine) && !query.equals("NoQuery") && !query.equals("")) { //NON-NLS
+                if (se != null && !query.equals("")) { //NON-NLS
                     Collection<BlackboardAttribute> bbattributes = new ArrayList<>();
                     bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID(),
                                                              NbBundle.getMessage(this.getClass(),
@@ -301,7 +343,7 @@ class SearchEngineURLQueryAnalyzer extends Extract {
                     ++totalQueries;
                 }
             }
-        } catch (TskException e) {
+        } catch (TskCoreException e) {
             logger.log(Level.SEVERE, "Encountered error retrieving artifacts for search engine queries", e); //NON-NLS
         } finally {
             if (context.isJobCancelled()) {
@@ -329,46 +371,24 @@ class SearchEngineURLQueryAnalyzer extends Extract {
     public void process(Content dataSource, IngestJobContext context) {
         this.dataSource = dataSource;
         this.context = context;
-        this.getURLs();
+        this.findSearchQueries();
         logger.log(Level.INFO, "Search Engine stats: \n{0}", getTotals()); //NON-NLS
     }
 
     @Override
     void init() throws IngestModuleException {
         try {
-            PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, false);
-            init2();
+            PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, true);
         } catch (IOException e) {
             String message = NbBundle
                     .getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.init.exception.msg", XMLFILE);
             logger.log(Level.SEVERE, message, e);
             throw new IngestModuleException(message);
         }
+
+        loadConfigFile();
     }
 
-    private void init2() {
-        try {
-            String path = PlatformUtil.getUserConfigDirectory() + File.separator + XMLFILE;
-            File f = new File(path);
-            logger.log(Level.INFO, "Load successful"); //NON-NLS
-            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
-            DocumentBuilder db = dbf.newDocumentBuilder();
-            Document xml = db.parse(f);
-            xmlinput = xml;
-
-            if (!XMLUtil.xmlIsValid(xml, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
-                logger.log(Level.WARNING, "Error loading Search Engines: could not validate against [" + XSDFILE + "], results may not be accurate."); //NON-NLS
-            }
-            createEngines();
-            getSearchEngineNames();
-        } catch (IOException e) {
-            logger.log(Level.SEVERE, "Was not able to load SEUQAMappings.xml", e); //NON-NLS
-        } catch (ParserConfigurationException pce) {
-            logger.log(Level.SEVERE, "Unable to build XML parser", pce); //NON-NLS
-        } catch (SAXException sxe) {
-            logger.log(Level.SEVERE, "Unable to parse XML file", sxe); //NON-NLS
-        }
-    }
+
 
     @Override
     public void complete() {
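
To see how the refactored pieces fit together, here is a minimal standalone
sketch of the new list-driven extraction flow. The engine keys, sample URL,
and class name below are made up for illustration; only the splitting logic
mirrors the patched getValue():

    import java.util.ArrayList;
    import java.util.List;

    public class QueryExtractionSketch {

        // Same idea as the patched getValue(): split the URL on the regexp
        // form of the key, keep the last fragment, and trim at the first '&'.
        static String getValue(String url, String regExpKey) {
            String value = "";
            String[] sp = url.split(regExpKey);
            if (sp.length >= 2) {
                String last = sp[sp.length - 1];
                value = last.contains("&") ? last.split("&")[0] : last;
            }
            return value;
        }

        public static void main(String[] args) {
            // Ordered key pairs (plain form, regexp form), standing in for
            // the KeyPair objects loaded from SEUQAMappings.xml.
            List<String[]> keyPairs = new ArrayList<>();
            keyPairs.add(new String[]{"?q=", "\\?q="});
            keyPairs.add(new String[]{"&q=", "&q="});

            String url = "http://www.google.com/search?q=sleuth+kit&hl=en";
            for (String[] kp : keyPairs) {
                if (url.contains(kp[0])) {                    // match on the plain key
                    System.out.println(getValue(url, kp[1])); // prints: sleuth+kit
                    break;                                    // first match wins
                }
            }
        }
    }

Iteration order over a HashMap is unspecified, so two runs could pick
different keys when a URL matched more than one; a List preserves the order
of the XML file, which is what the commit message means by "more
deterministic".
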
From 357d43f4e3ecacebd7b031cb2a855eafe71ec276 Mon Sep 17 00:00:00 2001
From: Brian Carrier
Date: Tue, 19 Aug 2014 13:54:33 -0400
Subject: [PATCH 2/2] Forced regression scripts to use python 3

---
 test/script/regression.py | 5 +++++
 test/script/tskdbdiff.py  | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/test/script/regression.py b/test/script/regression.py
index 57ff4382a1..e4a58cbf7b 100755
--- a/test/script/regression.py
+++ b/test/script/regression.py
@@ -1931,6 +1931,11 @@ class OS:
     LINUX, MAC, WIN, CYGWIN = range(4)
 
 if __name__ == "__main__":
+
+    if sys.hexversion < 0x03000000:
+        print("Python 3 required")
+        sys.exit(1)
+
     global SYS
     if _platform == "linux" or _platform == "linux2":
         SYS = OS.LINUX
diff --git a/test/script/tskdbdiff.py b/test/script/tskdbdiff.py
index e107515861..e5a2ec2dc0 100755
--- a/test/script/tskdbdiff.py
+++ b/test/script/tskdbdiff.py
@@ -398,5 +398,9 @@ def main():
 
 
 
 if __name__ == "__main__":
+
+    if sys.hexversion < 0x03000000:
+        print("Python 3 required")
+        sys.exit(1)
+
     sys.exit(main())
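
A note on the guard added to both scripts: sys.hexversion packs the
interpreter version into a single integer (major, minor, micro, release
level, serial), so comparing against 0x03000000 accepts any Python 3.x.
A small standalone illustration of the encoding, with an equivalent
version_info test:

    import sys

    # Layout is 0xMMmmppRS; e.g. Python 3.4.1 final is 0x030401f0.
    major = (sys.hexversion >> 24) & 0xFF
    minor = (sys.hexversion >> 16) & 0xFF
    print("running Python", major, minor)

    # The hexversion check in the patch behaves the same as:
    if sys.version_info[0] < 3:
        print("Python 3 required")
        sys.exit(1)
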