From 5e4ed7d047b81717ef0c121f926e44b0ee55a9d6 Mon Sep 17 00:00:00 2001
From: Brian Carrier
Date: Tue, 19 Aug 2014 10:37:12 -0700
Subject: [PATCH 1/2] refactored search engine, used list to make it more deterministic

---
 .../SearchEngineURLQueryAnalyzer.java         | 230 ++++++++++--------
 1 file changed, 125 insertions(+), 105 deletions(-)

diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/SearchEngineURLQueryAnalyzer.java b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/SearchEngineURLQueryAnalyzer.java
index 582240e371..f322e828c9 100644
--- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/SearchEngineURLQueryAnalyzer.java
+++ b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/SearchEngineURLQueryAnalyzer.java
@@ -24,9 +24,7 @@ import java.io.UnsupportedEncodingException;
 import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
+import java.util.List;
 import java.util.logging.Level;
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
@@ -45,7 +43,7 @@ import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
 import org.sleuthkit.datamodel.BlackboardAttribute;
 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
 import org.sleuthkit.datamodel.Content;
-import org.sleuthkit.datamodel.TskException;
+import org.sleuthkit.datamodel.TskCoreException;
 import org.w3c.dom.Document;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.NodeList;
@@ -65,13 +63,8 @@ class SearchEngineURLQueryAnalyzer extends Extract {
 
     private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
     private static final String XMLFILE = "SEUQAMappings.xml"; //NON-NLS
    private static final String XSDFILE = "SearchEngineSchema.xsd"; //NON-NLS
-    private static String[] searchEngineNames;
     private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
-    private static Document xmlinput;
-    private static final SearchEngineURLQueryAnalyzer.SearchEngine NullEngine = new SearchEngineURLQueryAnalyzer.SearchEngine(
-            NbBundle.getMessage(SearchEngineURLQueryAnalyzer.class, "SearchEngineURLQueryAnalyzer.engineName.none"),
-            NbBundle.getMessage(SearchEngineURLQueryAnalyzer.class, "SearchEngineURLQueryAnalyzer.domainSubStr.none"),
-            new HashMap<String, String>());
+
     private Content dataSource;
     private IngestJobContext context;
 
@@ -79,52 +72,100 @@ class SearchEngineURLQueryAnalyzer extends Extract {
         moduleName = NbBundle.getMessage(ExtractIE.class, "SearchEngineURLQueryAnalyzer.moduleName.text");
     }
 
+    /**
+     * Stores the regular-expression and plain-string forms of a key.
+     * The key in the case of "?q=foo" would be "?q=".
+     */
+    private static class KeyPair {
+        private final String key;
+        private final String keyRegExp;
+
+        KeyPair(String key, String keyRegExp) {
+            this.key = key;
+            this.keyRegExp = keyRegExp;
+        }
+
+        String getKey() {
+            return key;
+        }
+
+        String getKeyRegExp() {
+            return keyRegExp;
+        }
+    }
+
     private static class SearchEngine {
-        private String _engineName;
-        private String _domainSubstring;
-        private Map<String, String> _splits;
-        private int _count;
+        private final String engineName;
+        private final String domainSubstring;
+        private final List<KeyPair> keyPairs;
+        private int count;
 
-        SearchEngine(String engineName, String domainSubstring, Map<String, String> splits) {
-            _engineName = engineName;
-            _domainSubstring = domainSubstring;
-            _splits = splits;
-            _count = 0;
+        SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
+            this.engineName = engineName;
+            this.domainSubstring = domainSubstring;
+            this.keyPairs = keyPairs;
+            count = 0;
         }
 
         void increment() {
-            ++_count;
+            ++count;
         }
 
         String getEngineName() {
-            return _engineName;
+            return engineName;
         }
 
         String getDomainSubstring() {
-            return _domainSubstring;
+            return domainSubstring;
         }
 
         int getTotal() {
-            return _count;
+            return count;
        }
 
-        Set<Map.Entry<String, String>> getSplits() {
-            return this._splits.entrySet();
+        /**
+         * Get the keys used in the URL to denote the search term.
+         * @return the ordered list of key pairs for this engine
+         */
+        List<KeyPair> getKeys() {
+            return this.keyPairs;
         }
 
         @Override
         public String toString() {
             String split = " ";
-            for (Map.Entry<String, String> kvp : getSplits()) {
-                split = split + "[ " + kvp.getKey() + " :: " + kvp.getValue() + " ]" + ", ";
+            for (KeyPair kp : keyPairs) {
+                split = split + "[ " + kp.getKey() + " :: " + kp.getKeyRegExp() + " ]" + ", ";
             }
             return NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.toString",
-                    _engineName, _domainSubstring, _count, split);
+                    engineName, domainSubstring, count, split);
         }
     }
 
-    private void createEngines() {
+    private void loadConfigFile() throws IngestModuleException {
+        Document xmlinput;
+        try {
+            String path = PlatformUtil.getUserConfigDirectory() + File.separator + XMLFILE;
+            File f = new File(path);
+            logger.log(Level.INFO, "Load successful"); //NON-NLS
+            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+            DocumentBuilder db = dbf.newDocumentBuilder();
+            xmlinput = db.parse(f);
+
+            if (!XMLUtil.xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
+                logger.log(Level.WARNING, "Error loading Search Engines: could not validate against [" + XSDFILE + "], results may not be accurate."); //NON-NLS
+            }
+
+        } catch (IOException e) {
+            throw new IngestModuleException("Was not able to load SEUQAMappings.xml: " + e.getLocalizedMessage()); //NON-NLS
+        } catch (ParserConfigurationException pce) {
+            throw new IngestModuleException("Unable to build XML parser: " + pce.getLocalizedMessage()); //NON-NLS
+        } catch (SAXException sxe) {
+            throw new IngestModuleException("Unable to parse XML file: " + sxe.getLocalizedMessage()); //NON-NLS
+        }
+
         NodeList nlist = xmlinput.getElementsByTagName("SearchEngine"); //NON-NLS
         SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines = new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
         for (int i = 0; i < nlist.getLength(); i++) {
@@ -132,16 +173,17 @@ class SearchEngineURLQueryAnalyzer extends Extract {
             String EngineName = nnm.getNamedItem("engine").getNodeValue(); //NON-NLS
             String EnginedomainSubstring = nnm.getNamedItem("domainSubstring").getNodeValue(); //NON-NLS
 
-            Map<String, String> splits = new HashMap<>();
+            List<KeyPair> keys = new ArrayList<>();
+
             NodeList listSplits = xmlinput.getElementsByTagName("splitToken"); //NON-NLS
             for (int k = 0; k < listSplits.getLength(); k++) {
                 if (listSplits.item(k).getParentNode().getAttributes().getNamedItem("engine").getNodeValue().equals(EngineName)) { //NON-NLS
-                    splits.put(listSplits.item(k).getAttributes().getNamedItem("plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem("regexToken").getNodeValue()); //NON-NLS
+                    keys.add(new KeyPair(listSplits.item(k).getAttributes().getNamedItem("plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem("regexToken").getNodeValue())); //NON-NLS
                 }
             }
 
-            SearchEngineURLQueryAnalyzer.SearchEngine Se = new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, splits);
+            SearchEngineURLQueryAnalyzer.SearchEngine Se = new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
             //System.out.println("Search Engine: " + Se.toString());
             listEngines[i] = Se;
         }
@@ -153,28 +195,22 @@ class SearchEngineURLQueryAnalyzer extends Extract {
      * belongs to.
      *
      * @param domain domain as part of the URL
-     * @return supported search engine the domain belongs to, if any
+     * @return supported search engine the domain belongs to, or null if no match is found
      *
      */
-    private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngine(String domain) {
+    private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngineFromUrl(String domain) {
         if (engines == null) {
-            return SearchEngineURLQueryAnalyzer.NullEngine;
+            return null;
         }
-        for (int i = 0; i < engines.length; i++) {
-            if (domain.contains(engines[i].getDomainSubstring())) {
-                return engines[i];
+        for (SearchEngine engine : engines) {
+            if (domain.contains(engine.getDomainSubstring())) {
+                return engine;
             }
         }
-        return SearchEngineURLQueryAnalyzer.NullEngine;
+        return null;
     }
 
-    private void getSearchEngineNames() {
-        String[] listNames = new String[engines.length];
-        for (int i = 0; i < listNames.length; i++) {
-            listNames[i] = engines[i]._engineName;
-        }
-        searchEngineNames = listNames;
-    }
+
 
     /**
      * Attempts to extract the query from a URL.
@@ -182,12 +218,12 @@ class SearchEngineURLQueryAnalyzer extends Extract {
      * @param url The URL string to be dissected.
      * @return The extracted search query.
      */
-    private String extractSearchEngineQuery(String url) {
-        String x = "NoQuery"; //NON-NLS
-        SearchEngineURLQueryAnalyzer.SearchEngine eng = getSearchEngine(url);
-        for (Map.Entry<String, String> kvp : eng.getSplits()) {
-            if (url.contains(kvp.getKey())) {
-                x = split2(url, kvp.getValue());
+    private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
+        String x = ""; //NON-NLS
+
+        for (KeyPair kp : eng.getKeys()) {
+            if (url.contains(kp.getKey())) {
+                x = getValue(url, kp.getKeyRegExp());
                 break;
             }
         }
@@ -204,38 +240,48 @@ class SearchEngineURLQueryAnalyzer extends Extract {
      * Splits URLs based on a delimeter (key). .contains() and .split()
      *
      * @param url The URL to be split
-     * @param value the delimeter value used to split the URL into its search
+     * @param regExpKey the delimiter value used to split the URL into its search
      *            token, extracted from the xml.
      * @return The extracted search query
      *
     */
-    private String split2(String url, String value) {
-        String basereturn = "NoQuery"; //NON-NLS
-        String v = value;
+    private String getValue(String url, String regExpKey) {
+        /* NOTE: This doesn't seem like the most wonderful way to do this, but we have data
+         * that has a bunch of bogus URLs. Such as:
+         * - Multiple google "q=" terms, including one after a "#" tag. Google used the last one.
+         * - Search/query part of the URL starting with a '#'.
+         * Attempts at more formal approaches of splitting on the "?" and then on "&" resulted in missing things.
+         */
+        String value = ""; //NON-NLS
+        String v = regExpKey;
         //Want to determine if string contains a string based on splitkey, but we want to split the string on splitKeyConverted due to regex
-        if (value.contains("\\?")) {
-            v = value.replace("\\?", "?");
+        if (regExpKey.contains("\\?")) {
+            v = regExpKey.replace("\\?", "?");
         }
         String[] sp = url.split(v);
         if (sp.length >= 2) {
             if (sp[sp.length - 1].contains("&")) {
-                basereturn = sp[sp.length - 1].split("&")[0];
+                value = sp[sp.length - 1].split("&")[0];
             } else {
-                basereturn = sp[sp.length - 1];
+                value = sp[sp.length - 1];
             }
         }
-        return basereturn;
+        return value;
     }
 
-    private void getURLs() {
+    private void findSearchQueries() {
         int totalQueries = 0;
         try {
             //from blackboard_artifacts
             Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getMatchingArtifacts("WHERE (`artifact_type_id` = '" + ARTIFACT_TYPE.TSK_WEB_BOOKMARK.getTypeID() //NON-NLS
                     + "' OR `artifact_type_id` = '" + ARTIFACT_TYPE.TSK_WEB_HISTORY.getTypeID() + "') "); //List of every 'web_history' and 'bookmark' artifact NON-NLS
             logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
-            getAll:
+
             for (BlackboardArtifact artifact : listArtifacts) {
+                if (context.isJobCancelled()) {
+                    break;        //User canceled the process.
+                }
+
                 //initializing default attributes
                 String query = "";
                 String searchEngineDomain = "";
@@ -254,25 +300,21 @@ class SearchEngineURLQueryAnalyzer extends Extract {
                     continue;
                 }
 
-                SearchEngineURLQueryAnalyzer.SearchEngine se = NullEngine;
+                SearchEngineURLQueryAnalyzer.SearchEngine se = null;
                 //from blackboard_attributes
                 Collection<BlackboardAttribute> listAttributes = currentCase.getSleuthkitCase().getMatchingAttributes("Where `artifact_id` = " + artifact.getArtifactID()); //NON-NLS
-                getAttributes:
+
                 for (BlackboardAttribute attribute : listAttributes) {
-                    if (context.isJobCancelled()) {
-                        break getAll; //User cancled the process.
-                    }
                     if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL.getTypeID()) {
                         final String urlString = attribute.getValueString();
-                        se = getSearchEngine(urlString);
-                        if (!se.equals(NullEngine)) {
-                            query = extractSearchEngineQuery(attribute.getValueString());
-                            if (query.equals("NoQuery") || query.equals("")) {   //False positive match, artifact was not a query. NON-NLS
-                                break getAttributes;
-                            }
-                        } else if (se.equals(NullEngine)) {
-                            break getAttributes; //could not determine type. Will move onto next artifact
-                        }
+                        se = getSearchEngineFromUrl(urlString);
+                        if (se == null)
+                            break;
+
+                        query = extractSearchEngineQuery(se, attribute.getValueString());
+                        if (query.equals("")) //False positive match, artifact was not a query. NON-NLS
+                            break;
+
                     } else if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID()) {
                         browser = attribute.getValueString();
                     } else if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID()) {
@@ -282,7 +324,7 @@ class SearchEngineURLQueryAnalyzer extends Extract {
                     }
                 }
 
-                if (!se.equals(NullEngine) && !query.equals("NoQuery") && !query.equals("")) { //NON-NLS
+                if (se != null && !query.equals("")) { //NON-NLS
                     Collection<BlackboardAttribute> bbattributes = new ArrayList<>();
                     bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID(),
                                                              NbBundle.getMessage(this.getClass(),
@@ -301,7 +343,7 @@ class SearchEngineURLQueryAnalyzer extends Extract {
                     ++totalQueries;
                 }
             }
-        } catch (TskException e) {
+        } catch (TskCoreException e) {
             logger.log(Level.SEVERE, "Encountered error retrieving artifacts for search engine queries", e); //NON-NLS
         } finally {
             if (context.isJobCancelled()) {
@@ -329,46 +371,24 @@ class SearchEngineURLQueryAnalyzer extends Extract {
     public void process(Content dataSource, IngestJobContext context) {
         this.dataSource = dataSource;
         this.context = context;
-        this.getURLs();
+        this.findSearchQueries();
         logger.log(Level.INFO, "Search Engine stats: \n{0}", getTotals()); //NON-NLS
     }
 
     @Override
     void init() throws IngestModuleException {
         try {
-            PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, false);
-            init2();
+            PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, true);
         } catch (IOException e) {
             String message = NbBundle
                     .getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.init.exception.msg", XMLFILE);
             logger.log(Level.SEVERE, message, e);
             throw new IngestModuleException(message);
         }
+
+        loadConfigFile();
     }
 
-    private void init2() {
-        try {
-            String path = PlatformUtil.getUserConfigDirectory() + File.separator + XMLFILE;
-            File f = new File(path);
-            logger.log(Level.INFO, "Load successful"); //NON-NLS
-            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
-            DocumentBuilder db = dbf.newDocumentBuilder();
-            Document xml = db.parse(f);
-            xmlinput = xml;
-
-            if (!XMLUtil.xmlIsValid(xml, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
-                logger.log(Level.WARNING, "Error loading Search Engines: could not validate against [" + XSDFILE + "], results may not be accurate."); //NON-NLS
-            }
-            createEngines();
-            getSearchEngineNames();
-        } catch (IOException e) {
-            logger.log(Level.SEVERE, "Was not able to load SEUQAMappings.xml", e); //NON-NLS
-        } catch (ParserConfigurationException pce) {
-            logger.log(Level.SEVERE, "Unable to build XML parser", pce); //NON-NLS
-        } catch (SAXException sxe) {
-            logger.log(Level.SEVERE, "Unable to parse XML file", sxe); //NON-NLS
-        }
-    }
+
 
     @Override
     public void complete() {
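
To see how the refactored pieces fit together, here is a minimal standalone
sketch of the new list-driven extraction flow. The engine keys, sample URL,
and class name below are made up for illustration; only the splitting logic
mirrors the patched getValue():

    import java.util.ArrayList;
    import java.util.List;

    public class QueryExtractionSketch {

        // Same idea as the patched getValue(): split the URL on the regexp
        // form of the key, keep the last fragment, and trim at the first '&'.
        static String getValue(String url, String regExpKey) {
            String value = "";
            String[] sp = url.split(regExpKey);
            if (sp.length >= 2) {
                String last = sp[sp.length - 1];
                value = last.contains("&") ? last.split("&")[0] : last;
            }
            return value;
        }

        public static void main(String[] args) {
            // Ordered key pairs (plain form, regexp form), standing in for
            // the KeyPair objects loaded from SEUQAMappings.xml.
            List<String[]> keyPairs = new ArrayList<>();
            keyPairs.add(new String[]{"?q=", "\\?q="});
            keyPairs.add(new String[]{"&q=", "&q="});

            String url = "http://www.google.com/search?q=sleuth+kit&hl=en";
            for (String[] kp : keyPairs) {
                if (url.contains(kp[0])) {                    // match on the plain key
                    System.out.println(getValue(url, kp[1])); // prints: sleuth+kit
                    break;                                    // first match wins
                }
            }
        }
    }

Iteration order over a HashMap is unspecified, so two runs could pick
different keys when a URL matched more than one; a List preserves the order
of the XML file, which is what the commit message means by "more
deterministic".
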
From 357d43f4e3ecacebd7b031cb2a855eafe71ec276 Mon Sep 17 00:00:00 2001
From: Brian Carrier
Date: Tue, 19 Aug 2014 13:54:33 -0400
Subject: [PATCH 2/2] Forced regression scripts to use python 3

---
 test/script/regression.py | 5 +++++
 test/script/tskdbdiff.py  | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/test/script/regression.py b/test/script/regression.py
index 57ff4382a1..e4a58cbf7b 100755
--- a/test/script/regression.py
+++ b/test/script/regression.py
@@ -1931,6 +1931,11 @@ class OS:
     LINUX, MAC, WIN, CYGWIN = range(4)
 
 if __name__ == "__main__":
+
+    if sys.hexversion < 0x03000000:
+        print("Python 3 required")
+        sys.exit(1)
+
     global SYS
     if _platform == "linux" or _platform == "linux2":
         SYS = OS.LINUX
diff --git a/test/script/tskdbdiff.py b/test/script/tskdbdiff.py
index e107515861..e5a2ec2dc0 100755
--- a/test/script/tskdbdiff.py
+++ b/test/script/tskdbdiff.py
@@ -398,5 +398,9 @@ def main():
 
 
 
 if __name__ == "__main__":
+
+    if sys.hexversion < 0x03000000:
+        print("Python 3 required")
+        sys.exit(1)
+
     sys.exit(main())
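
A note on the guard added to both scripts: sys.hexversion packs the
interpreter version into a single integer (major, minor, micro, release
level, serial), so comparing against 0x03000000 accepts any Python 3.x.
A small standalone illustration of the encoding, with an equivalent
version_info test:

    import sys

    # Layout is 0xMMmmppRS; e.g. Python 3.4.1 final is 0x030401f0.
    major = (sys.hexversion >> 24) & 0xFF
    minor = (sys.hexversion >> 16) & 0xFF
    print("running Python", major, minor)

    # The hexversion check in the patch behaves the same as:
    if sys.version_info[0] < 3:
        print("Python 3 required")
        sys.exit(1)
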