/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012-2014 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.recentactivity;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.logging.Level;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.PlatformUtil;
import org.sleuthkit.autopsy.coreutils.XMLUtil;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestModule.IngestModuleException;
import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.ingest.ModuleDataEvent;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * This recent activity extractor attempts to extract web queries from major
 * search engines by querying the blackboard for web history and bookmark
 * artifacts, and extracting search text from them.
 *
 * To add search engines, edit SearchEngines.xml under RecentActivity.
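 *
 * Based on how loadConfigFile() reads the mappings file, an entry is expected
 * to look roughly like the sketch below. The engine name, domain substring,
 * and tokens are made-up examples, not values from the shipped file; for keys
 * containing regex metacharacters such as '?', the regexToken carries the
 * escaped form that getValue() expects.
 *
 * <pre>
 * {@code
 * <SearchEngine engine="Example" domainSubstring="example.">
 *     <splitToken plainToken="search=" regexToken="search=" />
 * </SearchEngine>
 * }
 * </pre>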
 */
class SearchEngineURLQueryAnalyzer extends Extract {

    private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
    private static final String XMLFILE = "SEUQAMappings.xml"; //NON-NLS
    private static final String XSDFILE = "SearchEngineSchema.xsd"; //NON-NLS
    private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
    private Content dataSource;
    private IngestJobContext context;

    SearchEngineURLQueryAnalyzer() {
        moduleName = NbBundle.getMessage(ExtractIE.class, "SearchEngineURLQueryAnalyzer.moduleName.text");
    }

    /**
     * Stores the regular expression and plain (non-regular expression) pair of
     * keys. The key in the case of "?q=foo" would be "?q=".
     */
    private static class KeyPair {

        private final String key;
        private final String keyRegExp;

        KeyPair(String key, String keyRegExp) {
            this.key = key;
            this.keyRegExp = keyRegExp;
        }

        String getKey() {
            return key;
        }

        String getKeyRegExp() {
            return keyRegExp;
        }
    }

    private static class SearchEngine {

        private final String engineName;
        private final String domainSubstring;
        private final List<KeyPair> keyPairs;
        private int count;

        SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
            this.engineName = engineName;
            this.domainSubstring = domainSubstring;
            this.keyPairs = keyPairs;
            count = 0;
        }

        void increment() {
            ++count;
        }

        String getEngineName() {
            return engineName;
        }

        String getDomainSubstring() {
            return domainSubstring;
        }

        int getTotal() {
            return count;
        }

        /**
         * Get the key values used in the URL to denote the search term.
         *
         * @return the list of key pairs for this engine
         */
        List<KeyPair> getKeys() {
            return this.keyPairs;
        }

        @Override
        public String toString() {
            String split = " ";
            for (KeyPair kp : keyPairs) {
                split = split + "[ " + kp.getKey() + " :: " + kp.getKeyRegExp() + " ]" + ", ";
            }
            return NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.toString",
                    engineName, domainSubstring, count, split);
        }
    }

    private void loadConfigFile() throws IngestModuleException {
        Document xmlinput;
        try {
            String path = PlatformUtil.getUserConfigDirectory() + File.separator + XMLFILE;
            File f = new File(path);
            logger.log(Level.INFO, "Load successful"); //NON-NLS
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            DocumentBuilder db = dbf.newDocumentBuilder();
            xmlinput = db.parse(f);

            if (!XMLUtil.xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
                logger.log(Level.WARNING, "Error loading Search Engines: could not validate against [" + XSDFILE + "], results may not be accurate."); //NON-NLS
            }
        } catch (IOException e) {
            throw new IngestModuleException("Was not able to load SEUQAMappings.xml: " + e.getLocalizedMessage()); //NON-NLS
        } catch (ParserConfigurationException pce) {
            throw new IngestModuleException("Unable to build XML parser: " + pce.getLocalizedMessage()); //NON-NLS
        } catch (SAXException sxe) {
            throw new IngestModuleException("Unable to parse XML file: " + sxe.getLocalizedMessage()); //NON-NLS
        }

        NodeList nlist = xmlinput.getElementsByTagName("SearchEngine"); //NON-NLS
        SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines = new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
        for (int i = 0; i < nlist.getLength(); i++) {
            NamedNodeMap nnm = nlist.item(i).getAttributes();

            String EngineName = nnm.getNamedItem("engine").getNodeValue(); //NON-NLS
            String EnginedomainSubstring = nnm.getNamedItem("domainSubstring").getNodeValue(); //NON-NLS
            List<KeyPair> keys = new ArrayList<>();

            NodeList listSplits = xmlinput.getElementsByTagName("splitToken"); //NON-NLS
            for (int k = 0; k < listSplits.getLength(); k++) {
                if (listSplits.item(k).getParentNode().getAttributes().getNamedItem("engine").getNodeValue().equals(EngineName)) { //NON-NLS
                    keys.add(new KeyPair(listSplits.item(k).getAttributes().getNamedItem("plainToken").getNodeValue(),
                            listSplits.item(k).getAttributes().getNamedItem("regexToken").getNodeValue())); //NON-NLS
                }
            }

            SearchEngineURLQueryAnalyzer.SearchEngine Se = new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
            //System.out.println("Search Engine: " + Se.toString());
            listEngines[i] = Se;
        }
        engines = listEngines;
    }

    /**
     * Returns which of the supported SearchEngines, if any, the given string
     * belongs to.
     *
     * @param domain domain as part of the URL
     *
     * @return supported search engine the domain belongs to, or null if no
     *         match is found
     */
    private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngineFromUrl(String domain) {
        if (engines == null) {
            return null;
        }
        for (SearchEngine engine : engines) {
            if (domain.contains(engine.getDomainSubstring())) {
                return engine;
            }
        }
        return null;
    }

    /**
     * Attempts to extract the query from a URL.
     *
     * @param eng The supported search engine the URL belongs to.
     * @param url The URL string to be dissected.
     *
     * @return The extracted search query.
     */
    private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
        String x = ""; //NON-NLS

        for (KeyPair kp : eng.getKeys()) {
            if (url.contains(kp.getKey())) {
                x = getValue(url, kp.getKeyRegExp());
                break;
            }
        }
        try { //try to decode the URL
            String decoded = URLDecoder.decode(x, "UTF-8"); //NON-NLS
            return decoded;
        } catch (UnsupportedEncodingException uee) { //if it fails, return the encoded string
            logger.log(Level.FINE, "Error during URL decoding ", uee); //NON-NLS
            return x;
        }
    }

    /**
     * Splits URLs based on a delimiter (key), using String.contains() and
     * String.split().
     *
     * @param url       The URL to be split.
     * @param regExpKey The delimiter value used to split the URL into its
     *                  search token, extracted from the XML file.
     *
     * @return The extracted search query.
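     *
     * Illustrative example (hypothetical values, not taken from the shipped
     * mappings file): for url {@code "http://www.example.com/search?q=foo&bar=baz"}
     * and an effective split pattern matching "?q=", {@code url.split()} yields
     * {@code ["http://www.example.com/search", "foo&bar=baz"]}; the last element
     * is then split on {@code "&"}, giving "foo".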
     */
    private String getValue(String url, String regExpKey) {
        /* NOTE: This doesn't seem like the most wonderful way to do this, but
         * we have data that has a bunch of bogus URLs. Such as:
         * - Multiple google "q=" terms, including one after a "#" tag. Google used the last one.
         * - Search/query part of the URL starting with a '#'.
         * Attempts at more formal approaches of splitting on the "?" and then
         * on "&" resulted in missing things.
         */
        String value = ""; //NON-NLS
        String v = regExpKey;
        //Want to determine if the string contains a string based on splitkey, but we want to split the string on splitKeyConverted due to regex
        if (regExpKey.contains("\\?")) {
            v = regExpKey.replace("\\?", "?");
        }
        String[] sp = url.split(v);
        if (sp.length >= 2) {
            if (sp[sp.length - 1].contains("&")) {
                value = sp[sp.length - 1].split("&")[0];
            } else {
                value = sp[sp.length - 1];
            }
        }
        return value;
    }
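    /**
     * Queries the blackboard for web history and bookmark artifacts belonging
     * to the current data source, attempts to extract a search query from the
     * URL attribute of each one, and creates a TSK_WEB_SEARCH_QUERY artifact
     * for every successful extraction.
     */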
    private void findSearchQueries() {
        int totalQueries = 0;
        try {
            //from blackboard_artifacts
            Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getMatchingArtifacts(
                    "WHERE (`artifact_type_id` = '" + ARTIFACT_TYPE.TSK_WEB_BOOKMARK.getTypeID() //NON-NLS
                    + "' OR `artifact_type_id` = '" + ARTIFACT_TYPE.TSK_WEB_HISTORY.getTypeID() + "') "); //List of every 'web_history' and 'bookmark' artifact NON-NLS
            logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS

            for (BlackboardArtifact artifact : listArtifacts) {
                if (context.isJobCancelled()) {
                    break; //User cancelled the process.
                }

                //initializing default attributes
                String query = "";
                String searchEngineDomain = "";
                String browser = "";
                long last_accessed = -1;

                long fileId = artifact.getObjectID();
                boolean isFromSource = tskCase.isFileFromSource(dataSource, fileId);
                if (!isFromSource) {
                    //File was from a different dataSource. Skipping.
                    continue;
                }

                AbstractFile file = tskCase.getAbstractFileById(fileId);
                if (file == null) {
                    continue;
                }

                SearchEngineURLQueryAnalyzer.SearchEngine se = null;
                //from blackboard_attributes
                Collection<BlackboardAttribute> listAttributes = currentCase.getSleuthkitCase().getMatchingAttributes(
                        "Where `artifact_id` = " + artifact.getArtifactID()); //NON-NLS

                for (BlackboardAttribute attribute : listAttributes) {
                    if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL.getTypeID()) {
                        final String urlString = attribute.getValueString();
                        se = getSearchEngineFromUrl(urlString);
                        if (se == null) {
                            break;
                        }

                        query = extractSearchEngineQuery(se, attribute.getValueString());
                        if (query.equals("")) {
                            //False positive match, artifact was not a query. NON-NLS
                            break;
                        }
                    } else if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID()) {
                        browser = attribute.getValueString();
                    } else if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID()) {
                        searchEngineDomain = attribute.getValueString();
                    } else if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()) {
                        last_accessed = attribute.getValueLong();
                    }
                }

                if (se != null && !query.equals("")) { //NON-NLS
                    Collection<BlackboardAttribute> bbattributes = new ArrayList<>();
                    bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID(),
                            NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
                    bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT.getTypeID(),
                            NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
                    bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID(),
                            NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
                    bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID(),
                            NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
                    this.addArtifact(ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY, file, bbattributes);
                    se.increment();
                    ++totalQueries;
                }
            }
        } catch (TskCoreException e) {
            logger.log(Level.SEVERE, "Encountered error retrieving artifacts for search engine queries", e); //NON-NLS
        } finally {
            if (context.isJobCancelled()) {
                logger.info("Operation terminated by user."); //NON-NLS
            }
            IngestServices.getInstance().fireModuleDataEvent(new ModuleDataEvent(
                    NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.parentModuleName.noSpace"),
                    BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY));
            logger.log(Level.INFO, "Extracted {0} queries from the blackboard", totalQueries); //NON-NLS
        }
    }
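    /**
     * Builds a per-engine summary of how many queries were extracted, one
     * engine per line in the form "engineName : count". For example, a line
     * might read "Example : 3" (the engine names depend on the loaded
     * mappings file).
     */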
    private String getTotals() {
        String total = "";
        if (engines == null) {
            return total;
        }
        for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
            total += se.getEngineName() + " : " + se.getTotal() + "\n";
        }
        return total;
    }

    @Override
    public void process(Content dataSource, IngestJobContext context) {
        this.dataSource = dataSource;
        this.context = context;
        this.findSearchQueries();
        logger.log(Level.INFO, "Search Engine stats: \n{0}", getTotals()); //NON-NLS
    }

    @Override
    void init() throws IngestModuleException {
        try {
            PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, true);
        } catch (IOException e) {
            String message = NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.init.exception.msg", XMLFILE);
            logger.log(Level.SEVERE, message, e);
            throw new IngestModuleException(message);
        }
        loadConfigFile();
    }

    @Override
    public void complete() {
        logger.info("Search Engine URL Query Analyzer has completed."); //NON-NLS
    }
}