Added SearchEngineURLQueryAnalyzer submodule in Recent Activity.

A new tab under Extracted Content, "Web Search Engine Query", holds extracted search query information. Currently supports the following engines: Google, Bing, Yahoo, Baidu, Soso, Sogou, Yandex, Parset, Parseek, Linkestan, Biglobe, Youdao.
2025-07-06 21:00:22 +00:00 · 2012-07-31 15:07:04 -04:00 · 2012-07-31 15:07:04 -04:00 · 4c7645692b
commit 4c7645692b
parent 6cff0f074d
5 changed files with 402 additions and 2 deletions
--- a/CoreComponentInterfaces/src/org/sleuthkit/autopsy/images/searchquery.png
+++ b/CoreComponentInterfaces/src/org/sleuthkit/autopsy/images/searchquery.png
--- a/DataModel/src/org/sleuthkit/autopsy/datamodel/ArtifactTypeNode.java
+++ b/DataModel/src/org/sleuthkit/autopsy/datamodel/ArtifactTypeNode.java
@ -97,6 +97,9 @@ public class ArtifactTypeNode extends AbstractNode implements DisplayableItemNod
                return "recent_docs.png";
            case TSK_DEVICE_ATTACHED:
                return "usb_devices.png";
+            case TSK_WEB_SEARCH_QUERY:
+                return "searchquery.png";
+
        }
        return "artifact-icon.png";
    }
--- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/RAImageIngestService.java
+++ b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/RAImageIngestService.java
@ -47,6 +47,7 @@ public final class RAImageIngestService implements IngestServiceImage {
    private Firefox ffre = null;
    private Chrome chre = null;
    private ExtractIE eere = null;
+    private SearchEngineURLQueryAnalyzer usq = null;

    //public constructor is required
    //as multiple instances are created for processing multiple images simultaneously
@ -69,6 +70,7 @@ public final class RAImageIngestService implements IngestServiceImage {
        modules.add(ffre);
        modules.add(chre);
        modules.add(eere);
+        modules.add(usq);
        managerProxy.postMessage(IngestMessage.createMessage(++messageId, MessageType.INFO, this, "Started " + image.getName()));
        controller.switchToDeterminate(modules.size());
        controller.progress(0);
@ -131,6 +133,7 @@ public final class RAImageIngestService implements IngestServiceImage {
        this.chre = new Chrome();
        this.eree = new ExtractRegistry();
        this.ffre = new Firefox();
+        this.usq = new SearchEngineURLQueryAnalyzer();

    }

--- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/SearchEngineURLQueryAnalyzer.java
+++ b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/SearchEngineURLQueryAnalyzer.java
@ -0,0 +1,391 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.recentactivity;
+
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.logging.Level;
+import javax.swing.JPanel;
+import org.sleuthkit.autopsy.ingest.IngestImageWorkerController;
+import org.sleuthkit.autopsy.ingest.IngestManagerProxy;
+import org.sleuthkit.autopsy.ingest.IngestServiceImage;
+import org.sleuthkit.autopsy.ingest.ServiceDataEvent;
+import org.sleuthkit.datamodel.BlackboardArtifact;
+import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
+import org.sleuthkit.datamodel.BlackboardAttribute;
+import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
+import org.sleuthkit.datamodel.FsContent;
+import org.sleuthkit.datamodel.Image;
+
+
+
+public class SearchEngineURLQueryAnalyzer extends Extract implements IngestServiceImage{
+    protected String moduleName = "SEUQA"; //module name reported by getName()
+
+    //Engines this analyzer recognizes; NONE is what getSearchEngine() returns when a URL matches no known engine
+    private static enum SearchEngine {NONE, Google, Bing, Yahoo, Baidu, Sogou, Soso, Yandex, Youdao, Biglobe, Linkestan, Parseek, Parset};
+
+    
+    
+    
+            
+           
+    SearchEngineURLQueryAnalyzer(){ //package-private constructor; the body is empty, nothing is initialized here
+
+
+}
+    
+    
+    /**
+     * Determines which supported search engine, if any, a URL belongs to.
+     * <p>
+     * The text in front of the first ".com", ".ru" or ".ne.jp" is inspected for
+     * the engine name. The suffixes are matched literally; they are not treated
+     * as regular expressions, so a host such as "welcome.baidu.com" is not cut
+     * short at "lcom".
+     *
+     * @param domain URL (or host) string to classify; may be null
+     * @return the matching engine, or SearchEngine.NONE when nothing matches
+     */
+    private SearchEngine getSearchEngine(String domain){
+        if(domain == null){
+            return SearchEngine.NONE;
+        }
+
+        int comAt = domain.indexOf(".com");
+        if(comAt >= 0){
+            //Everything in front of the first ".com". The names tested below must
+            //therefore not contain ".com" themselves, otherwise they can never match.
+            String host = domain.substring(0, comAt);
+            if(host.contains(".baidu")){
+                return SearchEngine.Baidu;
+            }
+            else if(host.contains(".bing")){
+                return SearchEngine.Bing;
+            }
+            else if(host.contains(".yahoo")){
+                return SearchEngine.Yahoo;
+            }
+            else if(host.contains(".google")){
+                return SearchEngine.Google;
+            }
+            else if(host.contains(".youdao")){
+                return SearchEngine.Youdao;
+            }
+            else if(host.contains(".soso")){
+                return SearchEngine.Soso;
+            }
+            else if(host.contains(".sogou")){
+                return SearchEngine.Sogou;
+            }
+            else if(host.contains(".linkestan")){
+                return SearchEngine.Linkestan;
+            }
+            else if(host.contains(".parseek")){
+                return SearchEngine.Parseek;
+            }
+            else if(host.contains(".parset")){
+                return SearchEngine.Parset;
+            }
+        }
+        else if(domain.contains(".ru")){
+            //substring() cannot fail here, unlike indexing into a possibly empty split() result
+            String host = domain.substring(0, domain.indexOf(".ru"));
+            if(host.contains("yandex")){
+                return SearchEngine.Yandex;
+            }
+        }
+        else if(domain.contains(".ne.jp")){
+            String host = domain.substring(0, domain.indexOf(".ne.jp"));
+            if(host.contains("biglobe")){
+                return SearchEngine.Biglobe;
+            }
+        }
+        return SearchEngine.NONE;
+    }
+    
+    /**
+     * Pulls the search terms out of a URL for the given search engine.
+     * <p>
+     * Each engine has its own query parameter(s); the first one found in the
+     * URL is handed to split2() to cut out its value.
+     *
+     * @param se  engine the URL was attributed to
+     * @param url full URL to inspect
+     * @return the URL-decoded search terms; the still-encoded terms when they
+     *         cannot be decoded; or the literal string "NULL" when the URL
+     *         carries none of the engine's query parameters
+     */
+    private String extractSearchEngineQuery(SearchEngine se, String url){
+        //"NULL" is the "no query found" marker that split2() returns and getURLs() tests for.
+        //Starting from it (instead of "") keeps URLs without any known parameter from
+        //being reported as an empty search.
+        String x = "NULL";
+
+        switch(se){
+            //English Search Engines
+            case Google:
+                if(url.contains("?q=")){
+                    x = split2(url, "\\?q=");
+                }
+                else {
+                    x = split2(url, "&q=");
+                }
+                break;
+            case Yahoo:
+                x = split2(url, "\\?p=");
+                break;
+            case Bing:
+                x = split2(url, "\\?q=");
+                break;
+
+            //Chinese Search Engines
+            case Baidu:
+                if(url.contains("?wd=")){
+                    x = split2(url, "\\?wd=");
+                }
+                else if(url.contains("?kw=")){
+                    x = split2(url, "\\?kw=");
+                }
+                else if(url.contains("baidu.com/q?") || url.contains("baidu.com/m?") || url.contains("baidu.com/i?")){
+                    x = split2(url, "word=");
+                }
+                else if (url.contains("/qw=") || url.contains("?qw=")){
+                    //plain key: "\\qw=" is not a valid regular expression and never matched
+                    x = split2(url, "qw=");
+                }
+                else if (url.contains("bs=")){
+                    x = split2(url, "&bs=");
+                }
+                break;
+            case Sogou:
+                x = split2(url, "query=");
+                break;
+            case Soso:
+                if(url.contains("p=S")){
+                    x = split2(url, "p=S");
+                }
+                else if (url.contains("?w=")){
+                    x = split2(url, "\\?w=");
+                }
+                else {
+                    x = split2(url, "&w=");
+                }
+                break;
+            case Youdao:
+                if(url.contains("search?q=")){
+                    x = split2(url, "\\?q=");
+                }
+                else if (url.contains("?i=")){
+                    x = split2(url, "\\?i=");
+                }
+                break;
+
+            //Russian Search Engines
+            case Yandex:
+                if(url.contains("?text=")){
+                    x = split2(url, "\\?text=");
+                }
+                else{
+                    x = split2(url, "&text=");
+                }
+                break;
+
+            //Japanese Search Engines
+            case Biglobe:
+                if(url.contains("?search=")){
+                    x = split2(url, "\\?search=");
+                }
+                else if(url.contains("?q=")){
+                    x = split2(url, "\\?q=");
+                }
+                else if(url.contains("/key/")){
+                    x = split2(url, "/key/");
+                }
+                else if (url.contains("&q=")){
+                    x = split2(url, "&q=");
+                }
+                break;
+
+            //Persian & Arabic Search Engines
+            case Linkestan:
+                x = split2(url, "\\?psearch=");
+                break;
+            case Parseek:
+                x = split2(url, "\\?q=");
+                break;
+            case Parset:
+                x = split2(url, "\\?Keyword=");
+                break;
+
+            default: //NONE: nothing to extract
+                break;
+        }
+
+        if(x.equals("NULL")){
+            return x;
+        }
+
+        try{ //try to decode the url
+            return URLDecoder.decode(x, "UTF-8");
+        }
+        catch(UnsupportedEncodingException uee){ //if it fails, return the encoded string
+            logger.info("Error during URL decoding: " + uee);
+            return x;
+        }
+        catch(IllegalArgumentException iae){ //malformed %-escape in the URL: keep the raw text instead of failing
+            logger.info("Error during URL decoding: " + iae);
+            return x;
+        }
+    }
+
+    
+//for splitting urls based on a key. Abstracted out of extractSearchEngineQuery()  
+    /**
+     * Cuts the value that follows a key out of a URL.
+     * <p>
+     * The URL is split on the key and the last piece is taken; that piece is
+     * then truncated at its first '&amp;' so that following parameters are dropped.
+     *
+     * @param url      URL to search
+     * @param splitkey key in regular expression form, as used by String.split();
+     *                 a literal question mark has to be written as "\\?"
+     * @return the value after the key, or the literal string "NULL" when the
+     *         key is not present or splitting on it yields fewer than two pieces
+     */
+    private String split2(String url, String splitkey){
+        //String.contains() needs the literal key, String.split() needs the regex form
+        String literalKey = splitkey.replace("\\?", "?");
+        if(!url.contains(literalKey)){
+            return "NULL";
+        }
+
+        String[] sp = url.split(splitkey);
+        if(sp.length < 2){
+            return "NULL";
+        }
+
+        String value = sp[sp.length - 1];
+        //indexOf/substring instead of split("&")[0]: splitting a piece that is only "&"
+        //yields an empty array and indexing it would throw
+        int amp = value.indexOf('&');
+        return amp >= 0 ? value.substring(0, amp) : value;
+    }
+   
+    
+    
+    /**
+     * Scans the web history and bookmark artifacts already on the blackboard and
+     * adds a TSK_WEB_SEARCH_QUERY artifact for every URL that carries a search
+     * engine query.
+     * <p>
+     * Each artifact is handled on its own: a failure while analyzing one
+     * artifact is logged and the scan continues with the next one.
+     *
+     * @param image image used to look up the file behind each artifact
+     */
+    private void getURLs(Image image){
+        try{
+            //from blackboard_artifacts
+            ArrayList<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getMatchingArtifacts("WHERE (`artifact_type_id` = '2' OR `artifact_type_id` = '4') ");  //List of every 'web_history' and 'bookmark' artifact
+            for(BlackboardArtifact artifact : listArtifacts){
+                try{
+                    String query = "";
+                    String browser = "";
+                    SearchEngine se = SearchEngine.NONE;
+                    long last_accessed = -1;
+                    //from blackboard_attributes
+                    ArrayList<BlackboardAttribute> listAttributes = currentCase.getSleuthkitCase().getMatchingAttributes("Where `artifact_id` = " + artifact.getArtifactID());
+                    //NOTE(review): type ids 1 / 4 / 33 are presumably the URL, program name and
+                    //last accessed attributes - confirm against ATTRIBUTE_TYPE
+                    for(BlackboardAttribute attribute : listAttributes){
+                        if(attribute.getAttributeTypeID() == 1){
+                            se = getSearchEngine(attribute.getValueString());
+                            if(se.equals(SearchEngine.NONE)){
+                                break;    //could not determine type. Will move onto next artifact
+                            }
+                            query = extractSearchEngineQuery(se, attribute.getValueString());
+                            if(query.equals("NULL")){   //False positive match, artifact was not a query.
+                                break;
+                            }
+                        }
+                        else if(attribute.getAttributeTypeID() == 4){
+                            browser = attribute.getValueString();
+                        }
+                        else if(attribute.getAttributeTypeID() == 33){
+                            last_accessed = attribute.getValueLong();
+                        }
+                    }
+
+                    if(se.equals(SearchEngine.NONE) || query.equals("NULL")){
+                        continue;
+                    }
+
+                    //from tsk_files; looked up only for artifacts that really hold a search query
+                    FsContent fs = this.extractFiles(image, "select * from tsk_files where `obj_id` = '" + artifact.getObjectID() + "'").get(0);
+                    try{
+                        Collection<BlackboardAttribute> bbattributes = new ArrayList<BlackboardAttribute>();
+                        bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID(), "SEUQA", "Base URL", se.toString()));
+                        bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT.getTypeID(), "SEUQA", "Extracted search query", query));
+                        bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID(), "SEUQA", "Browser Name", browser));
+                        bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_LAST_ACCESSED.getTypeID(), "SEUQA", "Last Accessed", last_accessed));
+                        this.addArtifact(ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY, fs , bbattributes);
+                        //announce the artifact type that was actually added, and only once it exists
+                        IngestManagerProxy.fireServiceDataEvent(new ServiceDataEvent("RecentActivity", BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY));
+                    }
+                    catch(Exception e){
+                        //pass the exception itself so that its stack trace reaches the log
+                        logger.log(Level.SEVERE, "Error while add artifact at " + fs.toString(), e);
+                        this.addErrorMessage(this.getName() + ": Error while adding artifact");
+                    }
+                }
+                catch(Exception e){
+                    //a single bad artifact must not end the scan of the remaining ones
+                    logger.log(Level.WARNING, "Error while analyzing artifact " + artifact.getArtifactID() + " for search queries", e);
+                }
+            }
+        }
+        catch (Exception e){
+            logger.log(Level.WARNING, "Encountered error retrieving artifacts: " + e, e);
+        }
+    }
+    
+            
+    /**
+     * Runs the search query extraction for one image.
+     *
+     * @param image      image whose artifacts are analyzed
+     * @param controller ingest worker controller; not used by this module
+     */
+    @Override
+    public void process(Image image, IngestImageWorkerController controller) {
+        getURLs(image);
+    }
+
+    @Override
+    public void init(IngestManagerProxy managerProxy) { //not implemented: managerProxy is ignored
+        throw new UnsupportedOperationException("Not supported yet."); //always thrown; callers must not invoke init() on this module
+    }
+
+    @Override
+    public void complete() { //not implemented
+        throw new UnsupportedOperationException("Not supported yet."); //always thrown
+    }
+
+    @Override
+    public void stop() { //not implemented
+        throw new UnsupportedOperationException("Not supported yet."); //always thrown
+    }
+
+    @Override
+    public String getName() { //name of this module
+        return this.moduleName; //current value of the moduleName field
+    }
+
+    @Override
+    public String getDescription() { //fixed, human-readable summary of what the module does
+        return "Extracts search queries on major search engines";
+    }
+
+    @Override
+    public ServiceType getType() { //this module always reports the Image service type
+        return ServiceType.Image;
+    }
+
+    @Override
+    public boolean hasBackgroundJobsRunning() { //not implemented
+        throw new UnsupportedOperationException("Not supported yet."); //always thrown; no value is ever returned
+    }
+
+    @Override
+    public boolean hasSimpleConfiguration() { //the module offers no simple configuration
+        return false;
+    }
+
+    @Override
+    public boolean hasAdvancedConfiguration() { //the module offers no advanced configuration
+        return false;
+    }
+
+    @Override
+    public void saveSimpleConfiguration() {
+        //no-op: hasSimpleConfiguration() returns false, so there is nothing to save
+    }
+
+    @Override
+    public void saveAdvancedConfiguration() {
+        //no-op: hasAdvancedConfiguration() returns false, so there is nothing to save
+    }
+
+    @Override
+    public JPanel getSimpleConfiguration() { //no simple configuration panel exists
+        return null; //callers have to handle null
+    }
+
+    @Override
+    public JPanel getAdvancedConfiguration() { //no advanced configuration panel exists
+        return null; //callers have to handle null
+    }
+    
+    
+    
+}
--- a/docs/doxygen/regressionTesting.dox
+++ b/docs/doxygen/regressionTesting.dox
@ -74,9 +74,12 @@ Currently, the script supports four argument parameters:

 	-s {imgname}				: runs the test with a single image as specified by {imgname}. Must include the path, it does not read from input.
 	-r or --rebuild			: runs in REBUILD mode, see 5 below
-	-l {config} or --list {config}	: runs with the specified {config} file. If you do not specify a path to the file, it searches for it under ./. It will attempt to search for the file by the specified path name if surrounded in quotes. By default will also search ./input for images in addition. 
-	-u				: runs ignoring unallocated space. Useful for determining framework is operational. Appends "-u" to the output and gold folders for distinguishing them from others. Will automatically compare the right reports.
+	-l {config} or --list {config}	: runs with the specified {config} file. If you do not specify a path to the file, it searches for it under ./. It will attempt to search for the file by the specified path name if surrounded in quotes. By default will also search ./input for images in addition. Appends "-l" to the output and gold folders for distinguishing.
+	-u or --unallocated		: runs ignoring unallocated space. Useful for determining framework is operational. Appends "-u" to the output and gold folders for distinguishing them from others. Will automatically compare the right reports.
 	-i or --ignore			: Will ignore the ./input directory. Use only in combination with -l
+	-d or --delete			: Will not delete the keyword search Solr index upon ingest completion. Will consume more disk space.
+	-v or --verbose			: Prints logged warnings after each ingest
+	-e or --exception		: When followed by a string, will only print out the exceptions that occurred and that contain the string. Case sensitive.

 These can be run in any combination.