Interim checkin: first draft of artifact keyword search hit artifact generation

Richard Cordovano 2015-01-07 17:04:41 -05:00
parent dfe4c27d30
commit b992035188
3 changed files with 206 additions and 189 deletions

View File: org/sleuthkit/autopsy/keywordsearch/KeywordHit.java

@@ -18,45 +18,80 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
+import org.sleuthkit.autopsy.casemodule.Case;
+import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.BlackboardArtifact;
+import org.sleuthkit.datamodel.SleuthkitCase;
+import org.sleuthkit.datamodel.TskCoreException;
+
 /**
- * Stores the fact that file or an artifact associated had a keyword hit.
+ * Stores the fact that file or an artifact had a keyword hit.
+ * <p>
+ * Instances of this class are immutable, so they are thread-safe.
  */
 class KeywordHit {
 
-    private final String documentId;
+    private final String solrDocumentId;
     private final long objectId;
     private final int chunkId;
     private final String snippet;
+    private final AbstractFile file;
+    BlackboardArtifact artifact;
 
-    KeywordHit(String documentId, String snippet) {
-        this.documentId = documentId;
-        final int separatorIndex = documentId.indexOf(Server.ID_CHUNK_SEP);
+    KeywordHit(String solrDocumentId, String snippet) throws TskCoreException {
+        /**
+         * Store the Solr document id.
+         */
+        this.solrDocumentId = solrDocumentId;
+
+        /**
+         * Parse the Solr document id to get the object id and chunk id. There
+         * will only be a chunk if the text in the object was divided into
+         * chunks.
+         */
+        final int separatorIndex = solrDocumentId.indexOf(Server.ID_CHUNK_SEP);
         if (separatorIndex != -1) {
-            this.objectId = Long.parseLong(documentId.substring(0, separatorIndex));
-            this.chunkId = Integer.parseInt(documentId.substring(separatorIndex + 1));
+            this.objectId = Long.parseLong(solrDocumentId.substring(0, separatorIndex));
+            this.chunkId = Integer.parseInt(solrDocumentId.substring(separatorIndex + 1));
         } else {
-            this.objectId = Long.parseLong(documentId);
+            this.objectId = Long.parseLong(solrDocumentId);
             this.chunkId = 0;
         }
+
+        /**
+         * Look up the file associated with the keyword hit. If the high order
+         * bit of the object id is set, the hit was for an artifact. In this
+         * case, look up the artifact as well.
+         */
+        SleuthkitCase caseDb = Case.getCurrentCase().getSleuthkitCase();
+        long fileId;
+        if (this.objectId < 0) {
+            long artifactId = this.objectId - 0x8000000000000000L;
+            this.artifact = caseDb.getBlackboardArtifact(artifactId);
+            fileId = artifact.getObjectID();
+        } else {
+            fileId = this.objectId;
+        }
+        this.file = caseDb.getAbstractFileById(fileId);
+
+        /**
+         * Store the text snippet.
+         */
         this.snippet = snippet;
     }
 
-    String getDocumentId() {
-        return this.documentId;
+    String getSolrDocumentId() {
+        return this.solrDocumentId;
     }
 
     long getObjectId() {
         return this.objectId;
     }
 
-    int getChunkId() {
-        return this.chunkId;
+    boolean hasChunkId() {
+        return this.chunkId != 0;
     }
 
-    boolean isChunk() {
-        return this.chunkId != 0;
+    int getChunkId() {
+        return this.chunkId;
     }
 
     boolean hasSnippet() {
@@ -67,8 +102,16 @@ class KeywordHit {
         return this.snippet;
     }
 
+    AbstractFile getFile() {
+        return this.file;
+    }
+
+    BlackboardArtifact getArtifact() {
+        return this.artifact;
+    }
+
     @Override
-    public boolean equals(Object obj) {
+    public boolean equals(Object obj) { // RJCTODO: Fix
         if (null == obj) {
             return false;
         }
@@ -80,7 +123,7 @@ class KeywordHit {
     }
 
     @Override
-    public int hashCode() {
+    public int hashCode() { // RJCTODO: Fix
         int hash = 3;
         hash = 41 * hash + (int) this.objectId + this.chunkId;
         return hash;
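
A quick note on the id scheme the new constructor relies on: a Solr document id is either a bare object id, or an object id and a chunk id joined by Server.ID_CHUNK_SEP, and an object id with its high-order bit set denotes a blackboard artifact rather than a file. The following is a minimal, standalone sketch of that decoding with the SleuthkitCase lookups stripped out; the "_" separator and the sample ids are illustrative assumptions, not values taken from this commit.

/**
 * Illustrative sketch only: mirrors the Solr document id parsing in the
 * KeywordHit constructor, minus the case database lookups. "_" stands in
 * for Server.ID_CHUNK_SEP here.
 */
public class SolrDocumentIdSketch {

    private static final String CHUNK_SEP = "_"; // assumed stand-in for Server.ID_CHUNK_SEP

    static void describe(String solrDocumentId) {
        final long objectId;
        final int chunkId;
        final int sep = solrDocumentId.indexOf(CHUNK_SEP);
        if (sep != -1) {
            // "<objectId><sep><chunkId>": the indexed text was divided into chunks.
            objectId = Long.parseLong(solrDocumentId.substring(0, sep));
            chunkId = Integer.parseInt(solrDocumentId.substring(sep + 1));
        } else {
            // Bare object id: the text was indexed as a single document.
            objectId = Long.parseLong(solrDocumentId);
            chunkId = 0;
        }
        if (objectId < 0) {
            // High-order bit set: the hit is on a blackboard artifact. Subtracting
            // 0x8000000000000000L clears that bit and recovers the artifact id,
            // exactly as the constructor does before calling getBlackboardArtifact().
            long artifactId = objectId - 0x8000000000000000L;
            System.out.println(solrDocumentId + " -> artifact " + artifactId + ", chunk " + chunkId);
        } else {
            System.out.println(solrDocumentId + " -> file " + objectId + ", chunk " + chunkId);
        }
    }

    public static void main(String[] args) {
        describe("42_3");                   // a chunked file document
        describe("42");                     // an unchunked file document
        describe("-9223372036854775803_1"); // an artifact document (artifact id 5)
    }
}

Since solrDocumentId already pins down both the object and the chunk, the equals() and hashCode() overrides flagged with RJCTODO above could plausibly be keyed on that one field, but this draft leaves that choice open.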

View File: org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java

@@ -1,7 +1,7 @@
 /*
  * Autopsy Forensic Browser
  *
- * Copyright 2011-2014 Basis Technology Corp.
+ * Copyright 2011-2015 Basis Technology Corp.
  * Contact: carrier <at> sleuthkit <dot> org
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,13 +21,14 @@ package org.sleuthkit.autopsy.keywordsearch;
 import java.awt.EventQueue;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 import java.util.logging.Level;
 import org.openide.util.NbBundle;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import javax.swing.SwingWorker;
@@ -48,8 +49,6 @@ import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.BlackboardAttribute;
 import org.sleuthkit.datamodel.Content;
-import org.sleuthkit.datamodel.FsContent;
-import org.sleuthkit.datamodel.TskData.TSK_DB_FILES_TYPE_ENUM;
 
 /**
  * Node factory that performs the keyword search and creates children nodes for
@@ -92,7 +91,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
         this.viewer = viewer;
     }
-
     /**
      * call this at least for the parent Node, to make sure all common
      * properties are displayed as columns (since we are doing lazy child Node
@@ -132,86 +130,64 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
             initCommonProperties(map);
             final String query = queryRequest.getQueryString();
             setCommonProperty(map, CommonPropertyTypes.KEYWORD, query);
-            setCommonProperty(map, CommonPropertyTypes.REGEX, Boolean.valueOf(!queryRequest.getQuery().isLiteral()));
+            setCommonProperty(map, CommonPropertyTypes.REGEX, !queryRequest.getQuery().isLiteral());
             createFlatKeys(queryRequest, toPopulate);
         }
 
         return true;
     }
 
     /**
      *
      * @param queryRequest
      * @param toPopulate
      * @return
      */
-    protected boolean createFlatKeys(QueryRequest queryRequest, List<KeyValueQueryContent> toPopulate) {
+    private boolean createFlatKeys(QueryRequest queryRequest, List<KeyValueQueryContent> toPopulate) {
+        /**
+         * Check the validity of the requested query.
+         */
         final KeywordSearchQuery keywordSearchQuery = queryRequest.getQuery();
         if (!keywordSearchQuery.validate()) {
            //TODO mark the particular query node RED
            return false;
        }
 
-        //execute the query and get fscontents matching
+        /**
+         * Execute the requested query.
+         */
        QueryResults queryResults;
        try {
            queryResults = keywordSearchQuery.performQuery();
        } catch (NoOpenCoreException ex) {
-            logger.log(Level.WARNING, "Could not perform the query. ", ex); //NON-NLS
+            logger.log(Level.SEVERE, "Could not perform the query " + keywordSearchQuery.getQueryString(), ex); //NON-NLS
            return false;
        }
 
-        String listName = queryRequest.getQuery().getKeywordList().getName();
-        final boolean literal_query = keywordSearchQuery.isLiteral();
-        int resID = 0;
+        int id = 0;
        List<KeyValueQueryContent> tempList = new ArrayList<>();
-        final Map<AbstractFile, KeywordHit> uniqueFileMap = queryResults.getUniqueFiles();
-        for (final AbstractFile f : uniqueFileMap.keySet()) {
+        for (KeywordHit hit : getOneHitPerObject(queryResults)) {
+            /**
+             * Get file properties.
+             */
+            Map<String, Object> properties = new LinkedHashMap<>();
+            AbstractFile file = hit.getFile();
+            AbstractFsContentNode.fillPropertyMap(properties, file);
+
+            /**
+             * Add a snippet property, if available.
+             */
+            if (hit.hasSnippet()) {
+                setCommonProperty(properties, CommonPropertyTypes.CONTEXT, hit.getSnippet());
+            }
+
            //@@@ USE ConentHit in UniqueFileMap instead of the below search
            //get unique match result files
-            Map<String, Object> resMap = new LinkedHashMap<>();
            // BC: @@@ THis is really ineffecient. We should keep track of this when
            // we flattened the list of files to the unique files.
-            /* Find a keyword in that file so that we can generate a
-             * single snippet for it.
-             */
-            KeywordHit chit = uniqueFileMap.get(f);
-            if (chit.hasSnippet()) {
-                setCommonProperty(resMap, CommonPropertyTypes.CONTEXT, chit.getSnippet());
-            }
-            // boolean hitFound = false;
-            // for (String hitKey : queryResults.getKeywords()) {
-            //     for (ContentHit contentHit : queryResults.getResults(hitKey)) {
-            //         if (contentHit.getContent().equals(f)) {
-            //             hitFound = true;
-            //             if (contentHit.hasSnippet() && (KeywordSearchUtil.escapeLuceneQuery(hitKey) != null)) {
-            //                 setCommonProperty(resMap, CommonPropertyTypes.CONTEXT, contentHit.getSnippet());
-            //             }
-            //             break;
-            //         }
-            //     }
-            //     if (hitFound) {
-            //         break;
-            //     }
-            // }
-            if (f.getType() == TSK_DB_FILES_TYPE_ENUM.FS) {
-                AbstractFsContentNode.fillPropertyMap(resMap, (FsContent) f);
-            }
-            final String highlightQueryEscaped = getHighlightQuery(keywordSearchQuery, literal_query, queryResults, f);
-            tempList.add(new KeyValueQueryContent(f.getName(), resMap, ++resID, f, highlightQueryEscaped, keywordSearchQuery, queryResults));
+            final String highlightQueryEscaped = getHighlightQuery(keywordSearchQuery, keywordSearchQuery.isLiteral(), queryResults, file);
+            tempList.add(new KeyValueQueryContent(file.getName(), properties, ++id, file, highlightQueryEscaped, keywordSearchQuery, queryResults));
        }
 
        // Add all the nodes to toPopulate at once. Minimizes node creation
@@ -222,21 +198,36 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
        //cannot reuse snippet in BlackboardResultWriter
        //because for regex searches in UI we compress results by showing a file per regex once (even if multiple term hits)
        //whereas in bb we write every hit per file separately
-        new BlackboardResultWriter(queryResults, listName).execute();
+        new BlackboardResultWriter(queryResults, queryRequest.getQuery().getKeywordList().getName()).execute();
 
        return true;
    }
 
+    List<KeywordHit> getOneHitPerObject(QueryResults queryResults) {
+        List<KeywordHit> hits = new ArrayList<>();
+        Set<Long> uniqueObjectIds = new HashSet<>();
+        for (Keyword keyWord : queryResults.getKeywords()) {
+            for (KeywordHit hit : queryResults.getResults(keyWord)) {
+                long objectId = hit.getObjectId();
+                if (!uniqueObjectIds.contains(objectId)) {
+                    uniqueObjectIds.add(objectId);
+                    hits.add(hit);
+                }
+            }
+        }
+        return hits;
+    }
+
    /**
     * Return the string used to later have SOLR highlight the document with.
     *
     * @param query
     * @param literal_query
     * @param queryResults
-     * @param f
+     * @param file
     * @return
     */
-    private String getHighlightQuery(KeywordSearchQuery query, boolean literal_query, QueryResults queryResults, AbstractFile f) {
+    private String getHighlightQuery(KeywordSearchQuery query, boolean literal_query, QueryResults queryResults, AbstractFile file) {
        String highlightQueryEscaped;
        if (literal_query) {
            //literal, treat as non-regex, non-term component query
@@ -253,12 +244,10 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
        } else {
            //find terms for this file hit
            List<String> hitTerms = new ArrayList<>();
-            for (Keyword term : queryResults.getKeywords()) {
-                List<KeywordHit> hitList = queryResults.getResults(term);
-                for (KeywordHit h : hitList) {
-                    if (h.getContent().equals(f)) {
-                        hitTerms.add(term.toString());
+            for (Keyword keyword : queryResults.getKeywords()) {
+                for (KeywordHit hit : queryResults.getResults(keyword)) {
+                    if (hit.getFile().equals(file)) {
+                        hitTerms.add(keyword.toString());
                        break; //go to next term
                    }
                }
@@ -314,7 +303,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
        private QueryResults hits;
        private KeywordSearchQuery query;
-
        /**
         * NOTE Parameters are defined based on how they are currently used in
         * practice
@@ -353,7 +341,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
        }
    }
-
    /**
     * worker for writing results to bb, with progress bar, cancellation, and
     * central registry of workers to be stopped when case is closed
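
The getOneHitPerObject() helper added above keeps only the first hit encountered for each object id, which mirrors the "record first chunk encountered" behavior of the old QueryResults.getUniqueFiles(); QueryResults also gains a near-identical per-keyword variant in the next file. One possible tightening, not part of this commit, is to lean on Set.add(), which returns false when the element is already present; the sketch below assumes it lives in this class alongside the java.util imports added above.

    // Sketch only: same behavior as the helper above, with the separate
    // contains()/add() pair collapsed into a single Set.add() call whose
    // return value is false once an object id has already been seen. The
    // first KeywordHit encountered for each object id still wins.
    List<KeywordHit> getOneHitPerObject(QueryResults queryResults) {
        List<KeywordHit> hits = new ArrayList<>();
        Set<Long> seenObjectIds = new HashSet<>();
        for (Keyword keyword : queryResults.getKeywords()) {
            for (KeywordHit hit : queryResults.getResults(keyword)) {
                if (seenObjectIds.add(hit.getObjectId())) {
                    hits.add(hit);
                }
            }
        }
        return hits;
    }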

View File: org/sleuthkit/autopsy/keywordsearch/QueryResults.java

@@ -21,6 +21,7 @@ package org.sleuthkit.autopsy.keywordsearch;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -40,10 +41,12 @@ import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.BlackboardAttribute;
 
 /**
- * Stores the results from running a SOLR query (which could contain multiple keywords).
+ * Stores the results from running a SOLR query (which could contain multiple
+ * keywords).
  *
  */
 class QueryResults {
 
     private static final Logger logger = Logger.getLogger(QueryResults.class.getName());
     private KeywordSearchQuery keywordSearchQuery;
@@ -78,85 +81,49 @@ class QueryResults {
    }
 
    /**
-     * Get the unique set of files across all keywords in the results
-     * @param results
-     * @return
-     */
-    List<KeywordHit> getUniqueFiles() {
-        List<KeywordHit> uniqueHits = new ArrayList<>();
-        for (Keyword keyWord : getKeywords()) {
-            for (KeywordHit hit : getResults(keyWord)) {
-                AbstractFile abstractFile = hit.getContent();
-                //flatten, record first chunk encountered
-                if (!uniqueHits.containsKey(abstractFile)) {
-                    uniqueHits.put(abstractFile, hit);
-                }
-            }
-        }
-        return uniqueHits;
-    }
-
-    /**
-     * Get the unique set of files for a specific keyword
-     * @param keyword
-     * @return Map of Abstract files and the chunk with the first hit
-     */
-    Map<AbstractFile, Integer> getUniqueFiles(Keyword keyword) {
-        Map<AbstractFile, Integer> ret = new LinkedHashMap<>();
-        for (KeywordHit h : getResults(keyword)) {
-            AbstractFile f = h.getContent();
-            if (!ret.containsKey(f)) {
-                ret.put(f, h.getChunkId());
-            }
-        }
-        return ret;
-    }
-
-    /**
-     * Creates a blackboard artifacts for the hits. makes one artifact per keyword per file (i.e. if a keyword hits several times in teh file, only one artifact is created)
+     * Creates a blackboard artifacts for the hits. makes one artifact per
+     * keyword per file (i.e. if a keyword hits several times in the file, only
+     * one artifact is created)
+     *
     * @param listName
     * @param progress can be null
     * @param subProgress can be null
-     * @param notifyInbox flag indicating whether or not to call writeSingleFileInboxMessage() for each hit
+     * @param notifyInbox flag indicating whether or not to call
+     *                    writeSingleFileInboxMessage() for each hit
     * @return list of new artifactsPerFile
     */
-    public Collection<BlackboardArtifact> writeAllHitsToBlackBoard(ProgressHandle progress, ProgressContributor subProgress, SwingWorker<Object, Void> worker, boolean notifyInbox) {
+    Collection<BlackboardArtifact> writeAllHitsToBlackBoard(ProgressHandle progress, ProgressContributor subProgress, SwingWorker<Object, Void> worker, boolean notifyInbox) {
        final Collection<BlackboardArtifact> newArtifacts = new ArrayList<>();
        if (progress != null) {
            progress.start(getKeywords().size());
        }
        int unitProgress = 0;
-        for (final Keyword hitTerm : getKeywords()) {
+        for (final Keyword keyword : getKeywords()) {
            if (worker.isCancelled()) {
-                logger.log(Level.INFO, "Cancel detected, bailing before new keyword processed: {0}", hitTerm.getQuery()); //NON-NLS
+                logger.log(Level.INFO, "Cancel detected, bailing before new keyword processed: {0}", keyword.getQuery()); //NON-NLS
                break;
            }
 
            // Update progress object(s), if any
            if (progress != null) {
-                progress.progress(hitTerm.toString(), unitProgress);
+                progress.progress(keyword.toString(), unitProgress);
            }
            if (subProgress != null) {
-                String hitDisplayStr = hitTerm.getQuery();
+                String hitDisplayStr = keyword.getQuery();
                if (hitDisplayStr.length() > 50) {
                    hitDisplayStr = hitDisplayStr.substring(0, 49) + "...";
                }
                subProgress.progress(keywordList.getName() + ": " + hitDisplayStr, unitProgress);
            }
 
-            // this returns the unique files in the set with the first chunk that has a hit
-            Map<AbstractFile, Integer> flattened = getUniqueFiles(hitTerm);
-
-            for (AbstractFile hitFile : flattened.keySet()) {
-                String termString = hitTerm.getQuery();
-                int chunkId = flattened.get(hitFile);
+            for (KeywordHit hit : getOneHitPerObject(keyword)) {
+                String termString = keyword.getQuery();
+                int chunkId = hit.getChunkId();
                final String snippetQuery = KeywordSearchUtil.escapeLuceneQuery(termString);
                String snippet;
                try {
-                    snippet = LuceneQuery.querySnippet(snippetQuery, hitFile.getId(), chunkId, !keywordSearchQuery.isLiteral(), true);
+                    snippet = LuceneQuery.querySnippet(snippetQuery, hit.getObjectId(), chunkId, !keywordSearchQuery.isLiteral(), true); // RJCTODO: IS this right?
                } catch (NoOpenCoreException e) {
                    logger.log(Level.WARNING, "Error querying snippet: " + snippetQuery, e); //NON-NLS
                    //no reason to continue
@@ -166,15 +133,15 @@ class QueryResults {
                    continue;
                }
                if (snippet != null) {
-                    KeywordCachedArtifact writeResult = keywordSearchQuery.writeSingleFileHitsToBlackBoard(termString, hitFile, snippet, keywordList.getName());
+                    KeywordCachedArtifact writeResult = keywordSearchQuery.writeSingleFileHitsToBlackBoard(termString, hit.getFile(), snippet, keywordList.getName()); // RJCTODO: Probably not right
                    if (writeResult != null) {
                        newArtifacts.add(writeResult.getArtifact());
                        if (notifyInbox) {
-                            writeSingleFileInboxMessage(writeResult, hitFile);
+                            writeSingleFileInboxMessage(writeResult, hit.getFile()); // RJCTODO: Probably not right
                        }
                    } else {
-                        logger.log(Level.WARNING, "BB artifact for keyword hit not written, file: {0}, hit: {1}", new Object[]{hitFile, hitTerm.toString()}); //NON-NLS
+                        logger.log(Level.WARNING, "BB artifact for keyword hit not written, file: {0}, hit: {1}", new Object[]{hit.getFile(), keyword.toString()}); //NON-NLS
                    }
                }
            }
@@ -189,12 +156,32 @@ class QueryResults {
        return newArtifacts;
    }
 
+    /**
+     * RJCTODO: Update Get the unique set of files for a specific keyword
+     *
+     * @param keyword
+     * @return Map of Abstract files and the chunk with the first hit
+     */
+    private List<KeywordHit> getOneHitPerObject(Keyword keyword) {
+        List<KeywordHit> hits = new ArrayList<>();
+        Set<Long> uniqueObjectIds = new HashSet<>();
+        for (KeywordHit hit : getResults(keyword)) {
+            long objectId = hit.getObjectId();
+            if (!uniqueObjectIds.contains(objectId)) {
+                uniqueObjectIds.add(objectId);
+                hits.add(hit);
+            }
+        }
+        return hits;
+    }
+
    /**
     * Generate an ingest inbox message for given keyword in given file
+     *
     * @param written
     * @param hitFile
     */
-    public void writeSingleFileInboxMessage(KeywordCachedArtifact written, AbstractFile hitFile) {
+    private void writeSingleFileInboxMessage(KeywordCachedArtifact written, AbstractFile hitFile) {
        StringBuilder subjectSb = new StringBuilder();
        StringBuilder detailsSb = new StringBuilder();