Only perform one query to Solr for hits and snippets

Jeff Wallace 2013-12-06 14:47:53 -05:00
parent 1f932aca22
commit ec24c347bc
3 changed files with 140 additions and 47 deletions
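The core of the change is in LuceneQuery.performLuceneQuery(): instead of fetching hit IDs in one Solr request and then running a separate snippet (highlight) query per file, the hit query itself now asks Solr for highlighting, so hits and snippets come back in a single round trip. A minimal stand-alone sketch of that pattern with the SolrJ client follows; the field names, server URL, and SNIPPET_LENGTH value are stand-ins chosen for illustration, not the module's actual Server.Schema constants.

import java.util.List;
import java.util.Map;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;

public class SingleQuerySnippetSketch {

    // Hypothetical stand-ins for Server.Schema.ID, Server.Schema.CONTENT, and SNIPPET_LENGTH.
    private static final String ID_FIELD = "id";
    private static final String CONTENT_FIELD = "content";
    private static final int SNIPPET_LENGTH = 50;

    public static void main(String[] args) throws SolrServerException {
        HttpSolrServer solr = new HttpSolrServer("http://localhost:8983/solr/autopsy"); // assumed URL

        SolrQuery q = new SolrQuery("keyword");
        // Request the document id and enable highlighting on the content field,
        // so the hit list and the preview snippets arrive in one response.
        // Highlighting is computed from the stored content; it does not need to be in fl.
        q.setFields(ID_FIELD);
        q.addHighlightField(CONTENT_FIELD);
        q.setHighlightSnippets(1);
        q.setHighlightFragsize(SNIPPET_LENGTH);

        QueryResponse response = solr.query(q);
        // getHighlighting() maps document id -> highlighted field -> snippet fragments.
        Map<String, Map<String, List<String>>> highlighting = response.getHighlighting();

        for (SolrDocument doc : response.getResults()) {
            String id = doc.getFieldValue(ID_FIELD).toString();
            Map<String, List<String>> perField = highlighting.get(id);
            String snippet = (perField == null || perField.get(CONTENT_FIELD) == null)
                    ? "" : perField.get(CONTENT_FIELD).get(0);
            System.out.println(id + " -> " + snippet);
        }
    }
}

The point is that response.getResults() and response.getHighlighting() are keyed by the same document id, so no second query is needed to attach a preview to each hit.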

ContentHit.java

@@ -31,6 +31,8 @@ public class ContentHit {
private AbstractFile content;
private int chunkID = 0;
private String snippet = "";
private boolean snippetSet = false;
ContentHit(AbstractFile content) {
this.content = content;
@@ -57,6 +59,20 @@ public class ContentHit {
return chunkID != 0;
}
ContentHit setSnippet(String snippet) {
this.snippet = snippet;
this.snippetSet = true;
return this;
}
String getSnippet() {
return snippet;
}
boolean hasSnippet() {
return snippetSet;
}
@Override
public boolean equals(Object obj) {
if (obj == null) {

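With the snippetSet flag added above, a ContentHit can carry its preview text from the query layer to the result view, and callers can tell "no snippet was produced" apart from an empty snippet string. A minimal usage sketch, assuming the two-argument ContentHit constructor used elsewhere in this commit; renderContextColumn() is a hypothetical placeholder for whatever the caller does with the text:

ContentHit hit = new ContentHit(resultAbstractFile, chunkId).setSnippet(snippet);

if (hit.hasSnippet()) {
    // Only fill the context/preview column when the query actually produced a snippet.
    renderContextColumn(hit.getSnippet()); // hypothetical display helper
}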
KeywordSearchResultFactory.java

@@ -245,7 +245,7 @@ public class KeywordSearchResultFactory extends ChildFactory<KeyValueQuery> {
logger.log(Level.WARNING, "Could not perform the query. ", ex);
return false;
}
final Map<AbstractFile, Integer> hitContents = ContentHit.flattenResults(tcqRes);
// final Map<AbstractFile, Integer> hitContents = ContentHit.flattenResults(tcqRes);
//get listname
String listName = "";
@@ -257,55 +257,64 @@ public class KeywordSearchResultFactory extends ChildFactory<KeyValueQuery> {
final boolean literal_query = tcq.isEscaped();
int resID = 0;
for (final AbstractFile f : hitContents.keySet()) {
final int previewChunk = hitContents.get(f);
// for (final AbstractFile f : hitContents.keySet()) {
for(ContentHit chit : tcqRes.get(tcq.getQueryString())) {
AbstractFile f = chit.getContent();
// final int previewChunk = hitContents.get(f);
//get unique match result files
Map<String, Object> resMap = new LinkedHashMap<>();
try {
String snippet;
String snippetQuery = null;
if (literal_query) {
snippetQuery = tcq.getEscapedQueryString();
} else {
//in regex, to generate the preview snippet
//just pick any term that hit that file (since we are compressing result view)
String hit = null;
//find the first hit for this file
for (String hitKey : tcqRes.keySet()) {
List<ContentHit> chits = tcqRes.get(hitKey);
for (ContentHit chit : chits) {
if (chit.getContent().equals(f)) {
hit = hitKey;
break;
}
}
if (hit != null) {
break;
}
}
if (hit != null) {
snippetQuery = KeywordSearchUtil.escapeLuceneQuery(hit);
}
}
if (snippetQuery != null) {
snippet = LuceneQuery.querySnippet(snippetQuery, f.getId(), previewChunk, !literal_query, true);
setCommonProperty(resMap, CommonPropertyTypes.CONTEXT, snippet);
}
} catch (NoOpenCoreException ex) {
logger.log(Level.WARNING, "Could not perform the snippet query. ", ex);
return false;
if (chit.hasSnippet()) {
setCommonProperty(resMap, CommonPropertyTypes.CONTEXT, chit.getSnippet());
}
// if (f.getSize() < 10000000) {
// try {
// String snippet;
//
// String snippetQuery = null;
//
// if (literal_query) {
// snippetQuery = tcq.getEscapedQueryString();
// } else {
// //in regex, to generate the preview snippet
// //just pick any term that hit that file (since we are compressing result view)
// String hit = null;
// //find the first hit for this file
// for (String hitKey : tcqRes.keySet()) {
// List<ContentHit> chits = tcqRes.get(hitKey);
// for (ContentHit chit : chits) {
// if (chit.getContent().equals(f)) {
// hit = hitKey;
// break;
// }
// }
// if (hit != null) {
// break;
// }
// }
// if (hit != null) {
// snippetQuery = KeywordSearchUtil.escapeLuceneQuery(hit);
// }
// }
//
// if (snippetQuery != null) {
// snippet = LuceneQuery.querySnippet(snippetQuery, f.getId(), previewChunk, !literal_query, true);
// setCommonProperty(resMap, CommonPropertyTypes.CONTEXT, snippet);
// }
// } catch (NoOpenCoreException ex) {
// logger.log(Level.WARNING, "Could not perform the snippet query. ", ex);
// return false;
// }
// }
if (f.getType() == TSK_DB_FILES_TYPE_ENUM.FS) {
AbstractFsContentNode.fillPropertyMap(resMap, (FsContent) f);
}
final String highlightQueryEscaped = getHighlightQuery(tcq, literal_query, tcqRes, f);
tempList.add(new KeyValueQueryContent(f.getName(), resMap, ++resID, f, highlightQueryEscaped, tcq, previewChunk, tcqRes));
// tempList.add(new KeyValueQueryContent(f.getName(), resMap, ++resID, f, highlightQueryEscaped, tcq, previewChunk, tcqRes));
tempList.add(new KeyValueQueryContent(f.getName(), resMap, ++resID, f, highlightQueryEscaped, tcq, chit.getChunkId(), tcqRes));
}
// Add all the nodes to toPopulate at once. Minimizes node creation

LuceneQuery.java

@@ -20,9 +20,12 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.apache.solr.client.solrj.SolrQuery;
@@ -129,7 +132,7 @@ public class LuceneQuery implements KeywordSearchQuery {
public Map<String, List<ContentHit>> performQuery() throws NoOpenCoreException {
Map<String, List<ContentHit>> results = new HashMap<String, List<ContentHit>>();
//in case of single term literal query there is only 1 term
results.put(keywordString, performLuceneQuery());
results.put(keywordString, performLuceneQuery(false));
return results;
}
@@ -192,7 +195,7 @@ public class LuceneQuery implements KeywordSearchQuery {
* @return list of ContentHit objects
* @throws NoOpenCoreException
*/
private List<ContentHit> performLuceneQuery() throws NoOpenCoreException {
private List<ContentHit> performLuceneQuery(boolean snippets) throws NoOpenCoreException {
List<ContentHit> matches = new ArrayList<ContentHit>();
@@ -215,10 +218,35 @@ public class LuceneQuery implements KeywordSearchQuery {
q.setQuery(theQueryStr);
q.setRows(MAX_RESULTS);
q.setFields(Server.Schema.ID.toString());
if (snippets) {
q.setFields(Server.Schema.ID.toString(), Server.Schema.CONTENT.toString());
} else {
q.setFields(Server.Schema.ID.toString());
}
for (KeywordQueryFilter filter : filters) {
q.addFilterQuery(filter.toString());
}
if (snippets) {
q.addHighlightField(Server.Schema.CONTENT.toString());
//q.setHighlightSimplePre("&laquo;"); //original highlighter only
//q.setHighlightSimplePost("&raquo;"); //original highlighter only
q.setHighlightSnippets(1);
q.setHighlightFragsize(SNIPPET_LENGTH);
//tune the highlighter
q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one
q.setParam("hl.tag.pre", "&laquo;"); //makes sense for FastVectorHighlighter only
q.setParam("hl.tag.post", "&laquo;"); //makes sense for FastVectorHighlighter only
q.setParam("hl.fragListBuilder", "simple"); //makes sense for FastVectorHighlighter only
//Solr bug if fragCharSize is smaller than Query string, StringIndexOutOfBoundsException is thrown.
q.setParam("hl.fragCharSize", Integer.toString(theQueryStr.length())); //makes sense for FastVectorHighlighter only
//docs says makes sense for the original Highlighter only, but not really
//analyze all content SLOW! consider lowering
q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED);
}
for (int start = 0; !allMatchesFetched; start = start + MAX_RESULTS) {
q.setStart(start);
@@ -226,8 +254,31 @@ public class LuceneQuery implements KeywordSearchQuery {
try {
QueryResponse response = solrServer.query(q, METHOD.POST);
SolrDocumentList resultList = response.getResults();
Map<String, Map<String, List<String>>> highlightResponse = response.getHighlighting();
long results = resultList.getNumFound();
Set<SolrDocument> solrDocumentsWithMatches = new TreeSet<>(
new Comparator<SolrDocument>() {
@Override
public int compare(SolrDocument left, SolrDocument right) {
String idName = Server.Schema.ID.toString();
String leftID = left.getFieldValue(idName).toString();
int index = leftID.indexOf(Server.ID_CHUNK_SEP);
if (index != -1) {
leftID = leftID.substring(0, index);
}
String rightID = right.getFieldValue(idName).toString();
index = rightID.indexOf(Server.ID_CHUNK_SEP);
if (index != -1) {
rightID = rightID.substring(0, index);
}
return leftID.compareTo(rightID);
}
});
solrDocumentsWithMatches.addAll(resultList);
allMatchesFetched = start + MAX_RESULTS >= results;
SleuthkitCase sc = null;
try {
sc = Case.getCurrentCase().getSleuthkitCase();
@@ -236,11 +287,21 @@ public class LuceneQuery implements KeywordSearchQuery {
return matches;
}
for (SolrDocument resultDoc : resultList) {
for (SolrDocument resultDoc : solrDocumentsWithMatches) {
final String resultID = (String) resultDoc.getFieldValue(Server.Schema.ID.toString());
final int sepIndex = resultID.indexOf(Server.ID_CHUNK_SEP);
String snippet = "";
if (snippets) {
try {
snippet = highlightResponse.get(resultID).get(Server.Schema.CONTENT.toString()).get(0);
snippet = EscapeUtil.unEscapeHtml(snippet).trim();
} catch (NullPointerException ex) {
snippet = "";
}
}
if (sepIndex != -1) {
//file chunk result
final long fileID = Long.parseLong(resultID.substring(0, sepIndex));
@@ -249,8 +310,11 @@ public class LuceneQuery implements KeywordSearchQuery {
try {
AbstractFile resultAbstractFile = sc.getAbstractFileById(fileID);
matches.add(new ContentHit(resultAbstractFile, chunkId));
ContentHit chit = new ContentHit(resultAbstractFile, chunkId);
if (snippet.isEmpty() == false) {
chit.setSnippet(snippet);
}
matches.add(chit);
} catch (TskException ex) {
logger.log(Level.WARNING, "Could not get the AbstractFile for keyword hit, ", ex);
//something wrong with case/db
@@ -262,7 +326,11 @@ public class LuceneQuery implements KeywordSearchQuery {
try {
AbstractFile resultAbstractFile = sc.getAbstractFileById(fileID);
matches.add(new ContentHit(resultAbstractFile));
ContentHit chit = new ContentHit(resultAbstractFile);
if (snippet.isEmpty() == false) {
chit.setSnippet(snippet);
}
matches.add(chit);
} catch (TskException ex) {
logger.log(Level.WARNING, "Could not get the AbstractFile for keyword hit, ", ex);
//something wrong with case/db
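The snippet extraction above falls back to an empty string by catching NullPointerException. An equivalent, more explicit sketch that checks each level of the highlighting map instead of relying on the exception (the identifiers mirror the ones used in this file; an illustration, not the committed code):

String snippet = "";
if (snippets && highlightResponse != null) {
    Map<String, List<String>> perField = highlightResponse.get(resultID);
    if (perField != null) {
        List<String> fragments = perField.get(Server.Schema.CONTENT.toString());
        if (fragments != null && !fragments.isEmpty()) {
            // Solr returns the fragment HTML-escaped; un-escape and trim before display.
            snippet = EscapeUtil.unEscapeHtml(fragments.get(0)).trim();
        }
    }
}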