From d90e671c3444aa4dabc286f8ca91a1949537d69f Mon Sep 17 00:00:00 2001
From: jmillman <jmillman@basistech.com>
Date: Mon, 25 Jul 2016 16:34:39 -0400
Subject: [PATCH] improve performance of highlighting by doing direct query
 rather than trying to reuse TermComponentQuery

---
 .../keywordsearch/HighlightedText.java        | 77 +++++++++++++------
 .../autopsy/keywordsearch/LuceneQuery.java    |  6 +-
 2 files changed, 56 insertions(+), 27 deletions(-)

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java
index 1d931d77e7..965df03ece 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java
@@ -19,11 +19,11 @@
 package org.sleuthkit.autopsy.keywordsearch;
 
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.TreeSet;
 import java.util.logging.Level;
 import java.util.stream.Collectors;
@@ -32,11 +32,11 @@ import org.apache.commons.lang.StringUtils;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrRequest.METHOD;
 import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
 import org.openide.util.NbBundle;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.Version;
 import org.sleuthkit.autopsy.datamodel.TextMarkupLookup;
-import org.sleuthkit.autopsy.keywordsearch.KeywordQueryFilter.FilterType;
 
 /**
  * Highlights hits for a given document. Knows about pages and such for the
@@ -145,29 +145,58 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
              */
             if (hits == null) {
 
-                String[] keywords = keywordHitQuery.split(" ");
-                for (String keywordString : keywords) {
-                    Keyword keyword = new Keyword(KeywordSearchUtil.escapeLuceneQuery(keywordString), !isRegex);
-                    KeywordSearchQuery chunksQuery = new TermComponentQuery(new KeywordList(Arrays.asList(keyword)), keyword);
-                    chunksQuery.setSubstringQuery();
-                    chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId));
-                    try {
-                        hits = chunksQuery.performQuery();
-                        //organize the hits by page, filter as needed
-                        for (Keyword k : hits.getKeywords()) {
-                            for (KeywordHit hit : hits.getResults(k)) {
-                                int chunkID = hit.getChunkId();
-                                if (chunkID != 0 && this.objectId == hit.getSolrObjectId()) {
-                                    pagesSorted.add(chunkID);
-                                }
-                            }
-                        }
-
-                    } catch (NoOpenCoreException ex) {
-                        logger.log(Level.INFO, "Could not get chunk info and get highlights", ex); //NON-NLS
-                        return;
-                    }
+                String highLightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
+                String query;
+                if (isRegex) {
+                    String[] keywords = keywordHitQuery.split(" ");
+                    query = Stream.of(keywords).map((String t) -> "/.*" + t + ".*/").collect(Collectors.joining(" "));
+                } else {
+                    query = keywordHitQuery;
                 }
+
+                SolrQuery q = new SolrQuery();
+                q.setShowDebugInfo(DEBUG); //debug
+                // input query has already been properly constructed and escaped
+                q.setQuery(highLightField + ":" + query);
+                q.setFields("id");
+                q.addFilterQuery(Server.Schema.ID.toString() + ":" + this.objectId + "_*");
+
+//                //tune the highlighter
+//                q.addHighlightField(highLightField); //for exact highlighting, try content_ws field (with stored="true" in Solr schema)
+//                q.setParam("hl.useFastVectorHighlighter", "true"); //fast highlighter scales better than standard one NON-NLS
+//                q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
+//                q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
+//                q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
+                //the docs say this makes sense for the original Highlighter only, but not really
+//                q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
+                try {
+                    QueryResponse response = solrServer.query(q, METHOD.POST);
+
+                    Set<SolrDocument> docs = LuceneQuery.filterOneHitPerDocument(response.getResults());
+                    for (SolrDocument resultDoc : docs) {
+                        final String solrDocumentId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString();
+                        /**
+                         * Parse the Solr document id to get the Solr object id
+                         * and chunk id. The Solr object id will either be a
+                         * file id or an artifact id from the case database.
+                         *
+                         * For every object (file or artifact) there will be at
+                         * least two Solr documents. One contains object
+                         * metadata (chunk #1) and the second and subsequent
+                         * documents contain chunks of the text.
+                         */
+                        final int separatorIndex = solrDocumentId.indexOf(Server.ID_CHUNK_SEP);
+                        if (-1 != separatorIndex) {
+                            pagesSorted.add(Integer.parseInt(solrDocumentId.substring(separatorIndex + 1)));
+                        } else {
+                            pagesSorted.add(0);
+                        }
+                    }
+
+                } catch (KeywordSearchModuleException | NoOpenCoreException | NumberFormatException ex) {
+                    logger.log(Level.WARNING, "Error executing Solr highlighting query: " + keywordHitQuery, ex); //NON-NLS
+                }
+
             } else {
                 for (Keyword k : hits.getKeywords()) {
                     for (KeywordHit hit : hits.getResults(k)) {
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java
index cda4f8692b..26313f5776 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java
@@ -27,7 +27,6 @@ import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.logging.Level;
-import org.sleuthkit.autopsy.coreutils.Logger;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrRequest.METHOD;
 import org.apache.solr.client.solrj.response.QueryResponse;
@@ -36,6 +35,7 @@ import org.apache.solr.common.SolrDocumentList;
 import org.openide.util.NbBundle;
 import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.coreutils.EscapeUtil;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
 import org.sleuthkit.autopsy.coreutils.Version;
 import org.sleuthkit.datamodel.BlackboardArtifact;
@@ -313,7 +313,7 @@ class LuceneQuery implements KeywordSearchQuery {
      *
      * @return
      */
-    private Set<SolrDocument> filterOneHitPerDocument(SolrDocumentList resultList) {
+    static Set<SolrDocument> filterOneHitPerDocument(SolrDocumentList resultList) {
         // sort the list so that we consistently pick the same chunk each time.
         // note this sort is doing a string comparison and not an integer comparison, so 
         // chunk 10 will be smaller than chunk 9. 
@@ -481,7 +481,7 @@ class LuceneQuery implements KeywordSearchQuery {
      * Compares SolrDocuments based on their ID's. Two SolrDocuments with
      * different chunk numbers are considered equal.
      */
-    private class SolrDocumentComparatorIgnoresChunkId implements Comparator<SolrDocument> {
+    static private class SolrDocumentComparatorIgnoresChunkId implements Comparator<SolrDocument> {
 
         @Override
         public int compare(SolrDocument left, SolrDocument right) {