From d90e671c3444aa4dabc286f8ca91a1949537d69f Mon Sep 17 00:00:00 2001 From: jmillman Date: Mon, 25 Jul 2016 16:34:39 -0400 Subject: [PATCH] improve performance of highlighting by doing direct query rather than trying to reuse TermsComponentQuery --- .../keywordsearch/HighlightedText.java | 77 +++++++++++++------ .../autopsy/keywordsearch/LuceneQuery.java | 6 +- 2 files changed, 56 insertions(+), 27 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java index 1d931d77e7..965df03ece 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java @@ -19,11 +19,11 @@ package org.sleuthkit.autopsy.keywordsearch; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.TreeSet; import java.util.logging.Level; import java.util.stream.Collectors; @@ -32,11 +32,11 @@ import org.apache.commons.lang.StringUtils; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrRequest.METHOD; import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Version; import org.sleuthkit.autopsy.datamodel.TextMarkupLookup; -import org.sleuthkit.autopsy.keywordsearch.KeywordQueryFilter.FilterType; /** * Highlights hits for a given document. Knows about pages and such for the @@ -145,29 +145,58 @@ class HighlightedText implements IndexedText, TextMarkupLookup { */ if (hits == null) { - String[] keywords = keywordHitQuery.split(" "); - for (String keywordString : keywords) { - Keyword keyword = new Keyword(KeywordSearchUtil.escapeLuceneQuery(keywordString), !isRegex); - KeywordSearchQuery chunksQuery = new TermComponentQuery(new KeywordList(Arrays.asList(keyword)), keyword); - chunksQuery.setSubstringQuery(); - chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId)); - try { - hits = chunksQuery.performQuery(); - //organize the hits by page, filter as needed - for (Keyword k : hits.getKeywords()) { - for (KeywordHit hit : hits.getResults(k)) { - int chunkID = hit.getChunkId(); - if (chunkID != 0 && this.objectId == hit.getSolrObjectId()) { - pagesSorted.add(chunkID); - } - } - } - - } catch (NoOpenCoreException ex) { - logger.log(Level.INFO, "Could not get chunk info and get highlights", ex); //NON-NLS - return; - } + String highLightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX; + String query; + if (isRegex) { + String[] keywords = keywordHitQuery.split(" "); + query = Stream.of(keywords).map((String t) -> "/.*" + t + ".*/").collect(Collectors.joining(" ")); + } else { + query = keywordHitQuery; } + + SolrQuery q = new SolrQuery(); + q.setShowDebugInfo(DEBUG); //debug + // input query has already been properly constructed and escaped + q.setQuery(highLightField + ":" + query); + q.setFields("id"); + q.addFilterQuery(Server.Schema.ID.toString() + ":" + this.objectId + "_*"); + +// //tune the highlighter +// q.addHighlightField(highLightField); //for exact highlighting, try content_ws field (with stored="true" in Solr schema) +// q.setParam("hl.useFastVectorHighlighter", "true"); //fast highlighter scales better than standard one NON-NLS +// q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS +// q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS +// q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS + //docs says makes sense for the original Highlighter only, but not really +// q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS + try { + QueryResponse response = solrServer.query(q, METHOD.POST); + + Set docs = LuceneQuery.filterOneHitPerDocument(response.getResults()); + for (SolrDocument resultDoc : docs) { + final String solrDocumentId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString(); + /** + * Parse the Solr document id to get the Solr object id + * and chunk id. The Solr object id will either be a + * file id or an artifact id from the case database. + * + * For every object (file or artifact) there will at + * least two Solr documents. One contains object + * metadata (chunk #1) and the second and subsequent + * documents contain chunks of the text. + */ + final int separatorIndex = solrDocumentId.indexOf(Server.ID_CHUNK_SEP); + if (-1 != separatorIndex) { + pagesSorted.add(Integer.parseInt(solrDocumentId.substring(separatorIndex + 1))); + } else { + pagesSorted.add(0); + } + } + + } catch (KeywordSearchModuleException | NoOpenCoreException | NumberFormatException ex) { + logger.log(Level.WARNING, "Error executing Solr highlighting query: " + keywordHitQuery, ex); //NON-NLS + } + } else { for (Keyword k : hits.getKeywords()) { for (KeywordHit hit : hits.getResults(k)) { diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java index cda4f8692b..26313f5776 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java @@ -27,7 +27,6 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.logging.Level; -import org.sleuthkit.autopsy.coreutils.Logger; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrRequest.METHOD; import org.apache.solr.client.solrj.response.QueryResponse; @@ -36,6 +35,7 @@ import org.apache.solr.common.SolrDocumentList; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.casemodule.Case; import org.sleuthkit.autopsy.coreutils.EscapeUtil; +import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil; import org.sleuthkit.autopsy.coreutils.Version; import org.sleuthkit.datamodel.BlackboardArtifact; @@ -313,7 +313,7 @@ class LuceneQuery implements KeywordSearchQuery { * * @return */ - private Set filterOneHitPerDocument(SolrDocumentList resultList) { + static Set filterOneHitPerDocument(SolrDocumentList resultList) { // sort the list so that we consistently pick the same chunk each time. // note this sort is doing a string comparison and not an integer comparison, so // chunk 10 will be smaller than chunk 9. @@ -481,7 +481,7 @@ class LuceneQuery implements KeywordSearchQuery { * Compares SolrDocuments based on their ID's. Two SolrDocuments with * different chunk numbers are considered equal. */ - private class SolrDocumentComparatorIgnoresChunkId implements Comparator { + static private class SolrDocumentComparatorIgnoresChunkId implements Comparator { @Override public int compare(SolrDocument left, SolrDocument right) {