mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-17 18:17:43 +00:00
Improve performance of highlighting by doing a direct query rather than trying to reuse TermComponentQuery
This commit is contained in:
parent
e3615c75ce
commit
d90e671c34
@ -19,11 +19,11 @@
|
|||||||
package org.sleuthkit.autopsy.keywordsearch;
|
package org.sleuthkit.autopsy.keywordsearch;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
@ -32,11 +32,11 @@ import org.apache.commons.lang.StringUtils;
|
|||||||
import org.apache.solr.client.solrj.SolrQuery;
|
import org.apache.solr.client.solrj.SolrQuery;
|
||||||
import org.apache.solr.client.solrj.SolrRequest.METHOD;
|
import org.apache.solr.client.solrj.SolrRequest.METHOD;
|
||||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||||
|
import org.apache.solr.common.SolrDocument;
|
||||||
import org.openide.util.NbBundle;
|
import org.openide.util.NbBundle;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
import org.sleuthkit.autopsy.coreutils.Version;
|
import org.sleuthkit.autopsy.coreutils.Version;
|
||||||
import org.sleuthkit.autopsy.datamodel.TextMarkupLookup;
|
import org.sleuthkit.autopsy.datamodel.TextMarkupLookup;
|
||||||
import org.sleuthkit.autopsy.keywordsearch.KeywordQueryFilter.FilterType;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Highlights hits for a given document. Knows about pages and such for the
|
* Highlights hits for a given document. Knows about pages and such for the
|
||||||
@ -145,29 +145,58 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
|
|||||||
*/
|
*/
|
||||||
if (hits == null) {
|
if (hits == null) {
|
||||||
|
|
||||||
String[] keywords = keywordHitQuery.split(" ");
|
String highLightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
|
||||||
for (String keywordString : keywords) {
|
String query;
|
||||||
Keyword keyword = new Keyword(KeywordSearchUtil.escapeLuceneQuery(keywordString), !isRegex);
|
if (isRegex) {
|
||||||
KeywordSearchQuery chunksQuery = new TermComponentQuery(new KeywordList(Arrays.asList(keyword)), keyword);
|
String[] keywords = keywordHitQuery.split(" ");
|
||||||
chunksQuery.setSubstringQuery();
|
query = Stream.of(keywords).map((String t) -> "/.*" + t + ".*/").collect(Collectors.joining(" "));
|
||||||
chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId));
|
} else {
|
||||||
try {
|
query = keywordHitQuery;
|
||||||
hits = chunksQuery.performQuery();
|
|
||||||
//organize the hits by page, filter as needed
|
|
||||||
for (Keyword k : hits.getKeywords()) {
|
|
||||||
for (KeywordHit hit : hits.getResults(k)) {
|
|
||||||
int chunkID = hit.getChunkId();
|
|
||||||
if (chunkID != 0 && this.objectId == hit.getSolrObjectId()) {
|
|
||||||
pagesSorted.add(chunkID);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} catch (NoOpenCoreException ex) {
|
|
||||||
logger.log(Level.INFO, "Could not get chunk info and get highlights", ex); //NON-NLS
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SolrQuery q = new SolrQuery();
|
||||||
|
q.setShowDebugInfo(DEBUG); //debug
|
||||||
|
// input query has already been properly constructed and escaped
|
||||||
|
q.setQuery(highLightField + ":" + query);
|
||||||
|
q.setFields("id");
|
||||||
|
q.addFilterQuery(Server.Schema.ID.toString() + ":" + this.objectId + "_*");
|
||||||
|
|
||||||
|
// //tune the highlighter
|
||||||
|
// q.addHighlightField(highLightField); //for exact highlighting, try content_ws field (with stored="true" in Solr schema)
|
||||||
|
// q.setParam("hl.useFastVectorHighlighter", "true"); //fast highlighter scales better than standard one NON-NLS
|
||||||
|
// q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
|
||||||
|
// q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
|
||||||
|
// q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
|
||||||
|
//docs say this makes sense for the original Highlighter only, but not really
|
||||||
|
// q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
|
||||||
|
try {
|
||||||
|
QueryResponse response = solrServer.query(q, METHOD.POST);
|
||||||
|
|
||||||
|
Set<SolrDocument> docs = LuceneQuery.filterOneHitPerDocument(response.getResults());
|
||||||
|
for (SolrDocument resultDoc : docs) {
|
||||||
|
final String solrDocumentId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString();
|
||||||
|
/**
|
||||||
|
* Parse the Solr document id to get the Solr object id
|
||||||
|
* and chunk id. The Solr object id will either be a
|
||||||
|
* file id or an artifact id from the case database.
|
||||||
|
*
|
||||||
|
* For every object (file or artifact) there will be at
|
||||||
|
* least two Solr documents. One contains object
|
||||||
|
* metadata (chunk #1) and the second and subsequent
|
||||||
|
* documents contain chunks of the text.
|
||||||
|
*/
|
||||||
|
final int separatorIndex = solrDocumentId.indexOf(Server.ID_CHUNK_SEP);
|
||||||
|
if (-1 != separatorIndex) {
|
||||||
|
pagesSorted.add(Integer.parseInt(solrDocumentId.substring(separatorIndex + 1)));
|
||||||
|
} else {
|
||||||
|
pagesSorted.add(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (KeywordSearchModuleException | NoOpenCoreException | NumberFormatException ex) {
|
||||||
|
logger.log(Level.WARNING, "Error executing Solr highlighting query: " + keywordHitQuery, ex); //NON-NLS
|
||||||
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
for (Keyword k : hits.getKeywords()) {
|
for (Keyword k : hits.getKeywords()) {
|
||||||
for (KeywordHit hit : hits.getResults(k)) {
|
for (KeywordHit hit : hits.getResults(k)) {
|
||||||
|
@ -27,7 +27,6 @@ import java.util.Map;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
|
||||||
import org.apache.solr.client.solrj.SolrQuery;
|
import org.apache.solr.client.solrj.SolrQuery;
|
||||||
import org.apache.solr.client.solrj.SolrRequest.METHOD;
|
import org.apache.solr.client.solrj.SolrRequest.METHOD;
|
||||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||||
@ -36,6 +35,7 @@ import org.apache.solr.common.SolrDocumentList;
|
|||||||
import org.openide.util.NbBundle;
|
import org.openide.util.NbBundle;
|
||||||
import org.sleuthkit.autopsy.casemodule.Case;
|
import org.sleuthkit.autopsy.casemodule.Case;
|
||||||
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
|
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
|
||||||
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
|
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
|
||||||
import org.sleuthkit.autopsy.coreutils.Version;
|
import org.sleuthkit.autopsy.coreutils.Version;
|
||||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||||
@ -313,7 +313,7 @@ class LuceneQuery implements KeywordSearchQuery {
|
|||||||
*
|
*
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
private Set<SolrDocument> filterOneHitPerDocument(SolrDocumentList resultList) {
|
static Set<SolrDocument> filterOneHitPerDocument(SolrDocumentList resultList) {
|
||||||
// sort the list so that we consistently pick the same chunk each time.
|
// sort the list so that we consistently pick the same chunk each time.
|
||||||
// note this sort is doing a string comparison and not an integer comparison, so
|
// note this sort is doing a string comparison and not an integer comparison, so
|
||||||
// chunk 10 will be smaller than chunk 9.
|
// chunk 10 will be smaller than chunk 9.
|
||||||
@ -481,7 +481,7 @@ class LuceneQuery implements KeywordSearchQuery {
|
|||||||
* Compares SolrDocuments based on their ID's. Two SolrDocuments with
|
* Compares SolrDocuments based on their ID's. Two SolrDocuments with
|
||||||
* different chunk numbers are considered equal.
|
* different chunk numbers are considered equal.
|
||||||
*/
|
*/
|
||||||
private class SolrDocumentComparatorIgnoresChunkId implements Comparator<SolrDocument> {
|
static private class SolrDocumentComparatorIgnoresChunkId implements Comparator<SolrDocument> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(SolrDocument left, SolrDocument right) {
|
public int compare(SolrDocument left, SolrDocument right) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user