From 671a2d1e680441e291337e77bd0bcc7a3c7e6ec4 Mon Sep 17 00:00:00 2001 From: Brian Carrier Date: Tue, 12 May 2015 13:10:54 -0400 Subject: [PATCH] sort list of docs based on ID before picking unique chunk --- .../autopsy/keywordsearch/LuceneQuery.java | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java index fb70256b4a..72e564c11b 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java @@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.keywordsearch; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Map; @@ -214,7 +215,7 @@ class LuceneQuery implements KeywordSearchQuery { Map>> highlightResponse = response.getHighlighting(); // get the unique set of files with hits - Set uniqueSolrDocumentsWithHits = filterDuplicateSolrDocuments(resultList); + Set uniqueSolrDocumentsWithHits = filterOneHitPerDocument(resultList); allMatchesFetched = start + MAX_RESULTS >= resultList.getNumFound(); @@ -305,7 +306,24 @@ class LuceneQuery implements KeywordSearchQuery { * @param resultList * @return */ - private Set filterDuplicateSolrDocuments(SolrDocumentList resultList) { + private Set filterOneHitPerDocument(SolrDocumentList resultList) { + // sort the list so that we consistently pick the same chunk each time. + // note this sort is doing a string comparison and not an integer comparison, so + // chunk 10 will be smaller than chunk 9. + Collections.sort(resultList, new Comparator() { + @Override + public int compare(SolrDocument left, SolrDocument right) { + // ID is in the form of ObjectId_Chunk + String leftID = left.getFieldValue(Server.Schema.ID.toString()).toString(); + String rightID = right.getFieldValue(Server.Schema.ID.toString()).toString(); + return leftID.compareTo(rightID); + } + }); + + // NOTE: We could probably just iterate through the list and compare each ID with the + // previous ID to get the unique documents faster than using this set now that the list + // is sorted. + Set solrDocumentsWithMatches = new TreeSet<>(new SolrDocumentComparatorIgnoresChunkId()); solrDocumentsWithMatches.addAll(resultList); return solrDocumentsWithMatches; @@ -464,24 +482,26 @@ class LuceneQuery implements KeywordSearchQuery { public int compare(SolrDocument left, SolrDocument right) { // ID is in the form of ObjectId_Chunk - String idName = Server.Schema.ID.toString(); + final String idName = Server.Schema.ID.toString(); + + // get object id of left doc String leftID = left.getFieldValue(idName).toString(); int index = leftID.indexOf(Server.ID_CHUNK_SEP); if (index != -1) { leftID = leftID.substring(0, index); } + // get object id of right doc String rightID = right.getFieldValue(idName).toString(); index = rightID.indexOf(Server.ID_CHUNK_SEP); if (index != -1) { rightID = rightID.substring(0, index); } - - if(Integer.parseInt(leftID) < Integer.parseInt(rightID)) - return -1; - if(Integer.parseInt(leftID) > Integer.parseInt(rightID)) - return 1; - return 0; + + Integer leftInt = new Integer(leftID); + Integer rightInt = new Integer(rightID); + return leftInt.compareTo(rightInt); } } + }