Use a much simpler approach that also works for hits that are not single terms in the Solr index.

This commit is contained in:
millmanorama 2017-01-23 22:28:43 +01:00
parent bcb97180c4
commit c616b45b33

View File

@ -30,7 +30,6 @@ import org.apache.solr.client.solrj.SolrRequest.METHOD;
import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.sleuthkit.autopsy.coreutils.EscapeUtil; import org.sleuthkit.autopsy.coreutils.EscapeUtil;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.Version; import org.sleuthkit.autopsy.coreutils.Version;
@ -203,11 +202,11 @@ class LuceneQuery implements KeywordSearchQuery {
response = solrServer.query(q, METHOD.POST); response = solrServer.query(q, METHOD.POST);
resultList = response.getResults(); resultList = response.getResults();
SimpleOrderedMap<?> termVectors = (SimpleOrderedMap) response.getResponse().get("termVectors");
// objectId_chunk -> "text" -> List of previews // objectId_chunk -> "text" -> List of previews
highlightResponse = response.getHighlighting(); highlightResponse = response.getHighlighting();
final String strippedQueryString = StringUtils.strip(getQueryString(), "\"");
// cycle through results in sets of MAX_RESULTS // cycle through results in sets of MAX_RESULTS
for (int start = 0; !allMatchesFetched; start = start + MAX_RESULTS) { for (int start = 0; !allMatchesFetched; start = start + MAX_RESULTS) {
q.setStart(start); q.setStart(start);
@ -222,30 +221,20 @@ class LuceneQuery implements KeywordSearchQuery {
* will get picked up in the next one. */ * will get picked up in the next one. */
final String docId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString(); final String docId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString();
final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString()); final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
final String content_str = resultDoc.get(Server.Schema.CONTENT_STR.toString()).toString();
Integer startOffset = getFirstOffset(termVectors, docId); Integer firstOccurence = content_str.indexOf(strippedQueryString);
if (startOffset < chunkSize) { if (firstOccurence < chunkSize) {
matches.add(createKeywordtHit(highlightResponse, docId)); matches.add(createKeywordtHit(highlightResponse, docId));
} }
} catch (TskException ex) { } catch (TskException ex) {
return matches; return matches;
} }
} }
} }
return matches; return matches;
} }
/**
 * Looks up the start offset stored in a term-vector structure for the query
 * term in one document.
 *
 * The lookup path is: termVectors -> docId -> field name -> term ->
 * "offsets" -> "start". The field name is the {@code field} member when it
 * is non-null, otherwise {@code Server.Schema.TEXT}. The term key is the
 * query string, lower-cased, with surrounding double quotes stripped.
 *
 * NOTE(review): no level of the lookup is null-checked. Presumably a missing
 * entry (e.g. a query that is not stored as a single term) makes an
 * intermediate lookup return null and this method throw a
 * NullPointerException -- confirm against the Solr response format.
 *
 * @param termVectors term-vector map keyed by document id
 * @param docId       id of the document whose term vector is inspected
 *
 * @return the value stored under "start" in the term's "offsets" entry
 */
private Integer getFirstOffset(SimpleOrderedMap<?> termVectors, final String docId) {
    SimpleOrderedMap<?> docTermVector = (SimpleOrderedMap<?>) termVectors.get(docId);
    SimpleOrderedMap<?> fieldVector = (field == null)
            ? (SimpleOrderedMap<?>) docTermVector.get(Server.Schema.TEXT.toString())
            : (SimpleOrderedMap<?>) docTermVector.get(field);

    // Lower-case with Locale.ROOT so the lookup key does not depend on the
    // JVM's default locale (e.g. Turkish maps 'I' to a dotless 'i', which
    // would produce a key that never matches).
    final String termKey = StringUtils.strip(getQueryString().toLowerCase(java.util.Locale.ROOT), "\"");

    SimpleOrderedMap<?> termInfo = (SimpleOrderedMap<?>) fieldVector.get(termKey);
    SimpleOrderedMap<?> offsets = (SimpleOrderedMap<?>) termInfo.get("offsets");
    return (Integer) offsets.get("start");
}
/** /**
* Create the query object for the stored keyword * Create the query object for the stored keyword
* *
@ -270,25 +259,14 @@ class LuceneQuery implements KeywordSearchQuery {
q.setQuery(theQueryStr); q.setQuery(theQueryStr);
q.setRows(MAX_RESULTS); q.setRows(MAX_RESULTS);
q.setFields(Server.Schema.ID.toString(),
Server.Schema.CHUNK_SIZE.toString(),
Server.Schema.CONTENT_STR.toString());
q.addSort(Server.Schema.ID.toString(), SolrQuery.ORDER.asc); q.addSort(Server.Schema.ID.toString(), SolrQuery.ORDER.asc);
for (KeywordQueryFilter filter : filters) { for (KeywordQueryFilter filter : filters) {
q.addFilterQuery(filter.toString()); q.addFilterQuery(filter.toString());
} }
//use Term Vector Request Handler to get term vectors with offsets.
//They are used to exclude hits that start inside the chunk window.
q.setRequestHandler("/tvrh");
q.setParam("tv", true);
q.setParam("tv.df", false);
q.setParam("tv.offsets", true);
q.setParam("tv.positions", false);
q.setParam("tv.payloads", false);
q.setParam("tv.tf", false);
q.setParam("tv.tf_idf", false);
if (field != null) {
q.setParam("tv.fl", field);
}
if (snippets) { if (snippets) {
q.addHighlightField(Server.Schema.TEXT.toString()); q.addHighlightField(Server.Schema.TEXT.toString());
//q.setHighlightSimplePre("&laquo;"); //original highlighter only //q.setHighlightSimplePre("&laquo;"); //original highlighter only