finish implementing all the cases for highlighting

This commit is contained in:
millmanorama 2017-02-15 13:48:43 +01:00
parent edd03a66c1
commit 6abe26d6de
5 changed files with 102 additions and 156 deletions

View File: ExtractedContentViewer.java

@@ -110,8 +110,7 @@ public class ExtractedContentViewer implements DataContentViewer {
BlackboardArtifact artifact = nodeLookup.lookup(BlackboardArtifact.class);
if (hits != null) {
highlightedHitText = new HighlightedText(content.getId(), hits);
} else {
if (artifact != null && artifact.getArtifactTypeID()
} else if (artifact != null && artifact.getArtifactTypeID()
== BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT.getTypeID()) {
// if the artifact is an account artifact, get the account text.
highlightedHitText = getAccountsText(content, nodeLookup);
@@ -119,7 +118,6 @@ public class ExtractedContentViewer implements DataContentViewer {
== BlackboardArtifact.ARTIFACT_TYPE.TSK_KEYWORD_HIT.getTypeID()) {
highlightedHitText = new HighlightedText(artifact);
}
}
if (highlightedHitText != null) {
indexedTextSources.add(highlightedHitText);
}
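Taken together, the restructured branch selects an IndexedText source in priority order: explicit ad-hoc search hits first, then an account artifact, then a keyword-hit artifact. A consolidated sketch of that control flow; the method name, parameter types, and the static imports of the artifact-type constants are assumptions, and getAccountsText is the viewer's own helper as shown in the hunk:

import org.openide.util.Lookup;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
import static org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT;
import static org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE.TSK_KEYWORD_HIT;

// Sketch: pick the highlighted-text source for a node, in priority order.
private IndexedText selectTextSource(Content content, Lookup nodeLookup,
        QueryResults hits, BlackboardArtifact artifact) throws TskCoreException {
    if (hits != null) {
        // ad-hoc search results were placed in the node's lookup directly
        return new HighlightedText(content.getId(), hits);
    }
    if (artifact != null
            && artifact.getArtifactTypeID() == TSK_ACCOUNT.getTypeID()) {
        // account artifacts (e.g. credit card hits) carry their own text
        return getAccountsText(content, nodeLookup);
    }
    if (artifact != null
            && artifact.getArtifactTypeID() == TSK_KEYWORD_HIT.getTypeID()) {
        // saved keyword hits rebuild their highlights from the artifact
        return new HighlightedText(artifact);
    }
    return null;
}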
@@ -298,16 +296,16 @@ public class ExtractedContentViewer implements DataContentViewer {
return false;
}
/**
* Is there any marked up indexed text in the look up of this node? This
* will be the case if the node is for a keyword hit artifact produced
* by either an ad hoc keyword search result (keyword search toolbar
* widgets) or a keyword search by the keyword search ingest module.
*/
Collection<? extends IndexedText> sources = node.getLookup().lookupAll(IndexedText.class);
if (sources.isEmpty() == false) {
return true;
}
// /**
// * Is there any marked up indexed text in the look up of this node? This
// * will be the case if the node is for a keyword hit artifact produced
// * by either an ad hoc keyword search result (keyword search toolbar
// * widgets) or a keyword search by the keyword search ingest module.
// */
// Collection<? extends IndexedText> sources = node.getLookup().lookupAll(IndexedText.class);
// if (sources.isEmpty() == false) {
// return true;
// }
/*
* Is there a credit card artifact in the lookup
@@ -315,7 +313,9 @@ public class ExtractedContentViewer implements DataContentViewer {
Collection<? extends BlackboardArtifact> artifacts = node.getLookup().lookupAll(BlackboardArtifact.class);
if (artifacts != null) {
for (BlackboardArtifact art : artifacts) {
if (art.getArtifactTypeID() == BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT.getTypeID()) {
final int artifactTypeID = art.getArtifactTypeID();
if (artifactTypeID == TSK_ACCOUNT.getTypeID()
|| artifactTypeID == TSK_KEYWORD_HIT.getTypeID()) {
return true;
}
}
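The support check mirrors that dispatch: a node is viewable when its lookup contains either artifact type. A minimal sketch, again assuming static imports of the artifact-type constants:

import java.util.Collection;
import org.openide.nodes.Node;
import org.sleuthkit.datamodel.BlackboardArtifact;
import static org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT;
import static org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE.TSK_KEYWORD_HIT;

// Sketch: a node is supported if its lookup holds an account or
// keyword-hit artifact, matching the new else-if chain in the viewer.
static boolean hasHighlightableArtifact(Node node) {
    Collection<? extends BlackboardArtifact> artifacts =
            node.getLookup().lookupAll(BlackboardArtifact.class);
    for (BlackboardArtifact art : artifacts) {
        final int artifactTypeID = art.getArtifactTypeID();
        if (artifactTypeID == TSK_ACCOUNT.getTypeID()
                || artifactTypeID == TSK_KEYWORD_HIT.getTypeID()) {
            return true;
        }
    }
    return false;
}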

View File: HighlightedText.java

@@ -18,7 +18,6 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import com.ibm.icu.text.UnicodeSet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@@ -28,7 +27,6 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.logging.Level;
import java.util.stream.Collectors;
import org.apache.commons.lang.StringEscapeUtils;
@@ -40,14 +38,13 @@ import org.apache.solr.common.SolrDocumentList;
import org.openide.util.Exceptions;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.coreutils.Version;
import org.sleuthkit.autopsy.keywordsearch.KeywordQueryFilter.FilterType;
import org.sleuthkit.autopsy.keywordsearch.KeywordSearch.QueryType;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
/**
@@ -89,6 +86,7 @@ class HighlightedText implements IndexedText {
private boolean isPageInfoLoaded = false;
private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
private BlackboardArtifact artifact;
private KeywordSearch.QueryType qt;
/**
* This constructor is used when keyword hits are accessed from the ad-hoc
@@ -123,16 +121,24 @@ class HighlightedText implements IndexedText {
}
private void loadPageInfoFromArtifact() throws TskCoreException, NumberFormatException {
final String keyword = artifact.getAttribute(TSK_KEYWORD).getValueString();
this.keywords.add(keyword);
KeywordSearch.QueryType qt = KeywordSearch.QueryType.values()[artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE).getValueInt()];
this.keywords.add(artifact.getAttribute(TSK_KEYWORD).getValueString());
String chunkIDsString = artifact.getAttribute(TSK_KEYWORD_HIT_DOCUMENT_IDS).getValueString();
final BlackboardAttribute qtAttribute = artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE);
qt = (qtAttribute != null)
? KeywordSearch.QueryType.values()[qtAttribute.getValueInt()] : null;
final BlackboardAttribute docIDsArtifact = artifact.getAttribute(TSK_KEYWORD_HIT_DOCUMENT_IDS);
if (qt == QueryType.REGEX && docIDsArtifact != null) {
//regex searches record the chunks in the artifact
String chunkIDsString = docIDsArtifact.getValueString();
Set<String> chunkIDs = Arrays.stream(chunkIDsString.split(",")).map(StringUtils::strip).collect(Collectors.toSet());
for (String solrDocumentId : chunkIDs) {
int chunkID;
final int separatorIndex = solrDocumentId.indexOf(Server.CHUNK_ID_SEPARATOR);
if (-1 != separatorIndex) {
chunkID = Integer.parseInt(solrDocumentId.substring(separatorIndex + 1));
} else {
@@ -142,6 +148,25 @@ class HighlightedText implements IndexedText {
numberOfHitsPerPage.put(chunkID, 0);
currentHitPerPage.put(chunkID, 0);
}
this.currentPage = pages.stream().sorted().findFirst().orElse(1);
isPageInfoLoaded = true;
} else {
/*
* non-regex searches don't record the chunks in the artifacts, so
* we need to look them up
*/
Keyword keywordQuery = new Keyword(keyword, true);
KeywordSearchQuery chunksQuery
= new LuceneQuery(new KeywordList(Arrays.asList(keywordQuery)), keywordQuery);
chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId));
try {
hits = chunksQuery.performQuery();
loadPageInfoFromHits();
} catch (KeywordSearchModuleException | NoOpenCoreException ex) {
logger.log(Level.SEVERE, "Could not perform the query to get chunk info and get highlights:" + keywordQuery.getSearchTerm(), ex); //NON-NLS
MessageNotifyUtil.Notify.error(Bundle.HighlightedText_query_exception_msg() + keywordQuery.getSearchTerm(), ex.getCause().getMessage());
}
}
}
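The document IDs stored in TSK_KEYWORD_HIT_DOCUMENT_IDS encode the chunk number after Server.CHUNK_ID_SEPARATOR. A standalone sketch of the parsing step; the hunk cuts off the branch for IDs without a separator, so mapping a bare ID to the single-page parent document is an assumption:

// Sketch: split a Solr document ID of the form "<objectId><sep><chunkId>".
static int parseChunkId(String solrDocumentId, String chunkIdSeparator) {
    final int separatorIndex = solrDocumentId.indexOf(chunkIdSeparator);
    if (separatorIndex == -1) {
        // Assumption: a bare object ID refers to the un-chunked parent
        // document, which this class treats as a single page 1 elsewhere.
        return 1;
    }
    return Integer.parseInt(solrDocumentId.substring(separatorIndex + 1));
}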
/**
@@ -154,45 +179,6 @@ class HighlightedText implements IndexedText {
*
* @return
*/
static private String getHighlightQuery(KeywordSearchQuery query, boolean literal_query, QueryResults queryResults, Content content) {
if (literal_query) {
//literal, treat as non-regex, non-term component query
return constructEscapedSolrQuery(query.getQueryString());
} else //construct a Solr query using aggregated terms to get highlighting
//the query is executed later on demand
{
if (queryResults.getKeywords().size() == 1) {
//simple case, no need to process subqueries and do special escaping
Keyword keyword = queryResults.getKeywords().iterator().next();
return constructEscapedSolrQuery(keyword.getSearchTerm());
} else {
//find terms for this content hit
List<Keyword> hitTerms = new ArrayList<>();
for (Keyword keyword : queryResults.getKeywords()) {
for (KeywordHit hit : queryResults.getResults(keyword)) {
if (hit.getContent().equals(content)) {
hitTerms.add(keyword);
break; //go to next term
}
}
}
StringBuilder highlightQuery = new StringBuilder();
final int lastTerm = hitTerms.size() - 1;
int curTerm = 0;
for (Keyword term : hitTerms) {
//escape subqueries, MAKE SURE they are not escaped again later
highlightQuery.append(constructEscapedSolrQuery(term.getSearchTerm()));
if (lastTerm != curTerm) {
highlightQuery.append(" "); //acts as OR ||
}
++curTerm;
}
return highlightQuery.toString();
}
}
}
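The removed method's job is now done inline (see the keywords.stream() call in the query-construction hunk below): each hit term is escaped individually and the subqueries are joined with spaces, which Solr's parser treats as OR. A standalone sketch of that replacement; constructEscapedSolrQuery is the helper this class defines:

import java.util.Set;
import java.util.stream.Collectors;

// Sketch: one escaped subquery per hit term, joined by spaces (Solr OR).
static String buildHighlightQuery(Set<String> keywords) {
    return keywords.stream()
            .map(HighlightedText::constructEscapedSolrQuery)
            .collect(Collectors.joining(" "));
}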
/**
* Constructs a complete, escaped Solr query that is ready to be used.
@@ -236,9 +222,7 @@ class HighlightedText implements IndexedText {
*/
loadPageInfoFromArtifact();
} else if (hasChunks) {
// if the file has chunks, get pages with hits, sorted
if (loadPageInfoFromHits()) {
//JMTODO: look at error handling and return values...
}
loadPageInfoFromHits();
} else {
//non-regex, no chunks
this.numberPages = 1;
@@ -246,29 +230,12 @@ class HighlightedText implements IndexedText {
numberOfHitsPerPage.put(1, 0);
pages.add(1);
currentHitPerPage.put(1, 0);
}
isPageInfoLoaded = true;
}
private boolean loadPageInfoFromHits() {
// /*
// * If this is being called from the artifacts / dir tree, then we need
// * to perform the search to get the highlights.
// */
// if (hits == null) {
//
// Keyword keywordQuery = new Keyword(keywordHitQuery, true);
// KeywordSearchQuery chunksQuery
// = new LuceneQuery(new KeywordList(Arrays.asList(keywordQuery)), keywordQuery);
// chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId));
// try {
// hits = chunksQuery.performQuery();
// } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
// logger.log(Level.SEVERE, "Could not perform the query to get chunk info and get highlights:" + keywordQuery.getSearchTerm(), ex); //NON-NLS
// MessageNotifyUtil.Notify.error(Bundle.HighlightedText_query_exception_msg() + keywordQuery.getSearchTerm(), ex.getCause().getMessage());
// return true;
// }
//// }
}
private void loadPageInfoFromHits() {
//organize the hits by page, filter as needed
TreeSet<Integer> pagesSorted = new TreeSet<>();
@@ -277,11 +244,12 @@ class HighlightedText implements IndexedText {
int chunkID = hit.getChunkId();
if (chunkID != 0 && this.objectId == hit.getSolrObjectId()) {
pagesSorted.add(chunkID);
if (StringUtils.isNotBlank(hit.getHit())) {
this.keywords.add(hit.getHit());
}
}
}
}
//set page to first page having highlights
if (pagesSorted.isEmpty()) {
this.currentPage = 0;
@@ -293,7 +261,7 @@ class HighlightedText implements IndexedText {
pages.add(page);
currentHitPerPage.put(page, 0); //set current hit to 0th
}
return false;
isPageInfoLoaded = true;
}
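The hunk breaks fragment loadPageInfoFromHits, so here is a consolidated sketch. The loop headers over keywords and hits are reconstructed from the QueryResults accessors used elsewhere in this commit (getKeywords, getResults) and are an assumption:

import java.util.TreeSet;
import org.apache.commons.lang.StringUtils;

// Sketch: one "page" per chunk containing a hit, sorted ascending.
private void loadPageInfoFromHits() {
    TreeSet<Integer> pagesSorted = new TreeSet<>();
    for (Keyword k : hits.getKeywords()) {           // assumed accessor
        for (KeywordHit hit : hits.getResults(k)) {  // assumed accessor
            int chunkID = hit.getChunkId();
            if (chunkID != 0 && this.objectId == hit.getSolrObjectId()) {
                pagesSorted.add(chunkID);
                if (StringUtils.isNotBlank(hit.getHit())) {
                    this.keywords.add(hit.getHit()); // remember the hit text
                }
            }
        }
    }
    // first page having highlights, or 0 when there are none
    this.currentPage = pagesSorted.isEmpty() ? 0 : pagesSorted.first();
    for (Integer page : pagesSorted) {
        numberOfHitsPerPage.put(page, 0); // unknown until the page is rendered
        pages.add(page);
        currentHitPerPage.put(page, 0);   // start at the 0th hit on each page
    }
    isPageInfoLoaded = true;
}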
@Override
@@ -410,26 +378,29 @@ class HighlightedText implements IndexedText {
}
final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
// if (isRegex) {
if (artifact != null && qt == QueryType.REGEX) {
q.setQuery(filterQuery);
q.addField(Server.Schema.CONTENT_STR.toString());
// } else {
// // input query has already been properly constructed and escaped
// q.setQuery(keywordHitQuery);
// q.addField(Server.Schema.TEXT.toString());
// q.addFilterQuery(filterQuery);
// q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD);
// q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
//
// //tune the highlighter
// q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
// q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
// q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
// q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
//
// //docs says makes sense for the original Highlighter only, but not really
// q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
// }
} else {
final String highlightQuery = keywords.stream()
.map(HighlightedText::constructEscapedSolrQuery)
.collect(Collectors.joining(" "));
q.setQuery(highlightQuery);
q.addField(Server.Schema.TEXT.toString());
q.addFilterQuery(filterQuery);
q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
//tune the highlighter
q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
//docs say this makes sense for the original Highlighter only, but not really
q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
}
try {
QueryResponse response = solrServer.query(q, METHOD.POST);
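After the POST, highlights come back keyed by document ID and field. A minimal SolrJ sketch of reading them out of the response; the method name is hypothetical and error handling is elided:

import java.util.List;
import java.util.Map;
import org.apache.solr.client.solrj.response.QueryResponse;

// Sketch: getHighlighting() maps document ID -> field -> highlighted fragments.
static String firstHighlight(QueryResponse response, String contentIdStr, String field) {
    Map<String, List<String>> byField = response.getHighlighting().get(contentIdStr);
    if (byField == null) {
        return null; // no highlights for this document
    }
    List<String> fragments = byField.get(field);
    // With hl.fragsize=0 and the "single" fragListBuilder, the one fragment
    // is the whole highlighted text.
    return (fragments == null || fragments.isEmpty()) ? null : fragments.get(0);
}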
@@ -532,8 +503,7 @@ class HighlightedText implements IndexedText {
for (String unquotedKeyword : keywords) {
int textOffset = 0;
int hitOffset;
while ((hitOffset = text.indexOf(unquotedKeyword, textOffset)) != -1) {
while ((hitOffset = StringUtils.indexOfIgnoreCase(text, unquotedKeyword, textOffset)) != -1) {
// Append the portion of text up to (but not including) the hit.
highlightedText.append(text.substring(textOffset, hitOffset));
// Add in the highlighting around the keyword.
@@ -542,12 +512,11 @@ class HighlightedText implements IndexedText {
highlightedText.append(HIGHLIGHT_POST);
// Advance the text offset past the keyword.
textOffset = hitOffset + unquotedKeyword.length() + 1;
textOffset = hitOffset + unquotedKeyword.length();
}
if (highlightedText.length() > 0) {
// Append the remainder of text field and return.
// Append the remainder of the text field
highlightedText.append(text.substring(textOffset, text.length()));
if (highlightedText.length() > 0) {
} else {
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
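Two fixes land in this hunk: StringUtils.indexOfIgnoreCase makes the manual scan case-insensitive, and dropping the stray "+ 1" stops the offset from skipping the character after every hit. A self-contained sketch of the corrected loop; appending the original-case slice of the text (rather than the keyword) is an assumption, since the hunk elides that append:

import org.apache.commons.lang.StringUtils;

// Sketch of the corrected manual-highlighting loop.
static String highlight(String text, String keyword, String pre, String post) {
    StringBuilder out = new StringBuilder();
    int textOffset = 0;
    int hitOffset;
    while ((hitOffset = StringUtils.indexOfIgnoreCase(text, keyword, textOffset)) != -1) {
        out.append(text, textOffset, hitOffset);        // text before the hit
        out.append(pre);
        out.append(text, hitOffset, hitOffset + keyword.length()); // the hit itself
        out.append(post);
        textOffset = hitOffset + keyword.length();      // no "+ 1": keep the next char
    }
    out.append(text.substring(textOffset));             // remainder of the text
    return out.toString();
}

With the old arithmetic, the character after each hit never reached the output, and a hit ending at the very last character pushed the offset one past the string length, breaking the final substring.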

View File: KeywordSearchFilterNode.java

@@ -45,7 +45,7 @@ import org.sleuthkit.datamodel.File;
*/
class KeywordSearchFilterNode extends FilterNode {
KeywordSearchFilterNode(HighlightedText highlights, Node original) {
KeywordSearchFilterNode(QueryResults highlights, Node original) {
super(original, null, new ProxyLookup(Lookups.singleton(highlights), original.getLookup()));
}
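With this change the filter node exposes the raw QueryResults in its lookup, merged with the original node's lookup by the ProxyLookup, instead of a pre-built HighlightedText; the content viewer now builds HighlightedText itself (see the ExtractedContentViewer hunks above). A sketch of the consuming side, using the standard NetBeans Lookup API:

import org.openide.nodes.Node;

// Sketch: retrieve the query results the ProxyLookup above exposes.
static QueryResults resultsFor(Node node) {
    // null when the node was not produced by a keyword search
    return node.getLookup().lookup(QueryResults.class);
}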

View File: KeywordSearchResultFactory.java

@@ -147,7 +147,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
int hitNumber = 0;
List<KeyValueQueryContent> tempList = new ArrayList<>();
// final SetMultimap<Long, KeywordHit> orgnizeByDocID = orgnizeByDocID(queryResults);
for (KeywordHit hit : getOneHitPerObject(queryResults)) {
/**
@@ -169,12 +168,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
properties.put(TSK_KEYWORD_PREVIEW.getDisplayName(), hit.getSnippet());
}
//JMTODO: I don't understand this comment or the below code...
//@@@ USE ConentHit in UniqueFileMap instead of the below search
//get unique match result files
// BC: @@@ This is really inefficient. We should keep track of this when
// we flattened the list of files to the unique files.
// final String highlightQueryEscaped = getHighlightQuery(queryRequest, queryRequest.isLiteral(), queryResults, content);
String hitName = hit.isArtifactHit()
? hit.getArtifact().getDisplayName() + " Artifact" //NON-NLS
: contentName;
@@ -220,18 +213,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
return hits.values();
}
SetMultimap<Long, KeywordHit> orgnizeByDocID(QueryResults queryResults) {
SetMultimap<Long, KeywordHit> hits = TreeMultimap.create(Long::compare, Comparator.comparing(KeywordHit::getChunkId));
for (Keyword keyWord : queryResults.getKeywords()) {
for (KeywordHit hit : queryResults.getResults(keyWord)) {
hits.put(hit.getSolrObjectId(), hit);
}
}
return hits;
}
@Override
protected Node createNodeForKey(KeyValueQueryContent key) {
final Content content = key.getContent();
@@ -240,9 +221,7 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
Node kvNode = new KeyValueNode(key, Children.LEAF, Lookups.singleton(content));
//wrap in KeywordSearchFilterNode for the markup content, might need to override FilterNode for more customization
// store the data in HighlightedMatchesSource so that it can be looked up (in content viewer)
HighlightedText highlights = new HighlightedText(key.getSolrObjectId(), hits);
return new KeywordSearchFilterNode(highlights, kvNode);
return new KeywordSearchFilterNode(hits, kvNode);
}
/**
@@ -277,8 +256,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
this.hits = hits;
this.query = query;
// boolean isRegex = hits.getQuery().isLiteral() == false;
// this.chunkIDs = chunkIDs;
}
Content getContent() {

View File: LuceneQuery.java

@@ -239,7 +239,7 @@ class LuceneQuery implements KeywordSearchQuery {
for (Object content_obj : content) {
String content_str = (String) content_obj;
//for new schemas, check that the hit is before the chunk/window boundary.
int firstOccurence = StringUtils.indexOf(content_str.toLowerCase(), strippedQueryString.toLowerCase());
int firstOccurence = StringUtils.indexOfIgnoreCase(content_str, strippedQueryString);
//there is no chunksize field for "parent" entries in the index
if (chunkSize == null || chunkSize == 0 || (firstOccurence > -1 && firstOccurence < chunkSize)) {
matches.add(createKeywordtHit(highlightResponse, docId));
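Index chunks overlap by a small window, so a hit whose first occurrence falls past the chunk boundary belongs to the next chunk and must be filtered out. Switching to StringUtils.indexOfIgnoreCase also avoids allocating two lowercased copies of the strings per document. The boundary test as a standalone predicate (the method name is hypothetical):

import org.apache.commons.lang.StringUtils;

// Sketch: accept a hit only if it starts before the chunk/window boundary.
// chunkSize is null or 0 for un-chunked "parent" entries, which always pass.
static boolean hitIsInThisChunk(String content, String query, Integer chunkSize) {
    int firstOccurrence = StringUtils.indexOfIgnoreCase(content, query);
    return chunkSize == null || chunkSize == 0
            || (firstOccurrence > -1 && firstOccurrence < chunkSize);
}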
@@ -324,7 +324,7 @@ class LuceneQuery implements KeywordSearchQuery {
}
}
return new KeywordHit(docId, snippet);
return new KeywordHit(docId, snippet, keywordString);
}
/**