From 6abe26d6de23c51880eb2dfc94807048e973c07c Mon Sep 17 00:00:00 2001 From: millmanorama Date: Wed, 15 Feb 2017 13:48:43 +0100 Subject: [PATCH] finish implementing all the cases for highlighting --- .../keywordsearch/ExtractedContentViewer.java | 40 ++-- .../keywordsearch/HighlightedText.java | 187 ++++++++---------- .../KeywordSearchFilterNode.java | 2 +- .../KeywordSearchResultFactory.java | 25 +-- .../autopsy/keywordsearch/LuceneQuery.java | 4 +- 5 files changed, 102 insertions(+), 156 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentViewer.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentViewer.java index 36f187d9b6..0a8c15a728 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentViewer.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentViewer.java @@ -110,15 +110,13 @@ public class ExtractedContentViewer implements DataContentViewer { BlackboardArtifact artifact = nodeLookup.lookup(BlackboardArtifact.class); if (hits != null) { highlightedHitText = new HighlightedText(content.getId(), hits); - } else { - if (artifact != null && artifact.getArtifactTypeID() - == BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT.getTypeID()) { - // if the artifact is an account artifact, get an account text . - highlightedHitText = getAccountsText(content, nodeLookup); - } else if (artifact != null && artifact.getArtifactTypeID() - == BlackboardArtifact.ARTIFACT_TYPE.TSK_KEYWORD_HIT.getTypeID()) { - highlightedHitText = new HighlightedText(artifact); - } + } else if (artifact != null && artifact.getArtifactTypeID() + == BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT.getTypeID()) { + // if the artifact is an account artifact, get an account text . + highlightedHitText = getAccountsText(content, nodeLookup); + } else if (artifact != null && artifact.getArtifactTypeID() + == BlackboardArtifact.ARTIFACT_TYPE.TSK_KEYWORD_HIT.getTypeID()) { + highlightedHitText = new HighlightedText(artifact); } if (highlightedHitText != null) { indexedTextSources.add(highlightedHitText); @@ -298,16 +296,16 @@ public class ExtractedContentViewer implements DataContentViewer { return false; } - /** - * Is there any marked up indexed text in the look up of this node? This - * will be the case if the node is for a keyword hit artifact produced - * by either an ad hoc keyword search result (keyword search toolbar - * widgets) or a keyword search by the keyword search ingest module. - */ - Collection sources = node.getLookup().lookupAll(IndexedText.class); - if (sources.isEmpty() == false) { - return true; - } +// /** +// * Is there any marked up indexed text in the look up of this node? This +// * will be the case if the node is for a keyword hit artifact produced +// * by either an ad hoc keyword search result (keyword search toolbar +// * widgets) or a keyword search by the keyword search ingest module. +// */ +// Collection sources = node.getLookup().lookupAll(IndexedText.class); +// if (sources.isEmpty() == false) { +// return true; +// } /* * Is there a credit card artifact in the lookup @@ -315,7 +313,9 @@ public class ExtractedContentViewer implements DataContentViewer { Collection artifacts = node.getLookup().lookupAll(BlackboardArtifact.class); if (artifacts != null) { for (BlackboardArtifact art : artifacts) { - if (art.getArtifactTypeID() == BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT.getTypeID()) { + final int artifactTypeID = art.getArtifactTypeID(); + if (artifactTypeID == TSK_ACCOUNT.getTypeID() + || artifactTypeID == TSK_KEYWORD_HIT.getTypeID()) { return true; } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java index 157f303b64..cea6295f96 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java @@ -18,7 +18,6 @@ */ package org.sleuthkit.autopsy.keywordsearch; -import com.ibm.icu.text.UnicodeSet; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -28,7 +27,6 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; -import java.util.function.Function; import java.util.logging.Level; import java.util.stream.Collectors; import org.apache.commons.lang.StringEscapeUtils; @@ -40,14 +38,13 @@ import org.apache.solr.common.SolrDocumentList; import org.openide.util.Exceptions; import org.openide.util.NbBundle; import org.openide.util.NbBundle.Messages; -import org.sleuthkit.autopsy.casemodule.Case; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil; import org.sleuthkit.autopsy.coreutils.Version; import org.sleuthkit.autopsy.keywordsearch.KeywordQueryFilter.FilterType; +import org.sleuthkit.autopsy.keywordsearch.KeywordSearch.QueryType; import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.BlackboardAttribute; -import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.TskCoreException; /** @@ -89,6 +86,7 @@ class HighlightedText implements IndexedText { private boolean isPageInfoLoaded = false; private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT); private BlackboardArtifact artifact; + private KeywordSearch.QueryType qt; /** * This constructor is used when keyword hits are accessed from the ad-hoc @@ -123,24 +121,51 @@ class HighlightedText implements IndexedText { } private void loadPageInfoFromArtifact() throws TskCoreException, NumberFormatException { + final String keyword = artifact.getAttribute(TSK_KEYWORD).getValueString(); + this.keywords.add(keyword); - KeywordSearch.QueryType qt = KeywordSearch.QueryType.values()[artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE).getValueInt()]; - this.keywords.add(artifact.getAttribute(TSK_KEYWORD).getValueString()); - String chunkIDsString = artifact.getAttribute(TSK_KEYWORD_HIT_DOCUMENT_IDS).getValueString(); - Set chunkIDs = Arrays.stream(chunkIDsString.split(",")).map(StringUtils::strip).collect(Collectors.toSet()); - for (String solrDocumentId : chunkIDs) { - int chunkID; - final int separatorIndex = solrDocumentId.indexOf(Server.CHUNK_ID_SEPARATOR); - if (-1 != separatorIndex) { + final BlackboardAttribute qtAttribute = artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE); - chunkID = Integer.parseInt(solrDocumentId.substring(separatorIndex + 1)); - } else { + qt = (qtAttribute != null) + ? KeywordSearch.QueryType.values()[qtAttribute.getValueInt()] : null; - chunkID = 0; + final BlackboardAttribute docIDsArtifact = artifact.getAttribute(TSK_KEYWORD_HIT_DOCUMENT_IDS); + + if (qt == QueryType.REGEX && docIDsArtifact != null) { + //regex search records the chunks in the artifact + String chunkIDsString = docIDsArtifact.getValueString(); + Set chunkIDs = Arrays.stream(chunkIDsString.split(",")).map(StringUtils::strip).collect(Collectors.toSet()); + for (String solrDocumentId : chunkIDs) { + int chunkID; + final int separatorIndex = solrDocumentId.indexOf(Server.CHUNK_ID_SEPARATOR); + if (-1 != separatorIndex) { + chunkID = Integer.parseInt(solrDocumentId.substring(separatorIndex + 1)); + } else { + + chunkID = 0; + } + pages.add(chunkID); + numberOfHitsPerPage.put(chunkID, 0); + currentHitPerPage.put(chunkID, 0); + } + this.currentPage = pages.stream().sorted().findFirst().orElse(1); + isPageInfoLoaded = true; + } else { + /* + * non-regex searches don't record the chunks in the artifacts, so + * we need to look them up + */ + Keyword keywordQuery = new Keyword(keyword, true); + KeywordSearchQuery chunksQuery + = new LuceneQuery(new KeywordList(Arrays.asList(keywordQuery)), keywordQuery); + chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId)); + try { + hits = chunksQuery.performQuery(); + loadPageInfoFromHits(); + } catch (KeywordSearchModuleException | NoOpenCoreException ex) { + logger.log(Level.SEVERE, "Could not perform the query to get chunk info and get highlights:" + keywordQuery.getSearchTerm(), ex); //NON-NLS + MessageNotifyUtil.Notify.error(Bundle.HighlightedText_query_exception_msg() + keywordQuery.getSearchTerm(), ex.getCause().getMessage()); } - pages.add(chunkID); - numberOfHitsPerPage.put(chunkID, 0); - currentHitPerPage.put(chunkID, 0); } } @@ -154,45 +179,6 @@ class HighlightedText implements IndexedText { * * @return */ - static private String getHighlightQuery(KeywordSearchQuery query, boolean literal_query, QueryResults queryResults, Content content) { - if (literal_query) { - //literal, treat as non-regex, non-term component query - return constructEscapedSolrQuery(query.getQueryString()); - } else //construct a Solr query using aggregated terms to get highlighting - //the query is executed later on demand - { - if (queryResults.getKeywords().size() == 1) { - //simple case, no need to process subqueries and do special escaping - Keyword keyword = queryResults.getKeywords().iterator().next(); - return constructEscapedSolrQuery(keyword.getSearchTerm()); - } else { - //find terms for this content hit - List hitTerms = new ArrayList<>(); - for (Keyword keyword : queryResults.getKeywords()) { - for (KeywordHit hit : queryResults.getResults(keyword)) { - if (hit.getContent().equals(content)) { - hitTerms.add(keyword); - break; //go to next term - } - } - } - - StringBuilder highlightQuery = new StringBuilder(); - final int lastTerm = hitTerms.size() - 1; - int curTerm = 0; - for (Keyword term : hitTerms) { - //escape subqueries, MAKE SURE they are not escaped again later - highlightQuery.append(constructEscapedSolrQuery(term.getSearchTerm())); - if (lastTerm != curTerm) { - highlightQuery.append(" "); //acts as OR || - } - - ++curTerm; - } - return highlightQuery.toString(); - } - } - } /** * Constructs a complete, escaped Solr query that is ready to be used. @@ -236,9 +222,7 @@ class HighlightedText implements IndexedText { */ loadPageInfoFromArtifact(); } else if (hasChunks) { // if the file has chunks, get pages with hits, sorted - if (loadPageInfoFromHits()) { - //JMTOD: look at error handeling and return values... - } + loadPageInfoFromHits(); } else { //non-regex, no chunks this.numberPages = 1; @@ -246,29 +230,12 @@ class HighlightedText implements IndexedText { numberOfHitsPerPage.put(1, 0); pages.add(1); currentHitPerPage.put(1, 0); + isPageInfoLoaded = true; } - isPageInfoLoaded = true; + } - private boolean loadPageInfoFromHits() { -// /* -// * If this is being called from the artifacts / dir tree, then we need -// * to perform the search to get the highlights. -// */ -// if (hits == null) { -// -// Keyword keywordQuery = new Keyword(keywordHitQuery, true); -// KeywordSearchQuery chunksQuery -// = new LuceneQuery(new KeywordList(Arrays.asList(keywordQuery)), keywordQuery); -// chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId)); -// try { -// hits = chunksQuery.performQuery(); -// } catch (KeywordSearchModuleException | NoOpenCoreException ex) { -// logger.log(Level.SEVERE, "Could not perform the query to get chunk info and get highlights:" + keywordQuery.getSearchTerm(), ex); //NON-NLS -// MessageNotifyUtil.Notify.error(Bundle.HighlightedText_query_exception_msg() + keywordQuery.getSearchTerm(), ex.getCause().getMessage()); -// return true; -// } -//// } + private void loadPageInfoFromHits() { //organize the hits by page, filter as needed TreeSet pagesSorted = new TreeSet<>(); @@ -277,8 +244,9 @@ class HighlightedText implements IndexedText { int chunkID = hit.getChunkId(); if (chunkID != 0 && this.objectId == hit.getSolrObjectId()) { pagesSorted.add(chunkID); - - this.keywords.add(hit.getHit()); + if (StringUtils.isNotBlank(hit.getHit())) { + this.keywords.add(hit.getHit()); + } } } } @@ -293,7 +261,7 @@ class HighlightedText implements IndexedText { pages.add(page); currentHitPerPage.put(page, 0); //set current hit to 0th } - return false; + isPageInfoLoaded = true; } @Override @@ -410,26 +378,29 @@ class HighlightedText implements IndexedText { } final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr); -// if (isRegex) { - q.setQuery(filterQuery); - q.addField(Server.Schema.CONTENT_STR.toString()); -// } else { -// // input query has already been properly constructed and escaped -// q.setQuery(keywordHitQuery); -// q.addField(Server.Schema.TEXT.toString()); -// q.addFilterQuery(filterQuery); -// q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD); -// q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH -// -// //tune the highlighter -// q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS -// q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS -// q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS -// q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS -// -// //docs says makes sense for the original Highlighter only, but not really -// q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS -// } + if (artifact != null && qt == QueryType.REGEX) { + q.setQuery(filterQuery); + q.addField(Server.Schema.CONTENT_STR.toString()); + } else { + final String highlightQuery = keywords.stream() + .map(HighlightedText::constructEscapedSolrQuery) + .collect(Collectors.joining(" ")); + + q.setQuery(highlightQuery); + q.addField(Server.Schema.TEXT.toString()); + q.addFilterQuery(filterQuery); + q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD); + q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH + + //tune the highlighter + q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS + q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS + q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS + q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS + + //docs says makes sense for the original Highlighter only, but not really + q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS + } try { QueryResponse response = solrServer.query(q, METHOD.POST); @@ -532,8 +503,7 @@ class HighlightedText implements IndexedText { for (String unquotedKeyword : keywords) { int textOffset = 0; int hitOffset; - - while ((hitOffset = text.indexOf(unquotedKeyword, textOffset)) != -1) { + while ((hitOffset = StringUtils.indexOfIgnoreCase(text, unquotedKeyword, textOffset)) != -1) { // Append the portion of text up to (but not including) the hit. highlightedText.append(text.substring(textOffset, hitOffset)); // Add in the highlighting around the keyword. @@ -542,12 +512,11 @@ class HighlightedText implements IndexedText { highlightedText.append(HIGHLIGHT_POST); // Advance the text offset past the keyword. - textOffset = hitOffset + unquotedKeyword.length() + 1; + textOffset = hitOffset + unquotedKeyword.length(); } - + // Append the remainder of text field + highlightedText.append(text.substring(textOffset, text.length())); if (highlightedText.length() > 0) { - // Append the remainder of text field and return. - highlightedText.append(text.substring(textOffset, text.length())); } else { return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg"); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchFilterNode.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchFilterNode.java index bd51e89c95..4da8ccb7fc 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchFilterNode.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchFilterNode.java @@ -45,7 +45,7 @@ import org.sleuthkit.datamodel.File; */ class KeywordSearchFilterNode extends FilterNode { - KeywordSearchFilterNode(HighlightedText highlights, Node original) { + KeywordSearchFilterNode(QueryResults highlights, Node original) { super(original, null, new ProxyLookup(Lookups.singleton(highlights), original.getLookup())); } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java index 56dbd49fbf..4d7af13785 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java @@ -147,7 +147,6 @@ class KeywordSearchResultFactory extends ChildFactory { int hitNumber = 0; List tempList = new ArrayList<>(); -// final SetMultimap orgnizeByDocID = orgnizeByDocID(queryResults); for (KeywordHit hit : getOneHitPerObject(queryResults)) { /** @@ -169,12 +168,6 @@ class KeywordSearchResultFactory extends ChildFactory { properties.put(TSK_KEYWORD_PREVIEW.getDisplayName(), hit.getSnippet()); } - //JMTODO: I don't understand this comment or the below code... - //@@@ USE ConentHit in UniqueFileMap instead of the below search - //get unique match result files - // BC: @@@ THis is really ineffecient. We should keep track of this when - // we flattened the list of files to the unique files. -// final String highlightQueryEscaped = getHighlightQuery(queryRequest, queryRequest.isLiteral(), queryResults, content); String hitName = hit.isArtifactHit() ? hit.getArtifact().getDisplayName() + " Artifact" //NON-NLS : contentName; @@ -220,18 +213,6 @@ class KeywordSearchResultFactory extends ChildFactory { return hits.values(); } - SetMultimap orgnizeByDocID(QueryResults queryResults) { - SetMultimap hits = TreeMultimap.create(Long::compare, Comparator.comparing(KeywordHit::getChunkId)); - - for (Keyword keyWord : queryResults.getKeywords()) { - for (KeywordHit hit : queryResults.getResults(keyWord)) { - - hits.put(hit.getSolrObjectId(), hit); - } - } - return hits; - } - @Override protected Node createNodeForKey(KeyValueQueryContent key) { final Content content = key.getContent(); @@ -240,9 +221,7 @@ class KeywordSearchResultFactory extends ChildFactory { Node kvNode = new KeyValueNode(key, Children.LEAF, Lookups.singleton(content)); //wrap in KeywordSearchFilterNode for the markup content, might need to override FilterNode for more customization - // store the data in HighlightedMatchesSource so that it can be looked up (in content viewer) - HighlightedText highlights = new HighlightedText(key.getSolrObjectId(), hits); - return new KeywordSearchFilterNode(highlights, kvNode); + return new KeywordSearchFilterNode(hits, kvNode); } /** @@ -277,8 +256,6 @@ class KeywordSearchResultFactory extends ChildFactory { this.hits = hits; this.query = query; -// boolean isRegex = hits.getQuery().isLiteral() == false; -// this.chunkIDs = chunkIDs; } Content getContent() { diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java index 4eb6773ad7..93a55e7a62 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java @@ -239,7 +239,7 @@ class LuceneQuery implements KeywordSearchQuery { for (Object content_obj : content) { String content_str = (String) content_obj; //for new schemas, check that the hit is before the chunk/window boundary. - int firstOccurence = StringUtils.indexOf(content_str.toLowerCase(), strippedQueryString.toLowerCase()); + int firstOccurence = StringUtils.indexOfIgnoreCase(content_str, strippedQueryString); //there is no chunksize field for "parent" entries in the index if (chunkSize == null || chunkSize == 0 || (firstOccurence > -1 && firstOccurence < chunkSize)) { matches.add(createKeywordtHit(highlightResponse, docId)); @@ -324,7 +324,7 @@ class LuceneQuery implements KeywordSearchQuery { } } - return new KeywordHit(docId, snippet); + return new KeywordHit(docId, snippet, keywordString); } /**