From bcc45285ee22a8f9a243b0651d5542b49266f3d8 Mon Sep 17 00:00:00 2001
From: jmillman
Date: Thu, 28 Jul 2016 17:07:09 -0400
Subject: [PATCH] cleanup AccountsText even more.

---
 .../autopsy/keywordsearch/AccountsText.java   | 224 +++++++++---------
 .../keywordsearch/ExtractedContentViewer.java |   2 +-
 .../autopsy/keywordsearch/Server.java         |  32 +--
 3 files changed, 127 insertions(+), 131 deletions(-)

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AccountsText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AccountsText.java
index 40b5e1430e..173b3f8186 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AccountsText.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AccountsText.java
@@ -19,7 +19,6 @@
 package org.sleuthkit.autopsy.keywordsearch;
 
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.LinkedHashMap;
@@ -31,6 +30,7 @@ import java.util.logging.Level;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
+import org.apache.commons.lang.StringUtils;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrRequest.METHOD;
 import org.apache.solr.client.solrj.response.QueryResponse;
@@ -38,39 +38,48 @@ import org.apache.solr.common.SolrDocument;
 import org.openide.util.NbBundle;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.Version;
-import org.sleuthkit.autopsy.datamodel.TextMarkupLookup;
 
 /**
  * Highlights account hits for a given document. Knows about pages and such for
  * the content viewer.
+ *
+ * Note: This class started as a copy-and-paste of HighlightedText, but it
+ * proved too messy to modify HighlightedText to work for accounts also. This
+ * and HighlightedText are very similar and could probably use some refactoring
+ * to reduce code duplication.
  */
-class AccountsText implements IndexedText, TextMarkupLookup {
+class AccountsText implements IndexedText {
 
     private static final Logger LOGGER = Logger.getLogger(AccountsText.class.getName());
+    private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
+
     private static final String HIGHLIGHT_PRE = "<span style='background:yellow'>"; //NON-NLS
     private static final String HIGHLIGHT_POST = "</span>"; //NON-NLS
-    private static final String ANCHOR_PREFIX = AccountsText.class.getName() + "_";
+    private static final String ANCHOR_NAME_PREFIX = AccountsText.class.getName() + "_";
+
+    private static final String INSERT_PREFIX = "<a name='" + ANCHOR_NAME_PREFIX; //NON-NLS
+    private static final String INSERT_POSTFIX = "'></a>$0"; //$0 will insert current regex match //NON-NLS
+    private static final Pattern ANCHOR_DETECTION_PATTERN = Pattern.compile(HIGHLIGHT_PRE);
+
+    private static final String HIGHLIGHT_FIELD = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
 
-    private final String solrDocumentId;
-    private final Set<String> keywords = new HashSet<>();
     private final Server solrServer;
-    private int numberPagesForFile = 0;
-    private int currentPage = 0;
-    private boolean hasChunks = false;
-    //stores all pages/chunks that have hits as key, and number of hits as a value, or 0 if yet unknown
-    private final LinkedHashMap<Integer, Integer> numberOfHitsPerPage = new LinkedHashMap<>();
-    //stored page num -> current hit number mapping
-    private final HashMap<Integer, Integer> currentHitPerPage = new HashMap<>();
-    private final List<Integer> pages = new ArrayList<>();
-    private boolean isPageInfoLoaded = false;
-    private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
-    private final String displayName;
+    private final String solrDocumentId;
     private final long solrObjectId;
     private final Integer chunkId;
+    private final Set<String> keywords = new HashSet<>();
+    private final String displayName;
+    private final String queryString;
 
-    String getDisplayName() {
-        return displayName;
-    }
+    private boolean isPageInfoLoaded = false;
+    private int numberPagesForFile = 0;
+    private int currentPage = 0;
+    //list of pages, used for iterating back and forth. Only stores pages with hits
+    private final List<Integer> pages = new ArrayList<>();
+    //map from page/chunk to number of hits. value is 0 if not yet known.
+    private final LinkedHashMap<Integer, Integer> numberOfHitsPerPage = new LinkedHashMap<>();
+    //map from page/chunk number to current hit on that page.
+    private final HashMap<Integer, Integer> currentHitPerPage = new HashMap<>();
 
     @NbBundle.Messages({
         "AccountsText.creditCardNumber=Credit Card Number",
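Reviewer note on the constants above: INSERT_PREFIX and INSERT_POSTFIX replace locals that the deleted insertAnchors() method used to build inline; together they wrap each highlight in a named anchor the viewer can scroll to. A rough standalone sketch of the substitution, using a shortened anchor prefix (hypothetical; the real prefix is the fully qualified class name):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

class AnchorSketch {
    public static void main(String[] args) {
        String highlightPre = "<span style='background:yellow'>";
        String insertPrefix = "<a name='AccountsText_"; // assumed short form of ANCHOR_NAME_PREFIX
        String insertPostfix = "'></a>$0"; // $0 re-inserts the matched HIGHLIGHT_PRE tag
        Matcher m = Pattern.compile(highlightPre, Pattern.LITERAL)
                .matcher(highlightPre + "4149 0111 2222 3333</span>");
        StringBuffer sb = new StringBuffer();
        int count = 0;
        while (m.find()) {
            count++;
            m.appendReplacement(sb, insertPrefix + count + insertPostfix);
        }
        m.appendTail(sb);
        // prints: <a name='AccountsText_1'></a><span style='background:yellow'>4149 0111 2222 3333</span>
        System.out.println(sb);
    }
}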
@@ -78,6 +87,13 @@
     AccountsText(String objectId, Set<String> keywords) {
         this.solrDocumentId = objectId;
         this.keywords.addAll(keywords);
+
+        //build the query string
+        this.queryString = HIGHLIGHT_FIELD + ":"
+                + keywords.stream()
+                .map(keyword -> "/.*?" + KeywordSearchUtil.escapeLuceneQuery(keyword) + ".*?/") //surround each "keyword" with match anything regex.
+                .collect(Collectors.joining(" ")); //collect as space separated string
+
         this.solrServer = KeywordSearch.getServer();
 
         final int separatorIndex = solrDocumentId.indexOf(Server.ID_CHUNK_SEP);
@@ -123,22 +139,24 @@
     }
 
     @Override
+    @NbBundle.Messages("AccountsText.nextPage.exception.msg=No next page.")
     public int nextPage() {
         if (hasNextPage()) {
             currentPage = pages.get(pages.indexOf(this.currentPage) + 1);
             return currentPage;
         } else {
-            throw new IllegalStateException(NbBundle.getMessage(AccountsText.class, "HighlightedMatchesSource.nextPage.exception.msg"));
+            throw new IllegalStateException(Bundle.AccountsText_nextPage_exception_msg());
         }
     }
 
     @Override
+    @NbBundle.Messages("AccountsText.previousPage.exception.msg=No previous page.")
     public int previousPage() {
         if (hasPreviousPage()) {
             currentPage = pages.get(pages.indexOf(this.currentPage) - 1);
             return currentPage;
         } else {
-            throw new IllegalStateException(NbBundle.getMessage(AccountsText.class, "HighlightedMatchesSource.previousPage.exception.msg"));
+            throw new IllegalStateException(Bundle.AccountsText_previousPage_exception_msg());
         }
     }
 
@@ -161,20 +179,22 @@
     }
 
     @Override
+    @NbBundle.Messages("AccountsText.nextItem.exception.msg=No next item.")
     public int nextItem() {
         if (hasNextItem()) {
             return currentHitPerPage.merge(currentPage, 1, Integer::sum);
         } else {
-            throw new IllegalStateException(NbBundle.getMessage(AccountsText.class, "HighlightedMatchesSource.nextItem.exception.msg"));
+            throw new IllegalStateException(Bundle.AccountsText_nextItem_exception_msg());
        }
    }
 
     @Override
+    @NbBundle.Messages("AccountsText.previousItem.exception.msg=No previous item.")
     public int previousItem() {
         if (hasPreviousItem()) {
             return currentHitPerPage.merge(currentPage, -1, Integer::sum);
         } else {
-            throw new IllegalStateException(NbBundle.getMessage(AccountsText.class, "HighlightedMatchesSource.previousItem.exception.msg"));
+            throw new IllegalStateException(Bundle.AccountsText_previousItem_exception_msg());
         }
     }
 
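For review context: the queryString built in the constructor above ORs the keywords together as Solr regex clauses against the highlight field. A minimal sketch of the resulting string for two hypothetical card numbers (field name assumed; escaping omitted for brevity):

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

class QueryStringSketch {
    public static void main(String[] args) {
        List<String> keywords = Arrays.asList("4149011122223333", "371449635398431"); // hypothetical hits
        String queryString = "content_str:" // assumes LuceneQuery.HIGHLIGHT_FIELD_REGEX names this field
                + keywords.stream()
                        .map(keyword -> "/.*?" + keyword + ".*?/") // reluctant wildcards around each keyword
                        .collect(Collectors.joining(" "));
        // prints: content_str:/.*?4149011122223333.*?/ /.*?371449635398431.*?/
        System.out.println(queryString);
    }
}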
@@ -193,54 +213,43 @@
     }
 
     /**
-     * The main goal of this method is to figure out which pages / chunks have
-     * hits.
+     * Initialize this object with information about which pages/chunks have
+     * hits. Multiple calls will not change the initial results.
      */
     synchronized private void loadPageInfo() {
         if (isPageInfoLoaded) {
             return;
         }
-        if (chunkId != null) {
-            //if a chunk is specified, only show that chunk/page
-            this.numberPagesForFile = 1;
-            hasChunks = false;
-            //no chunks
+        if (chunkId != null) {//if a chunk is specified, only show that chunk/page
             this.numberPagesForFile = 1;
             this.currentPage = chunkId;
-            numberOfHitsPerPage.put(chunkId, 0);
-            pages.add(chunkId);
-            currentHitPerPage.put(chunkId, 0);
+            this.numberOfHitsPerPage.put(chunkId, 0);
+            this.pages.add(chunkId);
+            this.currentHitPerPage.put(chunkId, 0);
         } else {
-            hasChunks = true;
             try {
                 this.numberPagesForFile = solrServer.queryNumFileChunks(this.solrObjectId);
             } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
-                LOGGER.log(Level.WARNING, "Could not get number pages for content: {0}", this.solrDocumentId); //NON-NLS
+                LOGGER.log(Level.WARNING, "Could not get number pages for content " + this.solrDocumentId, ex); //NON-NLS
                 return;
             }
 
             //if has chunks, get pages with hits
             TreeSet<Integer> sortedPagesWithHits = new TreeSet<>();
-            //extract pages of interest, sorted
-
             SolrQuery q = new SolrQuery();
             q.setShowDebugInfo(DEBUG); //debug
-            String query = keywords.stream().map(keyword -> "/.*" + KeywordSearchUtil.escapeLuceneQuery(keyword) + ".*/").collect(Collectors.joining(" "));
-            q.setQuery(LuceneQuery.HIGHLIGHT_FIELD_REGEX + ":" + query);
-            q.setFields("id");
-            if (chunkId == null) {
-                q.addFilterQuery(Server.Schema.ID.toString() + ":" + this.solrObjectId + "_*");
-            } else {
-                q.addFilterQuery(Server.Schema.ID.toString() + ":" + this.solrDocumentId);
-            }
+            q.setQuery(queryString);
+            q.setFields(Server.Schema.ID.toString()); //for this case we only need the document ids
+            q.addFilterQuery(Server.Schema.ID.toString() + ":" + this.solrObjectId + Server.ID_CHUNK_SEP + "*");
+
             try {
                 QueryResponse response = solrServer.query(q, METHOD.POST);
                 for (SolrDocument resultDoc : response.getResults()) {
                     final String resultDocumentId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString();
                     // Put the solr chunk id in the map
-                    final int separatorIndex = resultDocumentId.indexOf(Server.ID_CHUNK_SEP);
-                    if (-1 != separatorIndex) {
-                        sortedPagesWithHits.add(Integer.parseInt(resultDocumentId.substring(separatorIndex + 1)));
+                    String resultChunkID = StringUtils.substringAfter(resultDocumentId, Server.ID_CHUNK_SEP);
+                    if (StringUtils.isNotBlank(resultChunkID)) {
+                        sortedPagesWithHits.add(Integer.parseInt(resultChunkID));
                    } else {
                         sortedPagesWithHits.add(0);
                     }
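The hit-page loop above recovers chunk numbers from Solr document ids of the form objectId_chunkNumber. A small sketch of that parsing rule with hypothetical ids, mirroring the commons-lang calls the hunk introduces:

import org.apache.commons.lang.StringUtils;

class ChunkIdSketch {
    public static void main(String[] args) {
        for (String documentId : new String[]{"8235", "8235_1", "8235_17"}) { // hypothetical ids
            // substringAfter returns "" when there is no separator, so a bare
            // object id is treated as page 0, matching the else branch above
            String chunk = StringUtils.substringAfter(documentId, "_");
            int page = StringUtils.isNotBlank(chunk) ? Integer.parseInt(chunk) : 0;
            System.out.println(documentId + " -> page " + page);
        }
    }
}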
@@ -268,63 +277,79 @@
     }
 
     @Override
+    @NbBundle.Messages({"AccountsText.getMarkup.noMatchMsg=" + "<html><pre><span style='background:yellow'>There were no keyword hits on this page. <br />"
+        + "The keyword could have been in the file name." + "<br />Advance to another page if present, or to view the original text, choose File Text" + "<br />in the drop down menu to the right...</span></pre></html>", "AccountsText.getMarkup.queryFailedMsg=" + "<html><pre><span style='background:yellow'>Failed to retrieve keyword hit results."
+        + " <br />Confirm that Autopsy can connect to the Solr server. " + "<br /></span></pre></html>"})
     public String getText() {
         loadPageInfo(); //inits once
 
-        String highLightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
-
         SolrQuery q = new SolrQuery();
         q.setShowDebugInfo(DEBUG); //debug
-        String query = keywords.stream().map(keyword -> "/.*" + KeywordSearchUtil.escapeLuceneQuery(keyword) + ".*/").collect(Collectors.joining(" "));
-        q.setQuery(LuceneQuery.HIGHLIGHT_FIELD_REGEX + ":" + query);
+        q.addHighlightField(HIGHLIGHT_FIELD);
+        q.setQuery(queryString);
 
-        String contentIdStr;
-        if (hasChunks) {
-            contentIdStr = solrObjectId + "_" + Integer.toString(this.currentPage);
-        } else {
-            contentIdStr = this.solrDocumentId;
-        }
+        //set the documentID filter
+        String queryDocumentID = this.solrObjectId + Server.ID_CHUNK_SEP + this.currentPage;
+        q.addFilterQuery(Server.Schema.ID.toString() + ":" + queryDocumentID);
 
-        final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
-        q.addFilterQuery(filterQuery);
-        q.addHighlightField(highLightField); //for exact highlighting, try content_ws field (with stored="true" in Solr schema)
-
-        //tune the highlighter
+        //configure the highlighter
         q.setParam("hl.useFastVectorHighlighter", "true"); //fast highlighter scales better than standard one NON-NLS
         q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
         q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
         q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
-
-        //docs says makes sense for the original Highlighter only, but not really
-        q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
+        q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //docs says makes sense for the original Highlighter only, but not really //NON-NLS
 
         try {
-            QueryResponse response = solrServer.query(q, METHOD.POST);
-            Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
-
-            Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);
-            if (responseHighlightID == null) {
-                return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
+            //extract highlighting and bail early on null responses
+            Map<String, Map<String, List<String>>> highlightingPerDocument = solrServer.query(q, METHOD.POST).getHighlighting();
+            Map<String, List<String>> highlightingPerField = highlightingPerDocument.get(queryDocumentID);
+            if (highlightingPerField == null) {
+                return Bundle.AccountsText_getMarkup_noMatchMsg();
             }
-            List<String> contentHighlights = responseHighlightID.get(highLightField);
-            if (contentHighlights == null) {
-                return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
-            } else {
-                // extracted content (minus highlight tags) is HTML-escaped
-                String highlightedContent = contentHighlights.get(0).trim();
-                highlightedContent = insertAnchors(highlightedContent);
-
" + highlightedContent + "
"; //NON-NLS + List highlights = highlightingPerField.get(HIGHLIGHT_FIELD); + if (highlights == null) { + return Bundle.AccountsText_getMarkup_noMatchMsg(); } + + //There should only be one item + String highlighting = highlights.get(0).trim(); + + /* + * use regex matcher to iterate over occurences of HIGHLIGHT_PRE, + * and prepend them with an anchor tag. + */ + Matcher m = ANCHOR_DETECTION_PATTERN.matcher(highlighting); + StringBuffer sb = new StringBuffer(highlighting.length()); + int count = 0; + while (m.find()) { + count++; + m.appendReplacement(sb, INSERT_PREFIX + count + INSERT_POSTFIX); + } + m.appendTail(sb); + + //store total hits for this page, now that we know it + this.numberOfHitsPerPage.put(this.currentPage, count); + if (this.currentItem() == 0 && this.hasNextItem()) { + this.nextItem(); + } + + // extracted content (minus highlight tags) is HTML-escaped + return "
" + sb.toString() + "
"; //NON-NLS } catch (Exception ex) { LOGGER.log(Level.WARNING, "Error executing Solr highlighting query: " + keywords, ex); //NON-NLS - return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.queryFailedMsg"); + return Bundle.AccountsText_getMarkup_queryFailedMsg(); } } @Override public String toString() { - return getDisplayName(); + return displayName; } @Override @@ -334,7 +359,7 @@ class AccountsText implements IndexedText, TextMarkupLookup { @Override public String getAnchorPrefix() { - return ANCHOR_PREFIX; + return ANCHOR_NAME_PREFIX; } @Override @@ -344,33 +369,4 @@ class AccountsText implements IndexedText, TextMarkupLookup { } return this.numberOfHitsPerPage.get(this.currentPage); } - - private String insertAnchors(String searchableContent) { - - final String insertPre = "$0"; //$0 will insert current regex match //NON-NLS - - Matcher m = Pattern.compile(HIGHLIGHT_PRE).matcher(searchableContent); - StringBuffer sb = new StringBuffer(searchableContent.length()); - int count; - for (count = 0; m.find(); count++) { - m.appendReplacement(sb, insertPre + count + insertPost); - } - m.appendTail(sb); - - //store total hits for this page, now that we know it - this.numberOfHitsPerPage.put(this.currentPage, count); - if (this.currentItem() == 0 && this.hasNextItem()) { - this.nextItem(); - } - - return sb.toString(); - } - - @Override - @Deprecated - // factory method to create an instance of this object - public AccountsText createInstance(long objectId, String keywordHitQuery, boolean isRegex, String originalQuery) { - return new AccountsText(String.valueOf(objectId), Collections.emptySet()); - } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentViewer.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentViewer.java index 2ffccf5ce3..2dfaa0b4ec 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentViewer.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentViewer.java @@ -184,7 +184,7 @@ public class ExtractedContentViewer implements DataContentViewer { */ BlackboardArtifact artifact = nodeLookup.lookup(BlackboardArtifact.class); if (null != artifact) { - /** + /* * For keyword hit artifacts, add the text of the artifact that hit, * not the hit artifact; otherwise add the text for the artifact. 
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java
index cd9d1572c9..33a5a1d443 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java
@@ -40,32 +40,32 @@ import java.util.Collection;
 import java.util.List;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 import java.util.logging.Level;
-import org.openide.util.NbBundle;
-import org.sleuthkit.autopsy.coreutils.Logger;
 import javax.swing.AbstractAction;
 import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.SolrRequest;
 import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
+import org.apache.solr.client.solrj.impl.XMLResponseParser;
 import org.apache.solr.client.solrj.request.CoreAdminRequest;
+import org.apache.solr.client.solrj.response.CoreAdminResponse;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.client.solrj.response.TermsResponse;
-import org.apache.solr.client.solrj.SolrRequest;
-import org.apache.solr.client.solrj.impl.HttpSolrServer;
-import org.apache.solr.common.util.NamedList;
-import org.openide.modules.InstalledFileLocator;
-import org.openide.modules.Places;
-import org.sleuthkit.autopsy.casemodule.Case;
-import org.sleuthkit.autopsy.coreutils.ModuleSettings;
-import org.sleuthkit.autopsy.coreutils.PlatformUtil;
-import org.sleuthkit.datamodel.Content;
-import org.apache.solr.common.SolrInputDocument;
-import org.apache.solr.client.solrj.impl.XMLResponseParser;
-import org.apache.solr.client.solrj.response.CoreAdminResponse;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.util.NamedList;
+import org.openide.modules.InstalledFileLocator;
+import org.openide.modules.Places;
+import org.openide.util.NbBundle;
+import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.casemodule.Case.CaseType;
-import org.sleuthkit.autopsy.coreutils.UNCPathUtilities;
 import org.sleuthkit.autopsy.core.UserPreferences;
+import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.autopsy.coreutils.ModuleSettings;
+import org.sleuthkit.autopsy.coreutils.PlatformUtil;
+import org.sleuthkit.autopsy.coreutils.UNCPathUtilities;
+import org.sleuthkit.datamodel.Content;
 
 /**
  * Handles management of a either a local or centralized Solr server and its
  * cores.
  */
 public class Server {
@@ -157,7 +157,7 @@ public class Server {
     private static final Logger logger = Logger.getLogger(Server.class.getName());
     private static final String DEFAULT_CORE_NAME = "coreCase"; //NON-NLS
     public static final String CORE_EVT = "CORE_EVT"; //NON-NLS
-    public static final char ID_CHUNK_SEP = '_';
+    public static final String ID_CHUNK_SEP = "_";
     private String javaPath = "java"; //NON-NLS
     public static final Charset DEFAULT_INDEXED_TEXT_CHARSET = Charset.forName("UTF-8"); ///< default Charset to index text as
     private static final int MAX_SOLR_MEM_MB = 512; //TODO set dynamically based on avail. system resources
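A final note on the ID_CHUNK_SEP change just above: it becomes a String rather than a char because commons-lang's StringUtils.substringAfter(String, String) takes a String separator, while string concatenation behaves the same either way. A tiny sketch with hypothetical values:

import org.apache.commons.lang.StringUtils;

class SeparatorSketch {
    public static void main(String[] args) {
        String idChunkSep = "_"; // previously char '_'
        long solrObjectId = 8235L;
        int currentPage = 3;
        String queryDocumentID = solrObjectId + idChunkSep + currentPage; // "8235_3"
        System.out.println(StringUtils.substringAfter(queryDocumentID, idChunkSep)); // prints 3
    }
}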