cleanup AccountsText even more.

This commit is contained in:
jmillman 2016-07-28 17:07:09 -04:00
parent 2e2d2e2fcd
commit bcc45285ee
3 changed files with 127 additions and 131 deletions

View File

@ -19,7 +19,6 @@
package org.sleuthkit.autopsy.keywordsearch;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
@ -31,6 +30,7 @@ import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest.METHOD;
import org.apache.solr.client.solrj.response.QueryResponse;
@ -38,39 +38,48 @@ import org.apache.solr.common.SolrDocument;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.Version;
import org.sleuthkit.autopsy.datamodel.TextMarkupLookup;
/**
* Highlights account hits for a given document. Knows about pages and such for
* the content viewer.
*
* Note: This class started as a copy-and-paste of HighlightedText, but it
* proved too messy to modify HighlightedText to work for accounts also. This
* and HighlightedText are very similar and could probably use some refactoring
* to reduce code duplication.
*/
class AccountsText implements IndexedText, TextMarkupLookup {
class AccountsText implements IndexedText {
private static final Logger LOGGER = Logger.getLogger(AccountsText.class.getName());
private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
private static final String HIGHLIGHT_PRE = "<span style='background:yellow'>"; //NON-NLS
private static final String HIGHLIGHT_POST = "</span>"; //NON-NLS
private static final String ANCHOR_PREFIX = AccountsText.class.getName() + "_";
private static final String ANCHOR_NAME_PREFIX = AccountsText.class.getName() + "_";
private static final String INSERT_PREFIX = "<a name='" + ANCHOR_NAME_PREFIX; //NON-NLS
private static final String INSERT_POSTFIX = "'></a>$0"; //$0 will insert current regex match //NON-NLS
private static final Pattern ANCHOR_DETECTION_PATTERN = Pattern.compile(HIGHLIGHT_PRE);
private static final String HIGHLIGHT_FIELD = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
private final String solrDocumentId;
private final Set<String> keywords = new HashSet<>();
private final Server solrServer;
private int numberPagesForFile = 0;
private int currentPage = 0;
private boolean hasChunks = false;
//stores all pages/chunks that have hits as key, and number of hits as a value, or 0 if yet unknown
private final LinkedHashMap<Integer, Integer> numberOfHitsPerPage = new LinkedHashMap<>();
//stored page num -> current hit number mapping
private final HashMap<Integer, Integer> currentHitPerPage = new HashMap<>();
private final List<Integer> pages = new ArrayList<>();
private boolean isPageInfoLoaded = false;
private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
private final String displayName;
private final String solrDocumentId;
private final long solrObjectId;
private final Integer chunkId;
private final Set<String> keywords = new HashSet<>();
private final String displayName;
private final String queryString;
String getDisplayName() {
return displayName;
}
private boolean isPageInfoLoaded = false;
private int numberPagesForFile = 0;
private int currentPage = 0;
//list of pages, used for iterating back and forth. Only stores pages with hits
private final List<Integer> pages = new ArrayList<>();
//map from page/chunk to number of hits. value is 0 if not yet known.
private final LinkedHashMap<Integer, Integer> numberOfHitsPerPage = new LinkedHashMap<>();
//map from page/chunk number to current hit on that page.
private final HashMap<Integer, Integer> currentHitPerPage = new HashMap<>();
@NbBundle.Messages({
"AccountsText.creditCardNumber=Credit Card Number",
@ -78,6 +87,13 @@ class AccountsText implements IndexedText, TextMarkupLookup {
AccountsText(String objectId, Set<String> keywords) {
this.solrDocumentId = objectId;
this.keywords.addAll(keywords);
//build the query string
this.queryString = HIGHLIGHT_FIELD + ":"
+ keywords.stream()
.map(keyword -> "/.*?" + KeywordSearchUtil.escapeLuceneQuery(keyword) + ".*?/")//surround each "keyword" with match anything regex.
.collect(Collectors.joining(" ")); //collect as space separated string
this.solrServer = KeywordSearch.getServer();
final int separatorIndex = solrDocumentId.indexOf(Server.ID_CHUNK_SEP);
@ -123,22 +139,24 @@ class AccountsText implements IndexedText, TextMarkupLookup {
}
@Override
@NbBundle.Messages("AccountsText.nextPage.exception.msg=No next page.")
public int nextPage() {
if (hasNextPage()) {
currentPage = pages.get(pages.indexOf(this.currentPage) + 1);
return currentPage;
} else {
throw new IllegalStateException(NbBundle.getMessage(AccountsText.class, "HighlightedMatchesSource.nextPage.exception.msg"));
throw new IllegalStateException(Bundle.AccountsText_nextPage_exception_msg());
}
}
@Override
@NbBundle.Messages("AccountsText.previousPage.exception.msg=No previous page.")
public int previousPage() {
if (hasPreviousPage()) {
currentPage = pages.get(pages.indexOf(this.currentPage) - 1);
return currentPage;
} else {
throw new IllegalStateException(NbBundle.getMessage(AccountsText.class, "HighlightedMatchesSource.previousPage.exception.msg"));
throw new IllegalStateException(Bundle.AccountsText_previousPage_exception_msg());
}
}
@ -161,20 +179,22 @@ class AccountsText implements IndexedText, TextMarkupLookup {
}
@Override
@NbBundle.Messages("AccountsText.nextItem.exception.msg=No next item.")
public int nextItem() {
if (hasNextItem()) {
return currentHitPerPage.merge(currentPage, 1, Integer::sum);
} else {
throw new IllegalStateException(NbBundle.getMessage(AccountsText.class, "HighlightedMatchesSource.nextItem.exception.msg"));
throw new IllegalStateException(Bundle.AccountsText_nextItem_exception_msg());
}
}
@Override
@NbBundle.Messages("AccountsText.previousItem.exception.msg=No previous item.")
public int previousItem() {
if (hasPreviousItem()) {
return currentHitPerPage.merge(currentPage, -1, Integer::sum);
} else {
throw new IllegalStateException(NbBundle.getMessage(AccountsText.class, "HighlightedMatchesSource.previousItem.exception.msg"));
throw new IllegalStateException(Bundle.AccountsText_previousItem_exception_msg());
}
}
@ -193,54 +213,43 @@ class AccountsText implements IndexedText, TextMarkupLookup {
}
/**
* The main goal of this method is to figure out which pages / chunks have
* hits.
* Initialize this object with information about which pages/chunks have
* hits. Multiple calls will not change the initial results.
*/
synchronized private void loadPageInfo() {
if (isPageInfoLoaded) {
return;
}
if (chunkId != null) {
//if a chunk is specified, only show that chunk/page
this.numberPagesForFile = 1;
hasChunks = false;
//no chunks
if (chunkId != null) {//if a chunk is specified, only show that chunk/page
this.numberPagesForFile = 1;
this.currentPage = chunkId;
numberOfHitsPerPage.put(chunkId, 0);
pages.add(chunkId);
currentHitPerPage.put(chunkId, 0);
this.numberOfHitsPerPage.put(chunkId, 0);
this.pages.add(chunkId);
this.currentHitPerPage.put(chunkId, 0);
} else {
hasChunks = true;
try {
this.numberPagesForFile = solrServer.queryNumFileChunks(this.solrObjectId);
} catch (KeywordSearchModuleException | NoOpenCoreException ex) {
LOGGER.log(Level.WARNING, "Could not get number pages for content: {0}", this.solrDocumentId); //NON-NLS
LOGGER.log(Level.WARNING, "Could not get number pages for content " + this.solrDocumentId, ex); //NON-NLS
return;
}
//if has chunks, get pages with hits
TreeSet<Integer> sortedPagesWithHits = new TreeSet<>();
//extract pages of interest, sorted
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
String query = keywords.stream().map(keyword -> "/.*" + KeywordSearchUtil.escapeLuceneQuery(keyword) + ".*/").collect(Collectors.joining(" "));
q.setQuery(LuceneQuery.HIGHLIGHT_FIELD_REGEX + ":" + query);
q.setFields("id");
if (chunkId == null) {
q.addFilterQuery(Server.Schema.ID.toString() + ":" + this.solrObjectId + "_*");
} else {
q.addFilterQuery(Server.Schema.ID.toString() + ":" + this.solrDocumentId);
}
q.setQuery(queryString);
q.setFields(Server.Schema.ID.toString()); //for this case we only need the document ids
q.addFilterQuery(Server.Schema.ID.toString() + ":" + this.solrObjectId + Server.ID_CHUNK_SEP + "*");
try {
QueryResponse response = solrServer.query(q, METHOD.POST);
for (SolrDocument resultDoc : response.getResults()) {
final String resultDocumentId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString();
// Put the solr chunk id in the map
final int separatorIndex = resultDocumentId.indexOf(Server.ID_CHUNK_SEP);
if (-1 != separatorIndex) {
sortedPagesWithHits.add(Integer.parseInt(resultDocumentId.substring(separatorIndex + 1)));
String resultChunkID = StringUtils.substringAfter(resultDocumentId, Server.ID_CHUNK_SEP);
if (StringUtils.isNotBlank(resultChunkID)) {
sortedPagesWithHits.add(Integer.parseInt(resultChunkID));
} else {
sortedPagesWithHits.add(0);
}
@ -268,93 +277,59 @@ class AccountsText implements IndexedText, TextMarkupLookup {
}
@Override
@NbBundle.Messages({"AccountsText.getMarkup.noMatchMsg="
+ "<html><pre><span style\\\\='background\\\\:yellow'>There were no keyword hits on this page. <br />"
+ "The keyword could have been in the file name."
+ " <br />Advance to another page if present, or to view the original text, choose File Text"
+ " <br />in the drop down menu to the right...</span></pre></html>",
"AccountsText.getMarkup.queryFailedMsg="
+ "<html><pre><span style\\\\='background\\\\:yellow'>Failed to retrieve keyword hit results."
+ " <br />Confirm that Autopsy can connect to the Solr server. "
+ "<br /></span></pre></html>"})
public String getText() {
loadPageInfo(); //inits once
String highLightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
String query = keywords.stream().map(keyword -> "/.*" + KeywordSearchUtil.escapeLuceneQuery(keyword) + ".*/").collect(Collectors.joining(" "));
q.setQuery(LuceneQuery.HIGHLIGHT_FIELD_REGEX + ":" + query);
q.addHighlightField(HIGHLIGHT_FIELD);
q.setQuery(queryString);
String contentIdStr;
if (hasChunks) {
contentIdStr = solrObjectId + "_" + Integer.toString(this.currentPage);
} else {
contentIdStr = this.solrDocumentId;
}
//set the documentID filter
String queryDocumentID = this.solrObjectId + Server.ID_CHUNK_SEP + this.currentPage;
q.addFilterQuery(Server.Schema.ID.toString() + ":" + queryDocumentID);
final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
q.addFilterQuery(filterQuery);
q.addHighlightField(highLightField); //for exact highlighting, try content_ws field (with stored="true" in Solr schema)
//tune the highlighter
//configure the highlighter
q.setParam("hl.useFastVectorHighlighter", "true"); //fast highlighter scales better than standard one NON-NLS
q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
//docs says makes sense for the original Highlighter only, but not really
q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //docs says makes sense for the original Highlighter only, but not really //NON-NLS
try {
QueryResponse response = solrServer.query(q, METHOD.POST);
Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);
if (responseHighlightID == null) {
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
}
List<String> contentHighlights = responseHighlightID.get(highLightField);
if (contentHighlights == null) {
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
} else {
// extracted content (minus highlight tags) is HTML-escaped
String highlightedContent = contentHighlights.get(0).trim();
highlightedContent = insertAnchors(highlightedContent);
return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
}
} catch (Exception ex) {
LOGGER.log(Level.WARNING, "Error executing Solr highlighting query: " + keywords, ex); //NON-NLS
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.queryFailedMsg");
//extract highlighting and bail early on null responses
Map<String, Map<String, List<String>>> highlightingPerDocument = solrServer.query(q, METHOD.POST).getHighlighting();
Map<String, List<String>> highlightingPerField = highlightingPerDocument.get(queryDocumentID);
if (highlightingPerField == null) {
return Bundle.AccountsText_getMarkup_noMatchMsg();
}
List<String> highlights = highlightingPerField.get(HIGHLIGHT_FIELD);
if (highlights == null) {
return Bundle.AccountsText_getMarkup_noMatchMsg();
}
@Override
public String toString() {
return getDisplayName();
}
//There should only be one item
String highlighting = highlights.get(0).trim();
@Override
public boolean isSearchable() {
return true;
}
@Override
public String getAnchorPrefix() {
return ANCHOR_PREFIX;
}
@Override
public int getNumberHits() {
if (!this.numberOfHitsPerPage.containsKey(this.currentPage)) {
return 0;
}
return this.numberOfHitsPerPage.get(this.currentPage);
}
private String insertAnchors(String searchableContent) {
final String insertPre = "<a name='" + ANCHOR_PREFIX; //NON-NLS
final String insertPost = "'></a>$0"; //$0 will insert current regex match //NON-NLS
Matcher m = Pattern.compile(HIGHLIGHT_PRE).matcher(searchableContent);
StringBuffer sb = new StringBuffer(searchableContent.length());
int count;
for (count = 0; m.find(); count++) {
m.appendReplacement(sb, insertPre + count + insertPost);
/*
* use regex matcher to iterate over occurences of HIGHLIGHT_PRE,
* and prepend them with an anchor tag.
*/
Matcher m = ANCHOR_DETECTION_PATTERN.matcher(highlighting);
StringBuffer sb = new StringBuffer(highlighting.length());
int count = 0;
while (m.find()) {
count++;
m.appendReplacement(sb, INSERT_PREFIX + count + INSERT_POSTFIX);
}
m.appendTail(sb);
@ -364,13 +339,34 @@ class AccountsText implements IndexedText, TextMarkupLookup {
this.nextItem();
}
return sb.toString();
// extracted content (minus highlight tags) is HTML-escaped
return "<html><pre>" + sb.toString() + "</pre></html>"; //NON-NLS
} catch (Exception ex) {
LOGGER.log(Level.WARNING, "Error executing Solr highlighting query: " + keywords, ex); //NON-NLS
return Bundle.AccountsText_getMarkup_queryFailedMsg();
}
}
@Override
@Deprecated
// factory method to create an instance of this object
public AccountsText createInstance(long objectId, String keywordHitQuery, boolean isRegex, String originalQuery) {
return new AccountsText(String.valueOf(objectId), Collections.emptySet());
public String toString() {
return displayName;
}
@Override
public boolean isSearchable() {
return true;
}
@Override
public String getAnchorPrefix() {
return ANCHOR_NAME_PREFIX;
}
@Override
public int getNumberHits() {
if (!this.numberOfHitsPerPage.containsKey(this.currentPage)) {
return 0;
}
return this.numberOfHitsPerPage.get(this.currentPage);
}
}

View File

@ -184,7 +184,7 @@ public class ExtractedContentViewer implements DataContentViewer {
*/
BlackboardArtifact artifact = nodeLookup.lookup(BlackboardArtifact.class);
if (null != artifact) {
/**
/*
* For keyword hit artifacts, add the text of the artifact that hit,
* not the hit artifact; otherwise add the text for the artifact.
*/

View File

@ -40,32 +40,32 @@ import java.util.Collection;
import java.util.List;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.logging.Level;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import javax.swing.AbstractAction;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.impl.XMLResponseParser;
import org.apache.solr.client.solrj.request.CoreAdminRequest;
import org.apache.solr.client.solrj.response.CoreAdminResponse;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.TermsResponse;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.util.NamedList;
import org.openide.modules.InstalledFileLocator;
import org.openide.modules.Places;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.ModuleSettings;
import org.sleuthkit.autopsy.coreutils.PlatformUtil;
import org.sleuthkit.datamodel.Content;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.client.solrj.impl.XMLResponseParser;
import org.apache.solr.client.solrj.response.CoreAdminResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.NamedList;
import org.openide.modules.InstalledFileLocator;
import org.openide.modules.Places;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.Case.CaseType;
import org.sleuthkit.autopsy.coreutils.UNCPathUtilities;
import org.sleuthkit.autopsy.core.UserPreferences;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.ModuleSettings;
import org.sleuthkit.autopsy.coreutils.PlatformUtil;
import org.sleuthkit.autopsy.coreutils.UNCPathUtilities;
import org.sleuthkit.datamodel.Content;
/**
* Handles management of a either a local or centralized Solr server and its
@ -157,7 +157,7 @@ public class Server {
private static final Logger logger = Logger.getLogger(Server.class.getName());
private static final String DEFAULT_CORE_NAME = "coreCase"; //NON-NLS
public static final String CORE_EVT = "CORE_EVT"; //NON-NLS
public static final char ID_CHUNK_SEP = '_';
public static final String ID_CHUNK_SEP = "_";
private String javaPath = "java"; //NON-NLS
public static final Charset DEFAULT_INDEXED_TEXT_CHARSET = Charset.forName("UTF-8"); ///< default Charset to index text as
private static final int MAX_SOLR_MEM_MB = 512; //TODO set dynamically based on avail. system resources