diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AccountsText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AccountsText.java index 1a110ef9b6..a3d177d3e1 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AccountsText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AccountsText.java @@ -61,7 +61,7 @@ class AccountsText implements IndexedText { private static final String INSERT_POSTFIX = "'>$0"; //$0 will insert current regex match //NON-NLS private static final Pattern ANCHOR_DETECTION_PATTERN = Pattern.compile(HIGHLIGHT_PRE); - private static final String HIGHLIGHT_FIELD = LuceneQuery.HIGHLIGHT_FIELD_REGEX; + private static final String HIGHLIGHT_FIELD = LuceneQuery.HIGHLIGHT_FIELD; private final Server solrServer; private final String solrDocumentId; diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java index fd9a497b97..8eb782cac2 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java @@ -25,12 +25,15 @@ import java.util.List; import java.util.Map; import java.util.TreeSet; import java.util.logging.Level; +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.commons.lang.StringUtils; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrRequest.METHOD; import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocumentList; import org.openide.util.NbBundle.Messages; import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil; import org.sleuthkit.autopsy.coreutils.Version; @@ -82,9 +85,23 @@ class HighlightedText implements IndexedText, TextMarkupLookup { } - //when the results are not known and need to requery to 
get hits - HighlightedText(long objectId, String solrQuery, boolean isRegex, String originalQuery) { - this(objectId, KeywordSearchUtil.quoteQuery(solrQuery), isRegex); + /** + * This constructor is used when keyword hits are accessed from the + * "Keyword Hits" node in the directory tree in Autopsy. + * In that case we only have the keyword for which a hit had + * previously been found so we will need to re-query to find hits + * for the keyword. + * + * @param objectId + * @param keyword The keyword that was found previously (e.g. during ingest) + * @param isRegex true if the keyword was found via a regular expression search + * @param originalQuery The original query string that produced the hit. If + * isRegex is true, this will be the regular expression that produced the hit. + */ + HighlightedText(long objectId, String keyword, boolean isRegex, String originalQuery) { + // The keyword can be treated as a literal hit at this point so we + // surround it in quotes. + this(objectId, KeywordSearchUtil.quoteQuery(keyword), isRegex); this.originalQuery = originalQuery; } @@ -123,25 +140,15 @@ class HighlightedText implements IndexedText, TextMarkupLookup { hasChunks = true; } - //if has chunks, get pages with hits + // if the file has chunks, get pages with hits, sorted if (hasChunks) { - //extract pages of interest, sorted - /* * If this is being called from the artifacts / dir tree, then we * need to perform the search to get the highlights. */ if (hits == null) { - // I don't undertand how we could get into this code with a regex query. - // Won't all regex queries have been resolved to actual literal keyword hits - // by the time we attempt to load page content? EGS. -// String queryStr = KeywordSearchUtil.escapeLuceneQuery(this.keywordHitQuery); -// if (isRegex) { -// //use white-space sep. 
field to get exact matches only of regex query result -// queryStr = Server.Schema.CONTENT_WS + ":" + "\"" + queryStr + "\""; -// } + Keyword keywordQuery = new Keyword(keywordHitQuery, !isRegex); - Keyword keywordQuery = new Keyword(this.keywordHitQuery, !isRegex); List keywords = new ArrayList<>(); keywords.add(keywordQuery); KeywordSearchQuery chunksQuery = new LuceneQuery(new KeywordList(keywords), keywordQuery); @@ -303,14 +310,6 @@ class HighlightedText implements IndexedText, TextMarkupLookup { public String getText() { loadPageInfo(); //inits once - String highLightField = null; - - if (isRegex) { - highLightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX; - } else { - highLightField = LuceneQuery.HIGHLIGHT_FIELD_LITERAL; - } - SolrQuery q = new SolrQuery(); q.setShowDebugInfo(DEBUG); //debug @@ -324,10 +323,8 @@ class HighlightedText implements IndexedText, TextMarkupLookup { final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr); q.addFilterQuery(filterQuery); - q.addHighlightField(highLightField); //for exact highlighting, try content_ws field (with stored="true" in Solr schema) + q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD); - //q.setHighlightSimplePre(HIGHLIGHT_PRE); //original highlighter only - //q.setHighlightSimplePost(HIGHLIGHT_POST); //original highlighter only q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH //tune the highlighter @@ -341,22 +338,33 @@ class HighlightedText implements IndexedText, TextMarkupLookup { try { QueryResponse response = solrServer.query(q, METHOD.POST); + + // There should never be more than one document since there will + // either be a single chunk containing hits or we narrow our + // query down to the current page/chunk. 
+ if (response.getResults().size() > 1) { + logger.log(Level.WARNING, "Unexpected number of results for Solr highlighting query: {0}", keywordHitQuery); //NON-NLS + } + Map>> responseHighlight = response.getHighlighting(); Map> responseHighlightID = responseHighlight.get(contentIdStr); - if (responseHighlightID == null) { - return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg"); - } - List contentHighlights = responseHighlightID.get(highLightField); - if (contentHighlights == null) { - return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg"); - } else { - // extracted content (minus highlight tags) is HTML-escaped - String highlightedContent = contentHighlights.get(0).trim(); - highlightedContent = insertAnchors(highlightedContent); + String highlightedContent; - return "
" + highlightedContent + "
"; //NON-NLS + if (responseHighlightID == null) { + highlightedContent = attemptManualHighlighting(response.getResults()); + } else { + List contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD); + if (contentHighlights == null) { + highlightedContent = attemptManualHighlighting(response.getResults()); + } else { + // extracted content (minus highlight tags) is HTML-escaped + highlightedContent = contentHighlights.get(0).trim(); + } } + highlightedContent = insertAnchors(highlightedContent); + + return "
" + highlightedContent + "
"; //NON-NLS } catch (Exception ex) { logger.log(Level.WARNING, "Error executing Solr highlighting query: " + keywordHitQuery, ex); //NON-NLS return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.queryFailedMsg"); @@ -386,6 +394,73 @@ class HighlightedText implements IndexedText, TextMarkupLookup { return this.hitsPages.get(this.currentPage); } + /** + * If the Solr query does not produce valid highlighting, we attempt to + * add the highlighting ourselves. We do this by taking the text returned + * from the document that contains a hit and searching that text for the + * keyword that produced the hit. + * + * @param solrDocumentList The list of Solr documents returned in response + * to a Solr query. We expect there to only ever be a single document. + * + * @return Either a string with the keyword highlighted or a string + * indicating that we did not find a hit in the document. + */ + private String attemptManualHighlighting(SolrDocumentList solrDocumentList) { + if (solrDocumentList.isEmpty()) { + return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg"); + } + + // It doesn't make sense for there to be more than a single document in + // the list since this class presents a single page (document) of highlighted + // content at a time. + String text = solrDocumentList.get(0).getOrDefault(Server.Schema.TEXT.toString(), "").toString(); + + // Escape any HTML content that may be in the text. This is needed in + // order to correctly display the text in the content viewer. + // Must be done before highlighting tags are added. If we were to + // perform HTML escaping after adding the highlighting tags we would + // not see highlighted text in the content viewer. + text = StringEscapeUtils.escapeHtml(text); + + StringBuilder highlightedText = new StringBuilder(""); + + int textOffset = 0; + // Remove quotes from around the keyword. 
+ String unquotedKeyword = StringUtils.strip(keywordHitQuery, "\""); + // Find the first (if any) hit. An empty keyword can never produce a hit. + int hitOffset = unquotedKeyword.isEmpty() ? -1 : text.indexOf(unquotedKeyword, textOffset); + + while (hitOffset != -1) { + // Append the portion of text up to (but not including) the hit. + highlightedText.append(text.substring(textOffset, hitOffset)); + // Add in the highlighting around the keyword. + highlightedText.append(HIGHLIGHT_PRE); + highlightedText.append(unquotedKeyword); + highlightedText.append(HIGHLIGHT_POST); + + // Advance to the first character after the keyword (skipping none). + textOffset = hitOffset + unquotedKeyword.length(); + // Search for the next keyword hit in the text. + hitOffset = text.indexOf(unquotedKeyword, textOffset); + } + + if (highlightedText.length() > 0) { + // Append the remainder of text field and return. + highlightedText.append(text.substring(textOffset, text.length())); + return highlightedText.toString(); + } else { + return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg"); + } + } + + /** + * Anchors are used to navigate back and forth between hits on the same + * page and to navigate to hits on the next/previous page. 
+ * + * @param searchableContent + * @return + */ private String insertAnchors(String searchableContent) { int searchOffset = 0; int index = -1; diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java index 8d940468ae..a59fba872c 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java @@ -298,13 +298,7 @@ class KeywordSearchResultFactory extends ChildFactory { */ private String constructEscapedSolrQuery(String query, boolean literal_query) { StringBuilder highlightQuery = new StringBuilder(); - String highLightField; - if (literal_query) { - highLightField = LuceneQuery.HIGHLIGHT_FIELD_LITERAL; - } else { - highLightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX; - } - highlightQuery.append(highLightField).append(":").append("\"").append(KeywordSearchUtil.escapeLuceneQuery(query)).append("\""); + highlightQuery.append(LuceneQuery.HIGHLIGHT_FIELD).append(":").append("\"").append(KeywordSearchUtil.escapeLuceneQuery(query)).append("\""); return highlightQuery.toString(); } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java index e24b8ec712..695165cc32 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java @@ -59,11 +59,7 @@ class LuceneQuery implements KeywordSearchQuery { private String field = null; private static final int MAX_RESULTS = 20000; static final int SNIPPET_LENGTH = 50; - //can use different highlight schema fields for regex and literal search - static final String HIGHLIGHT_FIELD_LITERAL = Server.Schema.TEXT.toString(); - static final String HIGHLIGHT_FIELD_REGEX = 
Server.Schema.TEXT.toString(); - //TODO use content_ws stored="true" in solr schema for perfect highlight hits - //static final String HIGHLIGHT_FIELD_REGEX = Server.Schema.CONTENT_WS.toString() + static final String HIGHLIGHT_FIELD = Server.Schema.TEXT.toString(); private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT); @@ -250,13 +246,15 @@ class LuceneQuery implements KeywordSearchQuery { private SolrQuery createAndConfigureSolrQuery(boolean snippets) { SolrQuery q = new SolrQuery(); q.setShowDebugInfo(DEBUG); //debug - //set query, force quotes/grouping around all literal queries - final String groupedQuery = KeywordSearchUtil.quoteQuery(keywordStringEscaped); - String theQueryStr = groupedQuery; + // Wrap the query string in quotes if this is a literal search term. + String theQueryStr = keyword.searchTermIsLiteral() + ? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped; + + // Run the query against an optional alternative field. 
if (field != null) { //use the optional field StringBuilder sb = new StringBuilder(); - sb.append(field).append(":").append(groupedQuery); + sb.append(field).append(":").append(theQueryStr); theQueryStr = sb.toString(); } q.setQuery(theQueryStr); @@ -345,20 +343,13 @@ class LuceneQuery implements KeywordSearchQuery { public static String querySnippet(String query, long solrObjectId, int chunkID, boolean isRegex, boolean group) throws NoOpenCoreException { Server solrServer = KeywordSearch.getServer(); - String highlightField; - if (isRegex) { - highlightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX; - } else { - highlightField = LuceneQuery.HIGHLIGHT_FIELD_LITERAL; - } - SolrQuery q = new SolrQuery(); String queryStr; if (isRegex) { StringBuilder sb = new StringBuilder(); - sb.append(highlightField).append(":"); + sb.append(LuceneQuery.HIGHLIGHT_FIELD).append(":"); if (group) { sb.append("\""); } @@ -387,7 +378,7 @@ class LuceneQuery implements KeywordSearchQuery { String idQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIDStr); q.setShowDebugInfo(DEBUG); //debug q.addFilterQuery(idQuery); - q.addHighlightField(highlightField); + q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD); //q.setHighlightSimplePre("«"); //original highlighter only //q.setHighlightSimplePost("»"); //original highlighter only q.setHighlightSnippets(1); @@ -413,7 +404,7 @@ class LuceneQuery implements KeywordSearchQuery { if (responseHighlightID == null) { return ""; } - List contentHighlights = responseHighlightID.get(highlightField); + List contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD); if (contentHighlights == null) { return ""; } else { diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java index 328155a4b0..6501985dee 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java +++ 
b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java @@ -78,6 +78,19 @@ final class RegexQuery implements KeywordSearchQuery { private boolean escaped; private String escapedQuery; + // These are the valid characters that can appear immediately before a + // keyword hit. e.g. for an IP address regex we support finding the string + // ",10.0.0.0" but not "?10.0.0.0". + private static final String BOUNDARY_PREFIX_CHARS = "(\\s|\\[|\\(|,|\\:)"; //NON-NLS + + // These are the valid characters that can appear immediately after a + // keyword hit. e.g. for an IP address regex we support finding the string + // "10.0.0.0?]" but not "10.0.0.0&". + private static final String BOUNDARY_SUFFIX_CHARS = "(\\s|\\]|\\)|,|!|\\?|\\:)"; //NON-NLS + + private boolean queryStringContainsWildcardPrefix = false; + private boolean queryStringContainsWildcardSuffix = false; + /** * Constructor with query to process. * @@ -88,6 +101,14 @@ final class RegexQuery implements KeywordSearchQuery { this.keywordList = keywordList; this.keyword = keyword; this.keywordString = keyword.getSearchTerm(); + + if (this.keywordString.startsWith(".*")) { + this.queryStringContainsWildcardPrefix = true; + } + + if (this.keywordString.endsWith(".*")) { + this.queryStringContainsWildcardSuffix = true; + } } @Override @@ -120,7 +141,28 @@ final class RegexQuery implements KeywordSearchQuery { SolrQuery solrQuery = new SolrQuery(); solrQuery.setShowDebugInfo(true); //debug - solrQuery.setQuery((field == null ? Server.Schema.CONTENT_STR.toString() : field) + ":/.*" + getQueryString() + ".*/"); + /** + * The provided regular expression may include wildcards at the + * beginning and/or end. These wildcards are used to indicate that + * the user wants to find hits for the regex that are embedded + * within other characters. For example, if we are given .*127.0.0.1.* + * as a regular expression, this will produce hits for: + * (a) " 127.0.0.1 " as a standalone token (surrounded by whitespace). 
+ * (b) "abc127.0.0.1def" where the IP address is surrounded by other characters. + * + * If we are given this type of regex, we do not need to add our own + * wildcards to anchor the query. Otherwise, we need to add wildcard + * anchors because Lucene string regex searches default to using ^ and $ + * to match the entire string. + */ + + // We construct the query by surrounding it with slashes (to indicate it is + // a regular expression search) and .* as anchors (if the query doesn't + // already have them). + solrQuery.setQuery((field == null ? Server.Schema.CONTENT_STR.toString() : field) + ":/" + + (queryStringContainsWildcardPrefix ? "" : ".*") + getQueryString() + + (queryStringContainsWildcardSuffix ? "" : ".*") + "/"); + solrQuery.setRows(MAX_RESULTS); // Set the fields we want to have returned by the query. @@ -173,22 +215,50 @@ final class RegexQuery implements KeywordSearchQuery { List hits = new ArrayList<>(); final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString(); - String content = solrDoc.getOrDefault(Server.Schema.CONTENT_STR.toString(), "").toString(); + String content = solrDoc.getOrDefault(Server.Schema.CONTENT_STR.toString(), "").toString(); //NON-NLS - Matcher hitMatcher = Pattern.compile(keywordString).matcher(content); + // By default, we create keyword hits on whitespace or punctuation character boundaries. + // Having a set of well defined boundary characters produces hits that can + // subsequently be matched for highlighting against the tokens produced by + // the standard tokenizer. + // This behavior can be overridden by the user if they give us a search string + // with .* at either the start and/or end of the string. This basically tells us find + // all hits instead of the ones surrounded by one of our boundary characters. + String keywordTokenRegex = + // If the given search string starts with .*, we ignore our default + // boundary prefix characters + (queryStringContainsWildcardPrefix ? 
"" : BOUNDARY_PREFIX_CHARS) //NON-NLS + + keywordString + // If the given search string ends with .*, we ignore our default + // boundary suffix characters + + (queryStringContainsWildcardSuffix ? "" : BOUNDARY_SUFFIX_CHARS); //NON-NLS + + Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content); while (hitMatcher.find()) { - String snippet = ""; - final String hit = hitMatcher.group(); + StringBuilder snippet = new StringBuilder(); + String hit = hitMatcher.group(); + + // Remove leading and trailing boundary characters. + if (!queryStringContainsWildcardPrefix) { + hit = hit.replaceAll("^" + BOUNDARY_PREFIX_CHARS, ""); //NON-NLS + } + if (!queryStringContainsWildcardSuffix) { + hit = hit.replaceAll(BOUNDARY_SUFFIX_CHARS + "$", ""); //NON-NLS + } + /* * If searching for credit card account numbers, do a Luhn check * on the term and discard it if it does not pass. */ if (keyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) { Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit); - ccnMatcher.find(); - final String ccn = CharMatcher.anyOf(" -").removeFrom(ccnMatcher.group("ccn")); - if (false == TermsComponentQuery.CREDIT_CARD_NUM_LUHN_CHECK.isValid(ccn)) { + if (ccnMatcher.find()) { + final String ccn = CharMatcher.anyOf(" -").removeFrom(ccnMatcher.group("ccn")); + if (false == TermsComponentQuery.CREDIT_CARD_NUM_LUHN_CHECK.isValid(ccn)) { + continue; + } + } else { continue; } } @@ -199,12 +269,14 @@ final class RegexQuery implements KeywordSearchQuery { */ if (KeywordSearchSettings.getShowSnippets()) { int maxIndex = content.length() - 1; - snippet = content.substring(Integer.max(0, hitMatcher.start() - 30), Integer.max(0, hitMatcher.start() - 1)); - snippet += "<<" + hit + "<<"; - snippet += content.substring(Integer.min(maxIndex, hitMatcher.end() + 1), Integer.min(maxIndex, hitMatcher.end() + 30)); + snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 30), Integer.max(0, 
hitMatcher.start() - 1))); + snippet.appendCodePoint(171); + snippet.append(hit); + snippet.appendCodePoint(171); + snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end() + 1), Integer.min(maxIndex, hitMatcher.end() + 30))); } - hits.add(new KeywordHit(docId, snippet, hit)); + hits.add(new KeywordHit(docId, snippet.toString(), hit)); } return hits; }