Merge pull request #2506 from esaunders/remove-hit-trimming

Remove hit trimming
This commit is contained in:
Richard Cordovano 2017-02-07 16:26:10 -05:00 committed by GitHub
commit 405e4598a6
2 changed files with 6 additions and 34 deletions

View File

@@ -66,7 +66,7 @@ ExtractedContentViewer.getTitle=Indexed Text
ExtractedContentViewer.getSolrContent.knownFileMsg=<p style\=''font-style\:italic''>{0} is a known file (based on MD5 hash) and does not have text in the index.</p>
ExtractedContentViewer.getSolrContent.noTxtYetMsg=<p style\=''font-style\:italic''>{0} does not have text in the index.<br/>It may have no text, not been analyzed yet, or keyword search was not enabled during ingest.</p>
ExtractedContentViewer.getSolrContent.txtBodyItal=<span style\=''font-style\:italic''>{0}</span>
HighlightedMatchesSource.getMarkup.noMatchMsg=<html><pre><span style\\\\\='background\\\\\:yellow'>Failed to retrieve content for keyword hit. <br />The keyword could have been in the file name. <br />Advance to another page if present, or to view the original text, choose File Text <br />in the drop down menu to the right...</span></pre></html>
HighlightedMatchesSource.getMarkup.noMatchMsg=<span>Failed to retrieve indexed text for keyword hit. Advance to another page if present, or to view the&#10;original text, choose File Text in the drop down menu to the right.&#10;&#10;Alternatively, you may choose to extract file content and search for the hit using an external&#10;application (e.g. a text editor).</span>
HighlightedMatchesSource.getMarkup.queryFailedMsg=<html><pre><span style\\\\\='background\\\\\:yellow'>Failed to retrieve keyword hit results. <br />Confirm that Autopsy can connect to the Solr server. <br /></span></pre></html>
HighlightedMatchesSource.toString=Search Results
Installer.reportPortError=Indexing server port {0} is not available. Check if your security software does not block {1} and consider changing {2} in {3} property file in the application user folder. Then try rebooting your system if another process was causing the conflict.

View File

@@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2016 Basis Technology Corp.
* Copyright 2011-2017 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -80,12 +80,6 @@ final class RegexQuery implements KeywordSearchQuery {
private boolean escaped;
private String escapedQuery;
// These are the valid characters that can appear either before or after a
// keyword hit. We use these characters to try to turn the hit into a
// token that can be more readily matched when it comes to highlighting
// against the Schema.TEXT field later.
private static final String BOUNDARY_CHARS = "[\\s\\[\\]\\(\\)\\,\\\"\\\'\\!\\?\\.\\/\\:\\;\\=\\<\\>\\^\\{\\}]"; //NON-NLS
// Lucene regular expressions do not support the following Java predefined
// and POSIX character classes. There are other valid Java character classes
// that are not supported by Lucene but we do not check for all of them.
@@ -240,31 +234,17 @@ final class RegexQuery implements KeywordSearchQuery {
final Collection<Object> content_str = solrDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
// By default, we create keyword hits on whitespace or punctuation character boundaries.
// Having a set of well defined boundary characters produces hits that can
// subsequently be matched for highlighting against the tokens produced by
// the standard tokenizer.
// This behavior can be overridden by the user if they give us a search string
// with .* at either the start and/or end of the string. This basically tells us find
// all hits instead of the ones surrounded by one of our boundary characters.
String keywordTokenRegex
= // If the given search string starts with .*, we ignore our default
// boundary prefix characters
(queryStringContainsWildcardPrefix ? "" : "(^|" + BOUNDARY_CHARS + ")") //NON-NLS
+ keywordString
// If the given search string ends with .*, we ignore our default
// boundary suffix characters
+ (queryStringContainsWildcardSuffix ? "" : "($|" + BOUNDARY_CHARS + ")"); //NON-NLS
for (Object content_obj : content_str) {
String content = (String) content_obj;
Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content);
Matcher hitMatcher = Pattern.compile(keywordString).matcher(content);
int offset = 0;
while (hitMatcher.find(offset)) {
StringBuilder snippet = new StringBuilder();
//"parent" entries in the index don't have chunk size, so just accept those hits
// If the location of the hit is beyond this chunk (i.e. it
// exists in the overlap region), we skip the hit. It will
// show up again as a hit in the chunk following this one.
if (chunkSize != null && hitMatcher.start() >= chunkSize) {
break;
}
@@ -277,14 +257,6 @@ final class RegexQuery implements KeywordSearchQuery {
// input where they were separated by a single boundary character.
offset = hitMatcher.end() - 1;
// Remove any remaining leading and trailing boundary characters.
if (!queryStringContainsWildcardPrefix) {
hit = hit.replaceAll("^" + BOUNDARY_CHARS, ""); //NON-NLS
}
if (!queryStringContainsWildcardSuffix) {
hit = hit.replaceAll(BOUNDARY_CHARS + "$", ""); //NON-NLS
}
/*
* If searching for credit card account numbers, do a Luhn check
* on the term and discard it if it does not pass.