Merge pull request #2506 from esaunders/remove-hit-trimming

Remove hit trimming
This commit is contained in:
Richard Cordovano 2017-02-07 16:26:10 -05:00 committed by GitHub
commit 405e4598a6
2 changed files with 6 additions and 34 deletions

View File

@@ -66,7 +66,7 @@ ExtractedContentViewer.getTitle=Indexed Text
ExtractedContentViewer.getSolrContent.knownFileMsg=<p style\=''font-style\:italic''>{0} is a known file (based on MD5 hash) and does not have text in the index.</p>
ExtractedContentViewer.getSolrContent.noTxtYetMsg=<p style\=''font-style\:italic''>{0} does not have text in the index.<br/>It may have no text, not been analyzed yet, or keyword search was not enabled during ingest.</p>
ExtractedContentViewer.getSolrContent.txtBodyItal=<span style\=''font-style\:italic''>{0}</span>
HighlightedMatchesSource.getMarkup.noMatchMsg=<html><pre><span style\\\\\='background\\\\\:yellow'>Failed to retrieve content for keyword hit. <br />The keyword could have been in the file name. <br />Advance to another page if present, or to view the original text, choose File Text <br />in the drop down menu to the right...</span></pre></html>
HighlightedMatchesSource.getMarkup.noMatchMsg=<span>Failed to retrieve indexed text for keyword hit. Advance to another page if present, or to view the&#10;original text, choose File Text in the drop down menu to the right.&#10;&#10;Alternatively, you may choose to extract file content and search for the hit using an external&#10;application (e.g. a text editor).</span>
HighlightedMatchesSource.getMarkup.queryFailedMsg=<html><pre><span style\\\\\='background\\\\\:yellow'>Failed to retrieve keyword hit results. <br />Confirm that Autopsy can connect to the Solr server. <br /></span></pre></html>
HighlightedMatchesSource.toString=Search Results
Installer.reportPortError=Indexing server port {0} is not available. Check if your security software does not block {1} and consider changing {2} in {3} property file in the application user folder. Then try rebooting your system if another process was causing the conflict.

View File

@@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2016 Basis Technology Corp.
* Copyright 2011-2017 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -80,12 +80,6 @@ final class RegexQuery implements KeywordSearchQuery {
private boolean escaped;
private String escapedQuery;
// These are the valid characters that can appear either before or after a
// keyword hit. We use these characters to try to turn the hit into a
// token that can be more readily matched when it comes to highlighting
// against the Schema.TEXT field later.
private static final String BOUNDARY_CHARS = "[\\s\\[\\]\\(\\)\\,\\\"\\\'\\!\\?\\.\\/\\:\\;\\=\\<\\>\\^\\{\\}]"; //NON-NLS
// Lucene regular expressions do not support the following Java predefined
// and POSIX character classes. There are other valid Java character classes
// that are not supported by Lucene but we do not check for all of them.
@@ -240,31 +234,17 @@ final class RegexQuery implements KeywordSearchQuery {
final Collection<Object> content_str = solrDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
// By default, we create keyword hits on whitespace or punctuation character boundaries.
// Having a set of well defined boundary characters produces hits that can
// subsequently be matched for highlighting against the tokens produced by
// the standard tokenizer.
// This behavior can be overridden by the user if they give us a search string
// with .* at either the start and/or end of the string. This basically tells us find
// all hits instead of the ones surrounded by one of our boundary characters.
String keywordTokenRegex
= // If the given search string starts with .*, we ignore our default
// boundary prefix characters
(queryStringContainsWildcardPrefix ? "" : "(^|" + BOUNDARY_CHARS + ")") //NON-NLS
+ keywordString
// If the given search string ends with .*, we ignore our default
// boundary suffix characters
+ (queryStringContainsWildcardSuffix ? "" : "($|" + BOUNDARY_CHARS + ")"); //NON-NLS
for (Object content_obj : content_str) {
String content = (String) content_obj;
Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content);
Matcher hitMatcher = Pattern.compile(keywordString).matcher(content);
int offset = 0;
while (hitMatcher.find(offset)) {
StringBuilder snippet = new StringBuilder();
//"parent" entries in the index don't have chunk size, so just accept those hits
// If the location of the hit is beyond this chunk (i.e. it
// exists in the overlap region), we skip the hit. It will
// show up again as a hit in the chunk following this one.
if (chunkSize != null && hitMatcher.start() >= chunkSize) {
break;
}
@@ -277,14 +257,6 @@ final class RegexQuery implements KeywordSearchQuery {
// input where they were separated by a single boundary character.
offset = hitMatcher.end() - 1;
// Remove any remaining leading and trailing boundary characters.
if (!queryStringContainsWildcardPrefix) {
hit = hit.replaceAll("^" + BOUNDARY_CHARS, ""); //NON-NLS
}
if (!queryStringContainsWildcardSuffix) {
hit = hit.replaceAll(BOUNDARY_CHARS + "$", ""); //NON-NLS
}
/*
* If searching for credit card account numbers, do a Luhn check
* on the term and discard it if it does not pass.