From 3d67b0aeb0c250d83c1758d608a0c2703fd1aeb4 Mon Sep 17 00:00:00 2001
From: esaunders <esaunders@basistech.com>
Date: Tue, 7 Feb 2017 16:03:06 -0500
Subject: [PATCH 1/2] Modified message displayed when we are unable to retrieve
 text for highlighting.

---
 .../src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties
index 39f8de000c..0e54dd5745 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties
@@ -66,7 +66,7 @@ ExtractedContentViewer.getTitle=Indexed Text
 ExtractedContentViewer.getSolrContent.knownFileMsg=<p style\=''font-style\:italic''>{0} is a known file (based on MD5 hash) and does not have text in the index.</p>
 ExtractedContentViewer.getSolrContent.noTxtYetMsg=<p style\=''font-style\:italic''>{0} does not have text in the index.<br/>It may have no text, not been analyzed yet, or keyword search was not enabled during ingest.</p>
 ExtractedContentViewer.getSolrContent.txtBodyItal=<span style\=''font-style\:italic''>{0}</span>
-HighlightedMatchesSource.getMarkup.noMatchMsg=<html><pre><span style\\\\\='background\\\\\:yellow'>Failed to retrieve content for keyword hit. <br />The keyword could have been in the file name. <br />Advance to another page if present, or to view the original text, choose File Text <br />in the drop down menu to the right...</span></pre></html>
+HighlightedMatchesSource.getMarkup.noMatchMsg=<span>Failed to retrieve indexed text for keyword hit. Advance to another page if present, or to view the&#10;original text, choose File Text in the drop down menu to the right.&#10;&#10;Alternatively, you may choose to extract file content and search for the hit using an external&#10;application (e.g. a text editor).</span>
 HighlightedMatchesSource.getMarkup.queryFailedMsg=<html><pre><span style\\\\\='background\\\\\:yellow'>Failed to retrieve keyword hit results. <br />Confirm that Autopsy can connect to the Solr server. <br /></span></pre></html>
 HighlightedMatchesSource.toString=Search Results
 Installer.reportPortError=Indexing server port {0} is not available.  Check if your security software does not block {1} and consider changing {2} in {3} property file in the application user folder. Then try rebooting your system if another process was causing the conflict.

From 2147162db0e19994189d43afacd4abfb2e672863 Mon Sep 17 00:00:00 2001
From: esaunders <esaunders@basistech.com>
Date: Tue, 7 Feb 2017 16:03:44 -0500
Subject: [PATCH 2/2] Removed trimming of keyword hits.

---
 .../autopsy/keywordsearch/RegexQuery.java     | 38 +++----------------
 1 file changed, 5 insertions(+), 33 deletions(-)

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java
index aa56741ec5..c434f808be 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java
@@ -1,7 +1,7 @@
 /*
  * Autopsy Forensic Browser
  *
- * Copyright 2011-2016 Basis Technology Corp.
+ * Copyright 2011-2017 Basis Technology Corp.
  * Contact: carrier <at> sleuthkit <dot> org
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -80,12 +80,6 @@ final class RegexQuery implements KeywordSearchQuery {
     private boolean escaped;
     private String escapedQuery;
 
-    // These are the valid characters that can appear either before or after a
-    // keyword hit. We use these characters to try to turn the hit into a
-    // token that can be more readily matched when it comes to highlighting
-    // against the Schema.TEXT field later.
-    private static final String BOUNDARY_CHARS = "[\\s\\[\\]\\(\\)\\,\\\"\\\'\\!\\?\\.\\/\\:\\;\\=\\<\\>\\^\\{\\}]"; //NON-NLS
-
     // Lucene regular expressions do not support the following Java predefined
     // and POSIX character classes. There are other valid Java character classes
     // that are not supported by Lucene but we do not check for all of them.
@@ -240,31 +234,17 @@ final class RegexQuery implements KeywordSearchQuery {
 
         final Collection<Object> content_str = solrDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
 
-        // By default, we create keyword hits on whitespace or punctuation character boundaries.
-        // Having a set of well defined boundary characters produces hits that can
-        // subsequently be matched for highlighting against the tokens produced by
-        // the standard tokenizer.
-        // This behavior can be overridden by the user if they give us a search string
-        // with .* at either the start and/or end of the string. This basically tells us find
-        // all hits instead of the ones surrounded by one of our boundary characters.
-        String keywordTokenRegex
-                = // If the given search string starts with .*, we ignore our default
-                // boundary prefix characters
-                (queryStringContainsWildcardPrefix ? "" : "(^|" + BOUNDARY_CHARS + ")") //NON-NLS
-                + keywordString
-                // If the given search string ends with .*, we ignore our default
-                // boundary suffix characters
-                + (queryStringContainsWildcardSuffix ? "" : "($|" + BOUNDARY_CHARS + ")"); //NON-NLS
-
         for (Object content_obj : content_str) {
             String content = (String) content_obj;
-            Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content);
+            Matcher hitMatcher = Pattern.compile(keywordString).matcher(content);
             int offset = 0;
 
             while (hitMatcher.find(offset)) {
                 StringBuilder snippet = new StringBuilder();
 
-                //"parent" entries in the index don't have chunk size, so just accept those hits
+                // If the hit starts at or beyond the end of this chunk (i.e. it
+                // lies in the overlap region), stop looking for hits here. It
+                // will show up again as a hit in the chunk following this one.
                 if (chunkSize != null && hitMatcher.start() >= chunkSize) {
                     break;
                 }
@@ -277,14 +257,6 @@ final class RegexQuery implements KeywordSearchQuery {
                 // input where they were separated by a single boundary character.
                 offset = hitMatcher.end() - 1;
 
-                // Remove any remaining leading and trailing boundary characters.
-                if (!queryStringContainsWildcardPrefix) {
-                    hit = hit.replaceAll("^" + BOUNDARY_CHARS, ""); //NON-NLS
-                }
-                if (!queryStringContainsWildcardSuffix) {
-                    hit = hit.replaceAll(BOUNDARY_CHARS + "$", ""); //NON-NLS
-                }
-
                 /*
                  * If searching for credit card account numbers, do a Luhn check
                  * on the term and discard it if it does not pass.