From 3f9b161a71fbb7c7d9504ba2b03d9503953da03c Mon Sep 17 00:00:00 2001
From: millmanorama <millmanorama@gmail.com>
Date: Wed, 1 Feb 2017 16:48:52 +0100
Subject: [PATCH 1/3] look at the values of content_str field separately:
 title/content

---
 .../autopsy/keywordsearch/LuceneQuery.java    |  16 +-
 .../autopsy/keywordsearch/RegexQuery.java     | 139 +++++++++---------
 2 files changed, 79 insertions(+), 76 deletions(-)
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java
index 9e99394a34..6a90a688dc 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java
@@ -23,7 +23,6 @@ import java.util.Collection;
 import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
-import java.util.Objects;
 import java.util.logging.Level;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.math.NumberUtils;
@@ -223,18 +222,21 @@ class LuceneQuery implements KeywordSearchQuery {
                      * will get picked up in the next one. */
                     final String docId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString();
                     final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
-                    String content_str = Objects.toString(resultDoc.get(Server.Schema.CONTENT_STR.toString()), null);
+                    final ArrayList<String> get = (ArrayList<String>) resultDoc.get(Server.Schema.CONTENT_STR.toString());
 
                     double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
                     if (indexSchemaVersion < 2.0) {
                         //old schema versions don't support chunk_size or the content_str fields, so just accept hits
                         matches.add(createKeywordtHit(highlightResponse, docId));
                     } else {
-                        //for new schemas, check that the hit is before the chunk/window boundary.
-                        int firstOccurence = StringUtils.indexOf(content_str, strippedQueryString);
-                        //there is no chunksize field for "parent" entries in the index
-                        if (chunkSize != null && firstOccurence < chunkSize) {
-                            matches.add(createKeywordtHit(highlightResponse, docId));
+                        //check against file name and actual content seperately.
+                        for (String content_str : get) {
+                            //for new schemas, check that the hit is before the chunk/window boundary.
+                            int firstOccurence = StringUtils.indexOf(content_str, strippedQueryString);
+                            //there is no chunksize field for "parent" entries in the index
+                            if (chunkSize != null && firstOccurence > -1 && firstOccurence < chunkSize) {
+                                matches.add(createKeywordtHit(highlightResponse, docId));
+                            }
                         }
                     }
                 } catch (TskException ex) {
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java
index 2a8f828a40..f188890591 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java
@@ -54,16 +54,16 @@ import org.sleuthkit.datamodel.TskData;
 import org.sleuthkit.datamodel.TskException;
 
 /**
- * The RegexQuery class supports issuing regular expression queries
- * against a Lucene index. It relies on the fact that content is
- * stored in it's original form in a "string" field (Server.Schema.CONTENT_STR).
- * To indicate to Lucene that these are regular expression queries, the query
- * string must be surrounded by '/' characters. Additionally, the characters
- * ".*" need to be added both before and after the search term to get hits
- * in the middle of text.
+ * The RegexQuery class supports issuing regular expression queries against a
+ * Lucene index. It relies on the fact that content is stored in it's original
+ * form in a "string" field (Server.Schema.CONTENT_STR). To indicate to Lucene
+ * that these are regular expression queries, the query string must be
+ * surrounded by '/' characters. Additionally, the characters ".*" need to be
+ * added both before and after the search term to get hits in the middle of
+ * text.
  *
- * Regular expression syntax supported by Lucene is not the same as Java
- * regular expression syntax. The Lucene syntax is documented here:
+ * Regular expression syntax supported by Lucene is not the same as Java regular
+ * expression syntax. The Lucene syntax is documented here:
  *
  * https://lucene.apache.org/core/5_0_0/core/org/apache/lucene/util/automaton/RegExp.html
  */
@@ -94,7 +94,7 @@ final class RegexQuery implements KeywordSearchQuery {
     // See https://lucene.apache.org/core/6_4_0/core/org/apache/lucene/util/automaton/RegExp.html
     // for Lucene syntax.
     // We use \p as a shortcut for all of the character classes of the form \p{XXX}.
-    private static final CharSequence[] UNSUPPORTED_CHARS = {"\\d", "\\D", "\\w", "\\W", "\\s", "\\S", "\\n", 
+    private static final CharSequence[] UNSUPPORTED_CHARS = {"\\d", "\\D", "\\w", "\\W", "\\s", "\\S", "\\n",
         "\\t", "\\r", "\\f", "\\a", "\\e", "\\v", "\\V", "\\h", "\\H", "\\p"}; //NON-NLS
 
     private boolean queryStringContainsWildcardPrefix = false;
@@ -160,19 +160,18 @@ final class RegexQuery implements KeywordSearchQuery {
 
         /**
          * The provided regular expression may include wildcards at the
-         * beginning and/or end. These wildcards are used to indicate that
-         * the user wants to find hits for the regex that are embedded
-         * within other characters. For example, if we are given .*127.0.0.1.*
-         * as a regular expression, this will produce hits for:
-         * (a) " 127.0.0.1 " as a standalone token (surrounded by whitespace).
-         * (b) "abc127.0.0.1def" where the IP address is surrounded by other characters.
+         * beginning and/or end. These wildcards are used to indicate that the
+         * user wants to find hits for the regex that are embedded within other
+         * characters. For example, if we are given .*127.0.0.1.* as a regular
+         * expression, this will produce hits for: (a) " 127.0.0.1 " as a
+         * standalone token (surrounded by whitespace). (b) "abc127.0.0.1def"
+         * where the IP address is surrounded by other characters.
          *
          * If we are given this type of regex, we do not need to add our own
          * wildcards to anchor the query. Otherwise, we need to add wildcard
          * anchors because Lucene string regex searches default to using ^ and $
          * to match the entire string.
          */
-
         // We construct the query by surrounding it with slashes (to indicate it is
         // a regular expression search) and .* as anchors (if the query doesn't
         // already have them).
@@ -193,7 +192,7 @@ final class RegexQuery implements KeywordSearchQuery {
         solrQuery.setRows(MAX_RESULTS);
         // Setting the sort order is necessary for cursor based paging to work.
         solrQuery.setSort(SortClause.asc(Server.Schema.ID.toString()));
-        
+
         String cursorMark = CursorMarkParams.CURSOR_MARK_START;
         SolrDocumentList resultList = null;
         boolean allResultsProcessed = false;
@@ -214,7 +213,7 @@ final class RegexQuery implements KeywordSearchQuery {
                         //
                     }
                 }
-                
+
                 String nextCursorMark = response.getNextCursorMark();
                 if (cursorMark.equals(nextCursorMark)) {
                     allResultsProcessed = true;
@@ -225,7 +224,7 @@ final class RegexQuery implements KeywordSearchQuery {
                 MessageNotifyUtil.Notify.error(NbBundle.getMessage(Server.class, "Server.query.exception.msg", keywordString), ex.getCause().getMessage());
             }
         }
-        
+
         for (Keyword k : hitsMultMap.keySet()) {
             results.addResult(k, hitsMultMap.get(k));
         }
@@ -239,7 +238,7 @@ final class RegexQuery implements KeywordSearchQuery {
         final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();
         final Integer chunkSize = (Integer) solrDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
 
-        String content = solrDoc.getOrDefault(Server.Schema.CONTENT_STR.toString(), "").toString(); //NON-NLS
+        ArrayList<String> content_str = (ArrayList<String>) solrDoc.get(Server.Schema.CONTENT_STR.toString());
 
         // By default, we create keyword hits on whitespace or punctuation character boundaries.
         // Having a set of well defined boundary characters produces hits that can
@@ -248,8 +247,8 @@ final class RegexQuery implements KeywordSearchQuery {
         // This behavior can be overridden by the user if they give us a search string
         // with .* at either the start and/or end of the string. This basically tells us find
         // all hits instead of the ones surrounded by one of our boundary characters.
-        String keywordTokenRegex =
-                // If the given search string starts with .*, we ignore our default
+        String keywordTokenRegex
+                = // If the given search string starts with .*, we ignore our default
                 // boundary prefix characters
                 (queryStringContainsWildcardPrefix ? "" : BOUNDARY_CHARS) //NON-NLS
                 + keywordString
@@ -257,63 +256,65 @@ final class RegexQuery implements KeywordSearchQuery {
                 // boundary suffix characters
                 + (queryStringContainsWildcardSuffix ? "" : BOUNDARY_CHARS); //NON-NLS
 
-        Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content);
-        int offset = 0;
+        for (String content : content_str) {
+            Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content);
+            int offset = 0;
 
-        while (hitMatcher.find(offset)) {
-            StringBuilder snippet = new StringBuilder();
+            while (hitMatcher.find(offset)) {
+                StringBuilder snippet = new StringBuilder();
 
-            //"parent" entries in the index don't have chunk size, so just accept those hits
-            if (chunkSize != null && hitMatcher.start() >= chunkSize) {
-                break;
-            }
+                //"parent" entries in the index don't have chunk size, so just accept those hits
+                if (chunkSize != null && hitMatcher.start() >= chunkSize) {
+                    break;
+                }
 
-            String hit = hitMatcher.group();
+                String hit = hitMatcher.group();
 
-            // Back the matcher offset up by 1 character as it will have eaten
-            // a single space/newline/other boundary character at the end of the hit.
-            // This was causing us to miss hits that appeared consecutively in the
-            // input where they were separated by a single boundary character.
-            offset = hitMatcher.end() - 1;
+                // Back the matcher offset up by 1 character as it will have eaten
+                // a single space/newline/other boundary character at the end of the hit.
+                // This was causing us to miss hits that appeared consecutively in the
+                // input where they were separated by a single boundary character.
+                offset = hitMatcher.end() - 1;
 
-            // Remove any remaining leading and trailing boundary characters.
-            if (!queryStringContainsWildcardPrefix) {
-                hit = hit.replaceAll("^" + BOUNDARY_CHARS, ""); //NON-NLS
-            }
-            if (!queryStringContainsWildcardSuffix) {
-                hit = hit.replaceAll(BOUNDARY_CHARS + "$", ""); //NON-NLS
-            }
+                // Remove any remaining leading and trailing boundary characters.
+                if (!queryStringContainsWildcardPrefix) {
+                    hit = hit.replaceAll("^" + BOUNDARY_CHARS, ""); //NON-NLS
+                }
+                if (!queryStringContainsWildcardSuffix) {
+                    hit = hit.replaceAll(BOUNDARY_CHARS + "$", ""); //NON-NLS
+                }
 
-            /*
-             * If searching for credit card account numbers, do a Luhn check
-             * on the term and discard it if it does not pass.
-             */
-            if (keyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
-                Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit);
-                if (ccnMatcher.find()) {
-                    final String ccn = CharMatcher.anyOf(" -").removeFrom(ccnMatcher.group("ccn"));
-                    if (false == TermsComponentQuery.CREDIT_CARD_NUM_LUHN_CHECK.isValid(ccn)) {
+                /*
+                 * If searching for credit card account numbers, do a Luhn check
+                 * on the term and discard it if it does not pass.
+                 */
+                if (keyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
+                    Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit);
+                    if (ccnMatcher.find()) {
+                        final String ccn = CharMatcher.anyOf(" -").removeFrom(ccnMatcher.group("ccn"));
+                        if (false == TermsComponentQuery.CREDIT_CARD_NUM_LUHN_CHECK.isValid(ccn)) {
+                            continue;
+                        }
+                    } else {
                         continue;
                     }
-                } else {
-                    continue;
                 }
-            }
 
-            /**
-             * Get the snippet from the document if keyword search is configured
-             * to use snippets.
-             */
-            if (KeywordSearchSettings.getShowSnippets()) {
-                int maxIndex = content.length() - 1;
-                snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 20), Integer.max(0, hitMatcher.start() + 1)));
-                snippet.appendCodePoint(171);
-                snippet.append(hit);
-                snippet.appendCodePoint(171);
-                snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end() - 1), Integer.min(maxIndex, hitMatcher.end() + 20)));
-            }
+                /**
+                 * Get the snippet from the document if keyword search is
+                 * configured to use snippets.
+                 */
+                if (KeywordSearchSettings.getShowSnippets()) {
+                    int maxIndex = content.length() - 1;
+                    snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 20), Integer.max(0, hitMatcher.start() + 1)));
+                    snippet.appendCodePoint(171);
+                    snippet.append(hit);
+                    snippet.appendCodePoint(171);
+                    snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end() - 1), Integer.min(maxIndex, hitMatcher.end() + 20)));
+                }
 
-            hits.add(new KeywordHit(docId, snippet.toString(), hit));
+                hits.add(new KeywordHit(docId, snippet.toString(), hit));
+            }
         }
         return hits;
     }

From 7dea03bf599b18f2e45efe8adc77aa7f89e16a83 Mon Sep 17 00:00:00 2001
From: millmanorama <millmanorama@gmail.com>
Date: Thu, 2 Feb 2017 15:12:40 +0100
Subject: [PATCH 2/3] allow regex queries to match at start and end of content.
  fix separation of file name and content

---
 .../org/sleuthkit/autopsy/keywordsearch/RegexQuery.java  | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java
index f188890591..aa56741ec5 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java
@@ -238,7 +238,7 @@ final class RegexQuery implements KeywordSearchQuery {
         final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();
         final Integer chunkSize = (Integer) solrDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
 
-        ArrayList<String> content_str = (ArrayList<String>) solrDoc.get(Server.Schema.CONTENT_STR.toString());
+        final Collection<Object> content_str = solrDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
 
         // By default, we create keyword hits on whitespace or punctuation character boundaries.
         // Having a set of well defined boundary characters produces hits that can
@@ -250,13 +250,14 @@ final class RegexQuery implements KeywordSearchQuery {
         String keywordTokenRegex
                 = // If the given search string starts with .*, we ignore our default
                 // boundary prefix characters
-                (queryStringContainsWildcardPrefix ? "" : BOUNDARY_CHARS) //NON-NLS
+                (queryStringContainsWildcardPrefix ? "" : "(^|" + BOUNDARY_CHARS + ")") //NON-NLS
                 + keywordString
                 // If the given search string ends with .*, we ignore our default
                 // boundary suffix characters
-                + (queryStringContainsWildcardSuffix ? "" : BOUNDARY_CHARS); //NON-NLS
+                + (queryStringContainsWildcardSuffix ? "" : "($|" + BOUNDARY_CHARS + ")"); //NON-NLS
 
-        for (String content : content_str) {
+        for (Object content_obj : content_str) {
+            String content = (String) content_obj;
             Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content);
             int offset = 0;
 

From 7b60e10235f9c47f758517b8cc9e5ccc9386dc0d Mon Sep 17 00:00:00 2001
From: millmanorama <millmanorama@gmail.com>
Date: Thu, 2 Feb 2017 15:13:31 +0100
Subject: [PATCH 3/3] look at file name even if the indexed text is 0 bytes. 
 fix separation of file name and content.

---
 .../org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java
index 6a90a688dc..b51f269f2e 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java
@@ -222,7 +222,7 @@ class LuceneQuery implements KeywordSearchQuery {
                      * will get picked up in the next one. */
                     final String docId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString();
                     final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
-                    final ArrayList<String> get = (ArrayList<String>) resultDoc.get(Server.Schema.CONTENT_STR.toString());
+                    final Collection<Object> content = resultDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
 
                     double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
                     if (indexSchemaVersion < 2.0) {
@@ -230,11 +230,12 @@ class LuceneQuery implements KeywordSearchQuery {
                         matches.add(createKeywordtHit(highlightResponse, docId));
                     } else {
                         //check against file name and actual content seperately.
-                        for (String content_str : get) {
+                        for (Object content_obj : content) {
+                            String content_str = (String) content_obj;
                             //for new schemas, check that the hit is before the chunk/window boundary.
-                            int firstOccurence = StringUtils.indexOf(content_str, strippedQueryString);
+                            int firstOccurence = StringUtils.indexOf(content_str.toLowerCase(), strippedQueryString.toLowerCase());
                             //there is no chunksize field for "parent" entries in the index
-                            if (chunkSize != null && firstOccurence > -1 && firstOccurence < chunkSize) {
+                            if (chunkSize == null || chunkSize == 0 || (firstOccurence > -1 && firstOccurence < chunkSize)) {
                                 matches.add(createKeywordtHit(highlightResponse, docId));
                             }
                         }