From 939d2dc4e3a37151d691d8364d44f9817556fd64 Mon Sep 17 00:00:00 2001
From: Eugene Livis <elivis@basistech.com>
Date: Thu, 3 Oct 2019 12:48:44 -0400
Subject: [PATCH] Bug fix in KWS chunking

---
 .../autopsy/keywordsearch/Chunker.java        | 141 ++++++++++++------
 .../autopsy/keywordsearch/Ingester.java       |  15 +-
 2 files changed, 103 insertions(+), 53 deletions(-)
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
index 82494f2f0d..08ca0ab511 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
@@ -45,33 +45,69 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
     private static final Charset UTF_8 = StandardCharsets.UTF_8;
 
     //Chunking algorithm paramaters-------------------------------------//
-    /** the maximum size of a chunk, including the window. */
+    /**
+     * the maximum size of a chunk, including the window.
+     */
     private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
-    /** the minimum to read before we start the process of looking for
-     * whitespace to break at and creating an overlapping window. */
+    /**
+     * the minimum to read before we start the process of looking for whitespace
+     * to break at and creating an overlapping window.
+     */
     private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
-    /** The maximum size of the chunk, before the overlapping window, even if we
-     * couldn't find whitespace to break at. */
+    /**
+     * The maximum size of the chunk, before the overlapping window, even if we
+     * couldn't find whitespace to break at.
+     */
     private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
-    /** The amount of text we will read through before we give up on finding
-     * whitespace to break the chunk/window at. */
+    /**
+     * The amount of text we will read through before we give up on finding
+     * whitespace to break the chunk/window at.
+     */
     private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
-    /** The number of characters to read in one go from the Reader. */
+    /**
+     * The number of characters to read in one go from the Reader.
+     */
     private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
+    /**
+     * When toLowerCase() is called on a character, the lower cased output 
+     * can be different in size than the original input. I have seen a single
+     * input character turn into 3 characters (and 5 bytes) after lowercasing. 
+     * I could not find any info as to what is the upper limit of how much a 
+     * character can "increase in size" during lower casing. I'm guesstimating
+     * and setting that limit at 10 bytes.
+     */
+    private static final int MAX_CHAR_SIZE_INCREASE_IN_BYTES = 10; //bytes
 
     ////chunker state--------------------------------------------///
-    /** The Reader that this chunk reads from, and divides into chunks. It must
-     * be a buffered reader to ensure that mark/reset are supported. */
+    /**
+     * The Reader that this chunk reads from, and divides into chunks. It must
+     * be a buffered reader to ensure that mark/reset are supported.
+     */
     private final PushbackReader reader;
-    /** The local buffer of characters read from the Reader. */
+    /**
+     * The local buffer of characters read from the Reader.
+     */
     private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
 
-    /** the size in bytes of the chunk (so far). */
+    /**
+     * the size in bytes of the chunk (so far).
+     */
     private int chunkSizeBytes = 0;
-    /** Has the chunker reached the end of the Reader? If so, there are no more
-     * chunks, and the current chunk does not need a window. */
+
+    /**
+     * the size in bytes of the lowercased chunk (so far). Note that lowercasing
+     * in Java can change the size of the string so we need to make sure the
+     * lowercased string also fits in MAX_TOTAL_CHUNK_SIZE.
+     */
+    private int lowerCasedChunkSizeBytes = 0;
+    /**
+     * Has the chunker reached the end of the Reader? If so, there are no more
+     * chunks, and the current chunk does not need a window.
+     */
     private boolean endOfReaderReached = false;
-    /** Store any exception encountered reading from the Reader. */
+    /**
+     * Store any exception encountered reading from the Reader.
+     */
     private Exception ex;
 
     /**
@@ -140,7 +176,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
      * @param s The string to cleanup.
      *
      * @return A StringBuilder with the same content as s but where all invalid
-     *         code     *         points have been replaced.
+     *         code points have been replaced.
      */
     private static StringBuilder replaceInvalidUTF16(String s) {
         /* encode the string to UTF-16 which does the replcement, see
@@ -162,16 +198,18 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
         //reset state for the next chunk
 
         chunkSizeBytes = 0;
+        lowerCasedChunkSizeBytes = 0;
         int baseChunkSizeChars = 0;
         StringBuilder currentChunk = new StringBuilder();
         StringBuilder currentWindow = new StringBuilder();
+        StringBuilder lowerCasedChunk = new StringBuilder();
 
         try {
-            currentChunk.append(readBaseChunk());
+            readBaseChunk(currentChunk, lowerCasedChunk);
             baseChunkSizeChars = currentChunk.length(); //save the base chunk length
-            currentWindow.append(readWindow());
-                //add the window text to the current chunk.
-        currentChunk.append(currentWindow);
+            readWindow(currentWindow, lowerCasedChunk);
+            //add the window text to the current chunk.
+            currentChunk.append(currentWindow);
             if (endOfReaderReached) {
                 /* if we have reached the end of the content,we won't make
                  * another overlapping chunk, so the length of the base chunk
@@ -186,9 +224,9 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
              * and break any chunking loop in client code. */
             ex = ioEx;
         }
-    
+
         //sanitize the text and return a Chunk object, that includes the base chunk length.
-        return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes);
+        return new Chunk(currentChunk, baseChunkSizeChars, lowerCasedChunk);
     }
 
     /**
@@ -196,14 +234,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
      *
      * @throws IOException if there is a problem reading from the reader.
      */
-    private StringBuilder readBaseChunk() throws IOException {
-        StringBuilder currentChunk = new StringBuilder();
+    private void readBaseChunk(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
         //read the chunk until the minimum base chunk size
-        readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk);
+        readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
 
         //keep reading until the maximum base chunk size or white space is reached.
-        readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk);
-        return currentChunk;
+        readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
     }
 
     /**
@@ -211,14 +247,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
      *
      * @throws IOException if there is a problem reading from the reader.
      */
-    private StringBuilder readWindow() throws IOException {
-        StringBuilder currentWindow = new StringBuilder();
+    private void readWindow(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
         //read the window, leaving some room to look for white space to break at.
-        readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow);
+        readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentChunk, lowerCasedChunk);
 
         //keep reading until the max chunk size, or until whitespace is reached.
-        readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow);
-        return currentWindow;
+        readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentChunk, lowerCasedChunk);
     }
 
     /**
@@ -229,10 +263,10 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
      *
      * @throws IOException
      */
-    private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
+    private void readHelper(int maxBytes, StringBuilder currentSegment, StringBuilder currentLowerCasedSegment) throws IOException {
         int charsRead = 0;
         //read chars up to maxBytes, or the end of the reader.
-        while ((chunkSizeBytes < maxBytes)
+        while ((chunkSizeBytes < maxBytes) && (lowerCasedChunkSizeBytes < maxBytes)
                 && (endOfReaderReached == false)) {
             charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
             if (-1 == charsRead) {
@@ -253,11 +287,19 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                 //get the length in utf8 bytes of the read chars
                 int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
 
+                // lower case the string and get its size. NOTE: lower casing can
+                // change the size of the string!
+                String lowerCasedSegment = chunkSegment.toString().toLowerCase();
+                int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
+
                 //if it will not put us past maxBytes
-                if (chunkSizeBytes + segmentSize < maxBytes) {
+                if ((chunkSizeBytes + segmentSize < maxBytes) && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes)) {
                     //add it to the chunk
                     currentSegment.append(chunkSegment);
                     chunkSizeBytes += segmentSize;
+
+                    currentLowerCasedSegment.append(lowerCasedSegment);
+                    lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
                 } else {
                     //unread it, and break out of read loop.
                     reader.unread(tempChunkBuf, 0, charsRead);
@@ -275,11 +317,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
      *
      * @throws IOException
      */
-    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException {
+    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
         int charsRead = 0;
         boolean whitespaceFound = false;
         //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
-        while ((chunkSizeBytes < maxBytes)
+        while ((chunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES) 
+                && (lowerCasedChunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
                 && (whitespaceFound == false)
                 && (endOfReaderReached == false)) {
             charsRead = reader.read(tempChunkBuf, 0, 1);
@@ -314,6 +357,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                 //add read chars to the chunk and update the length.
                 currentChunk.append(sanitizedChunkSegment);
                 chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
+
+                // lower case the string and get its size. NOTE: lower casing can
+                // change the size of the string.
+                String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();
+                lowerCasedChunk.append(lowerCasedSegment);
+                lowerCasedChunkSizeBytes += lowerCasedSegment.getBytes(UTF_8).length;
             }
         }
     }
@@ -326,16 +375,16 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
 
         private final StringBuilder sb;
         private final int baseChunkSizeChars;
-        private final int chunkSizeBytes;
+        private final StringBuilder lowerCasedChunk;
 
-        Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) {
+        Chunk(StringBuilder sb, int baseChunkSizeChars, StringBuilder lowerCasedChunk) {
             this.sb = sb;
             this.baseChunkSizeChars = baseChunkSizeChars;
-            this.chunkSizeBytes = chunkSizeBytes;
+            this.lowerCasedChunk = lowerCasedChunk;
         }
 
         /**
-         * Get the content of the chunk.
+         * Get the content of the original (non-lower cased) chunk.
          *
          * @return The content of the chunk.
          */
@@ -345,16 +394,16 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
         }
 
         /**
-         * Get the size in bytes of the utf-8 encoding of the entire chunk.
+         * Get the content of the lower cased chunk.
          *
-         * @return the size in bytes of the utf-8 encoding of the entire chunk
+         * @return The content of the lower cased chunk.
          */
-        public int getChunkSizeBytes() {
-            return chunkSizeBytes;
+        public String geLowerCasedChunk() {
+            return lowerCasedChunk.toString();
         }
 
         /**
-         * Get the length of the base chunk in java chars.
+         * Get the length of the original (non-lower cased) base chunk in java chars.
          *
          * @return the length of the base chunk in java chars.
          */
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index be0b93088d..576b65d581 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -97,7 +97,7 @@ class Ingester {
      *                           file, but the Solr server is probably fine.
      */
     void indexMetaDataOnly(AbstractFile file) throws IngesterException {
-        indexChunk("", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
+        indexChunk("", "", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
     }
 
     /**
@@ -111,7 +111,7 @@ class Ingester {
      *                           artifact, but the Solr server is probably fine.
      */
     void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
-        indexChunk("", sourceName, new HashMap<>(getContentFields(artifact)));
+        indexChunk("", "", sourceName, new HashMap<>(getContentFields(artifact)));
     }
 
     /**
@@ -156,7 +156,7 @@ class Ingester {
                     logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
                     return false;
                 }
-
+                
                 Chunk chunk = chunker.next();
                 Map<String, Object> fields = new HashMap<>(contentFields);
                 String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
@@ -166,7 +166,7 @@ class Ingester {
                 language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
                 try {
                     //add the chunk text to Solr index
-                    indexChunk(chunk.toString(), sourceName, fields);
+                    indexChunk(chunk.toString(), chunk.geLowerCasedChunk(), sourceName, fields);
                     // add mini chunk when there's a language specific field
                     if (chunker.hasNext() && language.isPresent()) {
                         languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
@@ -197,7 +197,7 @@ class Ingester {
                 fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
                 //"parent" docs don't have chunk_size
                 fields.remove(Server.Schema.CHUNK_SIZE.toString());
-                indexChunk(null, sourceName, fields);
+                indexChunk(null, null, sourceName, fields);
             }
         }
         return true;
@@ -211,12 +211,13 @@ class Ingester {
      * 4.0.0), see if possible to stream with UpdateRequestHandler
      *
      * @param chunk  The chunk content as a string, or null for metadata only
+     * @param lowerCasedChunk The lower cased chunk content as a string, or null for metadata only
      * @param fields
      * @param size
      *
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
-    private void indexChunk(String chunk, String sourceName, Map<String, Object> fields) throws IngesterException {
+    private void indexChunk(String chunk, String lowerCasedChunk, String sourceName, Map<String, Object> fields) throws IngesterException {
         if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
             //JMTODO: actually if the we couldn't get the image id it is set to -1,
             // but does this really mean we don't want to index it?
@@ -245,7 +246,7 @@ class Ingester {
             // insensitive substring/regular expression search.
             double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
             if (indexSchemaVersion >= 2.1) {
-                updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : chunk.toLowerCase()));
+                updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : lowerCasedChunk));
             }
 
             TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");