From b0c66538ec2f64d0a38bd09a38e45a1bd382cc99 Mon Sep 17 00:00:00 2001
From: Eugene Livis <elivis@basistech.com>
Date: Mon, 2 Nov 2020 12:21:08 -0500
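Subject: [PATCH] Fixed a bug in 32k chunking

Check the size limit before appending a newly read segment: the segment
is added to the current chunk only if both the chunk and its lower-cased
copy stay below maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES. Otherwise the
characters just read are pushed back with reader.unread() and the method
returns.

When the char following a high surrogate is read, add the result of that
read to charsRead instead of overwriting charsRead with it.

sanitize() now normalizes with NFKC instead of NFC.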

---
 .../autopsy/keywordsearch/Chunker.java        | 39 +++++++++++++------
 1 file changed, 27 insertions(+), 12 deletions(-)
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
index 9c9923181d..fd0f11ab74 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
@@ -185,9 +185,8 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
     }
 
     private static StringBuilder sanitize(String s) {
-        String normStr = Normalizer.normalize(s, Normalizer.Form.NFC);
+        String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);
         return sanitizeToUTF8(replaceInvalidUTF16(normStr));
-
     }
 
     @Override
@@ -336,8 +335,9 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                 String chunkSegment;
                 if (Character.isHighSurrogate(ch)) {
                     //read another char into the buffer.
-                    charsRead = reader.read(tempChunkBuf, 1, 1);
-                    if (charsRead == -1) {
+                    int surrogateCharsRead = reader.read(tempChunkBuf, 1, 1);
+                    charsRead += surrogateCharsRead;
+                    if (surrogateCharsRead == -1) {
                         //this is the last chunk, so just drop the unpaired surrogate
                         endOfReaderReached = true;
                         return;
@@ -352,17 +352,32 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
 
                 //cleanup any invalid utf-16 sequences
                 StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
-                //check for whitespace.
-                whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
-                //add read chars to the chunk and update the length.
-                currentChunk.append(sanitizedChunkSegment);
-                chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
-
+                //get the length in utf8 bytes of the read chars
+                int segmentSize = chunkSegment.getBytes(UTF_8).length;
+                
                 // lower case the string and get it's size. NOTE: lower casing can 
                 // change the size of the string.
                 String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();
-                lowerCasedChunk.append(lowerCasedSegment);
-                lowerCasedChunkSizeBytes += lowerCasedSegment.getBytes(UTF_8).length;
+                int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
+                
+                //if it will not put us past maxBytes
+                if ((chunkSizeBytes + segmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
+                        && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)) {
+
+                    //add read chars to the chunk and update the length.
+                    currentChunk.append(sanitizedChunkSegment);
+                    chunkSizeBytes += segmentSize;
+
+                    lowerCasedChunk.append(lowerCasedSegment);
+                    lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
+                    
+                    //check for whitespace.
+                    whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
+                } else {
+                    //unread it, and break out of read loop.
+                    reader.unread(tempChunkBuf, 0, charsRead);
+                    return;
+                }
             }
         }
     }