From 5482fb4979356e86cb804912b6321725f49f58f9 Mon Sep 17 00:00:00 2001
From: millmanorama
Date: Wed, 1 Feb 2017 12:19:27 +0100
Subject: [PATCH] first pass at sanitizing the utf-16 as we read it.

---
 .../autopsy/keywordsearch/Chunker.java  | 56 ++++++++++++++-----
 .../autopsy/keywordsearch/Ingester.java |  1 +
 2 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
index d2967ce400..10f79be15f 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
@@ -21,6 +21,7 @@ package org.sleuthkit.autopsy.keywordsearch;
 import java.io.IOException;
 import java.io.PushbackReader;
 import java.io.Reader;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
 import java.util.NoSuchElementException;
@@ -38,6 +39,9 @@ import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
 @NotThreadSafe
 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
 
+    private static final Charset UTF_16 = StandardCharsets.UTF_16;
+    private static final Charset UTF_8 = StandardCharsets.UTF_8;
+
     //Chunking algorithm paramaters-------------------------------------//
     /** the maximum size of a chunk, including the window. */
     private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
@@ -157,7 +161,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
         //add the window text to the current chunk.
         currentChunk.append(currentWindow);
         //sanitize the text and return a Chunk object, that includes the base chunk length.
-        return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
+        return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars, chunkSizeBytes);
     }
 
     /**
@@ -216,10 +220,10 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                 reader.unread(lastChar);
             }
 
-            String chunkSegment = new String(tempChunkBuf, 0, charsRead);
+            String chunkSegment = stripInvalidUTF16(new String(tempChunkBuf, 0, charsRead));
 
             //get the length in bytes of the read chars
-            int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length;
+            int segmentSize = chunkSegment.getBytes(UTF_8).length;
 
             //if it will not put us past maxBytes
             if (chunkSizeBytes + segmentSize < maxBytes) {
@@ -235,15 +239,19 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
         }
     }
 
+    private static String stripInvalidUTF16(String chunkSegment) {
+        return UTF_16.decode(UTF_16.encode(chunkSegment)).toString();
+    }
+
     /**
      * Read until the maxBytes reached, whitespace, or end of reader.
     *
     * @param maxBytes
-     * @param currentSegment
+     * @param currentChunk
     *
     * @throws IOException
     */
-    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
+    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException {
         int charsRead = 0;
         boolean whitespaceFound = false;
         //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
@@ -262,9 +270,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                 if (Character.isHighSurrogate(ch)) {
                     charsRead = reader.read(tempChunkBuf, 1, 1);
                     if (charsRead == -1) {
-                        //this is the last chunk, so include the unpaired surrogate
-                        currentSegment.append(ch);
-                        chunkSizeBytes += new Character(ch).toString().getBytes(StandardCharsets.UTF_8).length;
+                        //this is the last chunk, so drop the unpaired surrogate
                         endOfReaderReached = true;
                         return;
                     } else {
@@ -275,11 +281,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                     //one char
                     chunkSegment = new String(tempChunkBuf, 0, 1);
                 }
+                chunkSegment = stripInvalidUTF16(chunkSegment);
                 //check for whitespace.
                 whitespaceFound = Character.isWhitespace(chunkSegment.codePointAt(0));
                 //add read chars to the chunk and update the length.
-                currentSegment.append(chunkSegment);
-                chunkSizeBytes += chunkSegment.getBytes(StandardCharsets.UTF_8).length;
+                currentChunk.append(chunkSegment);
+                chunkSizeBytes += chunkSegment.getBytes(UTF_8).length;
             }
         }
     }
@@ -291,20 +298,41 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
     static class Chunk {
 
         private final StringBuilder sb;
-        private final int chunksize;
+        private final int baseChunkSizeChars;
+        private final int chunkSizeBytes;
 
-        Chunk(StringBuilder sb, int baseChunkLength) {
+        Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) {
             this.sb = sb;
-            this.chunksize = baseChunkLength;
+            this.baseChunkSizeChars = baseChunkSizeChars;
+            this.chunkSizeBytes = chunkSizeBytes;
         }
 
+        /**
+         * Get the content of the chunk.
+         *
+         * @return The content of the chunk.
+         */
         @Override
         public String toString() {
             return sb.toString();
         }
 
+        /**
+         * Get the size in bytes of the utf-8 encoding of the entire chunk.
+         *
+         * @return the size in bytes of the utf-8 encoding of the entire chunk
+         */
+        public int getChunkSizeBytes() {
+            return chunkSizeBytes;
+        }
+
+        /**
+         * Get the length of the base chunk in java chars.
+         *
+         * @return the length of the base chunk in java chars.
+         */
         int getBaseChunkLength() {
-            return chunksize;
+            return baseChunkSizeChars;
         }
     }
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index d5c68b6944..45c1dad4f6 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -178,6 +178,7 @@ class Ingester {
         //after all chunks, index just the meta data, including the numChunks, of the parent file
         fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
         fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
+        fields.remove(Server.Schema.CHUNK_SIZE.toString());
         indexChunk(null, sourceName, fields);
     }
 
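Note (illustration, not part of the patch): the new stripInvalidUTF16() helper relies on the fact that Charset.encode() and Charset.decode() use the REPLACE error action, so any malformed UTF-16 in the input (for example an unpaired surrogate) comes back as U+FFFD after the round trip. A minimal, self-contained sketch of that behavior follows; the demo class and variable names are hypothetical, only the round-trip call mirrors the patched code.

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

public class StripInvalidUTF16Demo {

    private static final Charset UTF_16 = StandardCharsets.UTF_16;

    /**
     * Round-trip a string through UTF-16. The convenience encode()/decode()
     * methods replace malformed input (such as unpaired surrogates) with the
     * default replacement, so the result contains U+FFFD where the input was
     * invalid and is otherwise unchanged.
     */
    static String stripInvalidUTF16(String s) {
        return UTF_16.decode(UTF_16.encode(s)).toString();
    }

    public static void main(String[] args) {
        String dirty = "abc" + '\uD800' + "def"; // lone high surrogate, not well-formed
        String clean = stripInvalidUTF16(dirty);
        System.out.println(clean);               // abc\uFFFDdef
        System.out.println(clean.equals(dirty));  // false - the surrogate was replaced
    }
}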