Merge pull request #2502 from millmanorama/2259-sanitize-utf16-with-Charset-encoder

2259 sanitize utf16 with charset encoder
2025-07-16 17:57:43 +00:00 · 2017-02-06 10:22:34 -05:00 · 2017-02-06 10:22:34 -05:00 · f9c2bcadc6
commit f9c2bcadc6
parent 3f92ff0411 0747ae3642
2 changed files with 75 additions and 21 deletions
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
@ -1,7 +1,7 @@
 /*
 * Autopsy Forensic Browser
 *
- * Copyright 2011-2016 Basis Technology Corp.
+ * Copyright 2011-2017 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@ -21,6 +21,7 @@ package org.sleuthkit.autopsy.keywordsearch;
 import java.io.IOException;
 import java.io.PushbackReader;
 import java.io.Reader;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
 import java.util.NoSuchElementException;
@ -38,6 +39,10 @@ import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
@NotThreadSafe
 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {

+    //local references to standard encodings
+    private static final Charset UTF_16 = StandardCharsets.UTF_16;
+    private static final Charset UTF_8 = StandardCharsets.UTF_8;
+
    //Chunking algorithm paramaters-------------------------------------//
    /** the maximum size of a chunk, including the window. */
    private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
@ -84,7 +89,10 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
    }

    /**
-     * Has this Chunker encountered an exception reading from the Reader.
+     * Has this Chunker encountered an exception reading from the Reader?
+     *
+     *
+     * @return True if this Chunker encountered an exception.
     */
    boolean hasException() {
        return ex != null;
@ -124,6 +132,25 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
        return sb;
    }

+    /**
+     * Cleanup invalid codepoint sequences by replacing them with the default
+     * replacement character: U+FFFD / <EFBFBD>.
+     *
+     * @param s The string to cleanup.
+     *
+     * @return A StringBuilder with the same content as s but where all invalid
+     *         code     *         points have been replaced.
+     */
+    private static StringBuilder replaceInvalidUTF16(String s) {
+        /* encode the string to UTF-16 which does the replcement, see
+         * Charset.encode(), then decode back to a StringBuilder. */
+        return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
+    }
+
+    private static StringBuilder sanitize(String s) {
+        return sanitizeToUTF8(replaceInvalidUTF16(s));
+    }
+
    @Override
    public Chunk next() {
        if (hasNext() == false) {
@ -157,7 +184,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
        //add the window text to the current chunk.
        currentChunk.append(currentWindow);
        //sanitize the text and return a Chunk object, that includes the base chunk length.
-        return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
+        return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes);
    }

    /**
@ -216,10 +243,11 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                    reader.unread(lastChar);
                }

-                String chunkSegment = new String(tempChunkBuf, 0, charsRead);
+                //cleanup any invalid utf-16 sequences
+                StringBuilder chunkSegment = sanitize(new String(tempChunkBuf, 0, charsRead));

-                //get the length in bytes of the read chars
-                int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length;
+                //get the length in utf8 bytes of the read chars
+                int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;

                //if it will not put us past maxBytes
                if (chunkSizeBytes + segmentSize < maxBytes) {
@ -239,11 +267,11 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
     * Read until the maxBytes reached, whitespace, or end of reader.
     *
     * @param maxBytes
-     * @param currentSegment
+     * @param currentChunk
     *
     * @throws IOException
     */
-    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
+    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException {
        int charsRead = 0;
        boolean whitespaceFound = false;
        //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
@ -260,26 +288,28 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                final char ch = tempChunkBuf[0];
                String chunkSegment;
                if (Character.isHighSurrogate(ch)) {
+                    //read another char into the buffer.
                    charsRead = reader.read(tempChunkBuf, 1, 1);
                    if (charsRead == -1) {
-                        //this is the last chunk, so include the unpaired surrogate
-                        currentSegment.append(ch);
-                        chunkSizeBytes += new Character(ch).toString().getBytes(StandardCharsets.UTF_8).length;
+                        //this is the last chunk, so just drop the unpaired surrogate
                        endOfReaderReached = true;
                        return;
                    } else {
-                        //use the surrogate pair in place of the unpaired surrogate.
+                        //try to use the pair together.
                        chunkSegment = new String(tempChunkBuf, 0, 2);
                    }
                } else {
                    //one char
                    chunkSegment = new String(tempChunkBuf, 0, 1);
                }
+
+                //cleanup any invalid utf-16 sequences
+                StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
                //check for whitespace.
-                whitespaceFound = Character.isWhitespace(chunkSegment.codePointAt(0));
+                whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
                //add read chars to the chunk and update the length.
-                currentSegment.append(chunkSegment);
-                chunkSizeBytes += chunkSegment.getBytes(StandardCharsets.UTF_8).length;
+                currentChunk.append(sanitizedChunkSegment);
+                chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
            }
        }
    }
@ -291,20 +321,41 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
    static class Chunk {

        private final StringBuilder sb;
-        private final int chunksize;
+        private final int baseChunkSizeChars;
+        private final int chunkSizeBytes;

-        Chunk(StringBuilder sb, int baseChunkLength) {
+        Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) {
            this.sb = sb;
-            this.chunksize = baseChunkLength;
+            this.baseChunkSizeChars = baseChunkSizeChars;
+            this.chunkSizeBytes = chunkSizeBytes;
        }

+        /**
+         * Get the content of the chunk.
+         *
+         * @return The content of the chunk.
+         */
        @Override
        public String toString() {
            return sb.toString();
        }

+        /**
+         * Get the size in bytes of the utf-8 encoding of the entire chunk.
+         *
+         * @return the size in bytes of the utf-8 encoding of the entire chunk
+         */
+        public int getChunkSizeBytes() {
+            return chunkSizeBytes;
+        }
+
+        /**
+         * Get the length of the base chunk in java chars.
+         *
+         * @return the length of the base chunk in java chars.
+         */
        int getBaseChunkLength() {
-            return chunksize;
+            return baseChunkSizeChars;
        }
    }
 }
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@ -1,7 +1,7 @@
 /*
 * Autopsy Forensic Browser
 *
- * Copyright 2011-2016 Basis Technology Corp.
+ * Copyright 2011-2017 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@ -177,7 +177,10 @@ class Ingester {
        } finally {
            //after all chunks, index just the meta data, including the  numChunks, of the parent file
            fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
-            fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
+            //reset id field to base document id
+            fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
+            //"parent" docs don't have chunk_size
+            fields.remove(Server.Schema.CHUNK_SIZE.toString());
            indexChunk(null, sourceName, fields);
        }