From 5482fb4979356e86cb804912b6321725f49f58f9 Mon Sep 17 00:00:00 2001
From: millmanorama
Date: Wed, 1 Feb 2017 12:19:27 +0100
Subject: [PATCH] first pass at sanitizing the utf-16 as we read it.

---
 .../autopsy/keywordsearch/Chunker.java  | 56 ++++++++++++++-----
 .../autopsy/keywordsearch/Ingester.java |  1 +
 2 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
index d2967ce400..10f79be15f 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
@@ -21,6 +21,7 @@ package org.sleuthkit.autopsy.keywordsearch;
 import java.io.IOException;
 import java.io.PushbackReader;
 import java.io.Reader;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
 import java.util.NoSuchElementException;
@@ -38,6 +39,9 @@ import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
 @NotThreadSafe
 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
 
+    private static final Charset UTF_16 = StandardCharsets.UTF_16;
+    private static final Charset UTF_8 = StandardCharsets.UTF_8;
+
     //Chunking algorithm paramaters-------------------------------------//
     /** the maximum size of a chunk, including the window. */
     private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
@@ -157,7 +161,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
         //add the window text to the current chunk.
         currentChunk.append(currentWindow);
         //sanitize the text and return a Chunk object, that includes the base chunk length.
-        return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
+        return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars, chunkSizeBytes);
     }
 
     /**
@@ -216,10 +220,10 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                 reader.unread(lastChar);
             }
 
-            String chunkSegment = new String(tempChunkBuf, 0, charsRead);
+            String chunkSegment = stripInvalidUTF16(new String(tempChunkBuf, 0, charsRead));
 
             //get the length in bytes of the read chars
-            int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length;
+            int segmentSize = chunkSegment.getBytes(UTF_8).length;
 
             //if it will not put us past maxBytes
             if (chunkSizeBytes + segmentSize < maxBytes) {
@@ -235,15 +239,19 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
         }
     }
 
+    private static String stripInvalidUTF16(String chunkSegment) {
+        return UTF_16.decode(UTF_16.encode(chunkSegment)).toString();
+    }
+
     /**
      * Read until the maxBytes reached, whitespace, or end of reader.
     *
     * @param maxBytes
-     * @param currentSegment
+     * @param currentChunk
     *
     * @throws IOException
     */
-    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
+    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException {
         int charsRead = 0;
         boolean whitespaceFound = false;
         //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
@@ -262,9 +270,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                 if (Character.isHighSurrogate(ch)) {
                     charsRead = reader.read(tempChunkBuf, 1, 1);
                     if (charsRead == -1) {
-                        //this is the last chunk, so include the unpaired surrogate
-                        currentSegment.append(ch);
-                        chunkSizeBytes += new Character(ch).toString().getBytes(StandardCharsets.UTF_8).length;
+                        //this is the last chunk, so drop the unpaired surrogate
                         endOfReaderReached = true;
                         return;
                     } else {
@@ -275,11 +281,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                     //one char
                     chunkSegment = new String(tempChunkBuf, 0, 1);
                 }
+                chunkSegment = stripInvalidUTF16(chunkSegment);
                 //check for whitespace.
                 whitespaceFound = Character.isWhitespace(chunkSegment.codePointAt(0));
                 //add read chars to the chunk and update the length.
-                currentSegment.append(chunkSegment);
-                chunkSizeBytes += chunkSegment.getBytes(StandardCharsets.UTF_8).length;
+                currentChunk.append(chunkSegment);
+                chunkSizeBytes += chunkSegment.getBytes(UTF_8).length;
             }
         }
     }
@@ -291,20 +298,41 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
     static class Chunk {
 
         private final StringBuilder sb;
-        private final int chunksize;
+        private final int baseChunkSizeChars;
+        private final int chunkSizeBytes;
 
-        Chunk(StringBuilder sb, int baseChunkLength) {
+        Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) {
             this.sb = sb;
-            this.chunksize = baseChunkLength;
+            this.baseChunkSizeChars = baseChunkSizeChars;
+            this.chunkSizeBytes = chunkSizeBytes;
         }
 
+        /**
+         * Get the content of the chunk.
+         *
+         * @return The content of the chunk.
+         */
         @Override
         public String toString() {
             return sb.toString();
         }
 
+        /**
+         * Get the size in bytes of the utf-8 encoding of the entire chunk.
+         *
+         * @return the size in bytes of the utf-8 encoding of the entire chunk
+         */
+        public int getChunkSizeBytes() {
+            return chunkSizeBytes;
+        }
+
+        /**
+         * Get the length of the base chunk in java chars.
+         *
+         * @return the length of the base chunk in java chars.
+         */
         int getBaseChunkLength() {
-            return chunksize;
+            return baseChunkSizeChars;
         }
     }
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index d5c68b6944..45c1dad4f6 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -178,6 +178,7 @@ class Ingester {
         //after all chunks, index just the meta data, including the numChunks, of the parent file
         fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
         fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
+        fields.remove(Server.Schema.CHUNK_SIZE.toString());
         indexChunk(null, sourceName, fields);
     }
 
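Note (illustration, not part of the patch): the new stripInvalidUTF16() helper relies on the fact that Charset.encode() and Charset.decode() use the REPLACE error action, so any malformed UTF-16 in the input (for example an unpaired surrogate) comes back as U+FFFD after the round trip. A minimal, self-contained sketch of that behavior follows; the demo class and variable names are hypothetical, only the round-trip call mirrors the patched code.

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

public class StripInvalidUTF16Demo {

    private static final Charset UTF_16 = StandardCharsets.UTF_16;

    /**
     * Round-trip a string through UTF-16. The convenience encode()/decode()
     * methods replace malformed input (such as unpaired surrogates) with the
     * default replacement, so the result contains U+FFFD where the input was
     * invalid and is otherwise unchanged.
     */
    static String stripInvalidUTF16(String s) {
        return UTF_16.decode(UTF_16.encode(s)).toString();
    }

    public static void main(String[] args) {
        String dirty = "abc" + '\uD800' + "def"; // lone high surrogate, not well-formed
        String clean = stripInvalidUTF16(dirty);
        System.out.println(clean);               // abc\uFFFDdef
        System.out.println(clean.equals(dirty));  // false - the surrogate was replaced
    }
}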