diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
index d2967ce400..e3743e7def 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
@@ -1,7 +1,7 @@
 /*
  * Autopsy Forensic Browser
  *
- * Copyright 2011-2016 Basis Technology Corp.
+ * Copyright 2011-2017 Basis Technology Corp.
  * Contact: carrier <at> sleuthkit <dot> org
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,6 +21,7 @@ package org.sleuthkit.autopsy.keywordsearch;
 import java.io.IOException;
 import java.io.PushbackReader;
 import java.io.Reader;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
 import java.util.NoSuchElementException;
@@ -38,6 +39,10 @@ import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
 @NotThreadSafe
 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
 
+    //local references to standard encodings
+    private static final Charset UTF_16 = StandardCharsets.UTF_16;
+    private static final Charset UTF_8 = StandardCharsets.UTF_8;
+
     //Chunking algorithm paramaters-------------------------------------//
     /** the maximum size of a chunk, including the window. */
     private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
@@ -84,7 +89,10 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
     }
     /**
-     * Has this Chunker encountered an exception reading from the Reader.
+     * Has this Chunker encountered an exception reading from the Reader?
+     *
+     *
+     * @return True if this Chunker encountered an exception.
      */
     boolean hasException() {
         return ex != null;
     }
@@ -124,6 +132,25 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
         return sb;
     }
 
+    /**
+     * Cleanup invalid codepoint sequences by replacing them with the default
+     * replacement character: U+FFFD / �.
+     *
+     * @param s The string to cleanup.
+     *
+     * @return A StringBuilder with the same content as s but where all invalid
+     *         code points have been replaced.
+     */
+    private static StringBuilder replaceInvalidUTF16(String s) {
+        /* encode the string to UTF-16 which does the replacement, see
+         * Charset.encode(), then decode back to a StringBuilder. */
+        return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
+    }
+
+    private static StringBuilder sanitize(String s) {
+        return sanitizeToUTF8(replaceInvalidUTF16(s));
+    }
+
     @Override
     public Chunk next() {
         if (hasNext() == false) {
@@ -157,7 +184,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
         //add the window text to the current chunk.
         currentChunk.append(currentWindow);
         //sanitize the text and return a Chunk object, that includes the base chunk length.
-        return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
+        return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes);
     }
 
     /**
@@ -216,10 +243,11 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                 reader.unread(lastChar);
             }
 
-            String chunkSegment = new String(tempChunkBuf, 0, charsRead);
+            //cleanup any invalid utf-16 sequences
+            StringBuilder chunkSegment = sanitize(new String(tempChunkBuf, 0, charsRead));
 
-            //get the length in bytes of the read chars
-            int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length;
+            //get the length in utf8 bytes of the read chars
+            int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
 
             //if it will not put us past maxBytes
             if (chunkSizeBytes + segmentSize < maxBytes) {
@@ -239,11 +267,11 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
      * Read until the maxBytes reached, whitespace, or end of reader.
      *
      * @param maxBytes
-     * @param currentSegment
+     * @param currentChunk
      *
     * @throws IOException
      */
-    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
+    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException {
         int charsRead = 0;
         boolean whitespaceFound = false;
         //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
@@ -260,26 +288,28 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                 final char ch = tempChunkBuf[0];
                 String chunkSegment;
                 if (Character.isHighSurrogate(ch)) {
+                    //read another char into the buffer.
                     charsRead = reader.read(tempChunkBuf, 1, 1);
                     if (charsRead == -1) {
-                        //this is the last chunk, so include the unpaired surrogate
-                        currentSegment.append(ch);
-                        chunkSizeBytes += new Character(ch).toString().getBytes(StandardCharsets.UTF_8).length;
+                        //this is the last chunk, so just drop the unpaired surrogate
                        endOfReaderReached = true;
                         return;
                     } else {
-                        //use the surrogate pair in place of the unpaired surrogate.
+                        //try to use the pair together.
                         chunkSegment = new String(tempChunkBuf, 0, 2);
                     }
                 } else {
                     //one char
                     chunkSegment = new String(tempChunkBuf, 0, 1);
                 }
+
+                //cleanup any invalid utf-16 sequences
+                StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
                 //check for whitespace.
-                whitespaceFound = Character.isWhitespace(chunkSegment.codePointAt(0));
+                whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
                 //add read chars to the chunk and update the length.
-                currentSegment.append(chunkSegment);
-                chunkSizeBytes += chunkSegment.getBytes(StandardCharsets.UTF_8).length;
+                currentChunk.append(sanitizedChunkSegment);
+                chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
             }
         }
     }
@@ -291,20 +321,41 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
     static class Chunk {
 
         private final StringBuilder sb;
-        private final int chunksize;
+        private final int baseChunkSizeChars;
+        private final int chunkSizeBytes;
 
-        Chunk(StringBuilder sb, int baseChunkLength) {
+        Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) {
             this.sb = sb;
-            this.chunksize = baseChunkLength;
+            this.baseChunkSizeChars = baseChunkSizeChars;
+            this.chunkSizeBytes = chunkSizeBytes;
         }
 
+        /**
+         * Get the content of the chunk.
+         *
+         * @return The content of the chunk.
+         */
         @Override
        public String toString() {
             return sb.toString();
         }
 
+        /**
+         * Get the size in bytes of the utf-8 encoding of the entire chunk.
+         *
+         * @return the size in bytes of the utf-8 encoding of the entire chunk
+         */
+        public int getChunkSizeBytes() {
+            return chunkSizeBytes;
+        }
+
+        /**
+         * Get the length of the base chunk in java chars.
+         *
+         * @return the length of the base chunk in java chars.
+         */
         int getBaseChunkLength() {
-            return chunksize;
+            return baseChunkSizeChars;
        }
     }
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index d5c68b6944..9379898051 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -1,7 +1,7 @@
 /*
  * Autopsy Forensic Browser
  *
- * Copyright 2011-2016 Basis Technology Corp.
+ * Copyright 2011-2017 Basis Technology Corp.
  * Contact: carrier <at> sleuthkit <dot> org
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -177,7 +177,10 @@ class Ingester {
         } finally {
             //after all chunks, index just the meta data, including the numChunks, of the parent file
             fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
-            fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
+            //reset id field to base document id
+            fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
+            //"parent" docs don't have chunk_size
+            fields.remove(Server.Schema.CHUNK_SIZE.toString());
             indexChunk(null, sourceName, fields);
         }
 
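The heart of the Chunker changes is the new replaceInvalidUTF16() helper: it sanitizes a string by round-tripping it through the UTF-16 Charset, because Charset.encode() and Charset.decode() replace malformed input (for example an unpaired surrogate) with the default replacement character U+FFFD. A minimal standalone sketch of that behavior follows; the class name and sample strings are illustrative only, and the expected results in the comments are what the JDK replacement behavior should produce, not output captured from the patch.

    import java.nio.charset.Charset;
    import java.nio.charset.StandardCharsets;

    public class Utf16SanitizeDemo {

        private static final Charset UTF_16 = StandardCharsets.UTF_16;

        //Same round trip as Chunker.replaceInvalidUTF16(): encoding to UTF-16
        //replaces malformed sequences with the charset's replacement bytes,
        //and decoding turns those bytes back into U+FFFD characters.
        static StringBuilder replaceInvalidUTF16(String s) {
            return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
        }

        public static void main(String[] args) {
            String valid = "a\uD83D\uDE00b";   //a valid surrogate pair (U+1F600)
            String broken = "a\uD83Db";        //an unpaired high surrogate

            System.out.println(replaceInvalidUTF16(valid));  //expected: a😀b (unchanged)
            System.out.println(replaceInvalidUTF16(broken)); //expected: a�b (surrogate replaced by U+FFFD)
        }
    }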
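The other thread of the change is size accounting: Chunk now carries chunkSizeBytes (the UTF-8 size of the whole chunk) separately from baseChunkSizeChars, and the Chunker measures segments with getBytes(UTF_8) rather than char counts, since the MAX_TOTAL_CHUNK_SIZE limit of 32760 is expressed in bytes and a java char count generally differs from the UTF-8 byte count. A quick illustration of that mismatch, using a hypothetical demo class that is not part of the patch:

    import java.nio.charset.StandardCharsets;

    public class Utf8LengthDemo {
        public static void main(String[] args) {
            //U+00E9 (2 UTF-8 bytes) followed by U+1F600 (4 UTF-8 bytes, a surrogate pair in UTF-16)
            String s = "\u00E9\uD83D\uDE00";

            System.out.println(s.length());                                 //3 java chars (UTF-16 code units)
            System.out.println(s.codePointCount(0, s.length()));            //2 code points
            System.out.println(s.getBytes(StandardCharsets.UTF_8).length);  //6 UTF-8 bytes
        }
    }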