From b0c66538ec2f64d0a38bd09a38e45a1bd382cc99 Mon Sep 17 00:00:00 2001 From: Eugene Livis Date: Mon, 2 Nov 2020 12:21:08 -0500 Subject: [PATCH] Fixed a bug in 32k chunking --- .../autopsy/keywordsearch/Chunker.java | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java index 9c9923181d..fd0f11ab74 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java @@ -185,9 +185,8 @@ class Chunker implements Iterator, Iterable { } private static StringBuilder sanitize(String s) { - String normStr = Normalizer.normalize(s, Normalizer.Form.NFC); + String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC); return sanitizeToUTF8(replaceInvalidUTF16(normStr)); - } @Override @@ -336,8 +335,9 @@ class Chunker implements Iterator, Iterable { String chunkSegment; if (Character.isHighSurrogate(ch)) { //read another char into the buffer. - charsRead = reader.read(tempChunkBuf, 1, 1); - if (charsRead == -1) { + int surrogateCharsRead = reader.read(tempChunkBuf, 1, 1); + charsRead += surrogateCharsRead; + if (surrogateCharsRead == -1) { //this is the last chunk, so just drop the unpaired surrogate endOfReaderReached = true; return; @@ -352,17 +352,32 @@ class Chunker implements Iterator, Iterable { //cleanup any invalid utf-16 sequences StringBuilder sanitizedChunkSegment = sanitize(chunkSegment); - //check for whitespace. - whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0)); - //add read chars to the chunk and update the length. - currentChunk.append(sanitizedChunkSegment); - chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length; - + //get the length in utf8 bytes of the read chars + int segmentSize = chunkSegment.getBytes(UTF_8).length; + // lower case the string and get it's size. NOTE: lower casing can // change the size of the string. String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase(); - lowerCasedChunk.append(lowerCasedSegment); - lowerCasedChunkSizeBytes += lowerCasedSegment.getBytes(UTF_8).length; + int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length; + + //if it will not put us past maxBytes + if ((chunkSizeBytes + segmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES) + && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)) { + + //add read chars to the chunk and update the length. + currentChunk.append(sanitizedChunkSegment); + chunkSizeBytes += segmentSize; + + lowerCasedChunk.append(lowerCasedSegment); + lowerCasedChunkSizeBytes += lowerCasedSegmentSize; + + //check for whitespace. + whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0)); + } else { + //unread it, and break out of read loop. + reader.unread(tempChunkBuf, 0, charsRead); + return; + } } } }