Fixed a bug in 32k chunking

This commit is contained in:
Eugene Livis 2020-11-02 12:21:08 -05:00
parent 7756f5c7b7
commit b0c66538ec

View File

@ -185,9 +185,8 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
}
/**
 * Sanitizes a string: Unicode-normalizes it with {@link java.text.Normalizer},
 * then passes the result through replaceInvalidUTF16 and sanitizeToUTF8.
 *
 * NOTE(review): this span looks like a rendered diff hunk with the +/- markers
 * stripped, which is why normStr is declared twice. Presumably the NFC line is
 * the removed version and the NFKC line is the added one - confirm against the
 * repository before treating either as the live code.
 *
 * @param s the string to sanitize
 *
 * @return the StringBuilder produced by sanitizeToUTF8 for the normalized string
 */
private static StringBuilder sanitize(String s) {
// canonical composition (NFC) variant of the normalization
String normStr = Normalizer.normalize(s, Normalizer.Form.NFC);
// compatibility composition (NFKC) variant of the normalization
String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);
// replaceInvalidUTF16 runs first; its result is handed to sanitizeToUTF8
return sanitizeToUTF8(replaceInvalidUTF16(normStr));
}
@Override
@ -336,8 +335,9 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
String chunkSegment;
if (Character.isHighSurrogate(ch)) {
//read another char into the buffer.
charsRead = reader.read(tempChunkBuf, 1, 1);
if (charsRead == -1) {
int surrogateCharsRead = reader.read(tempChunkBuf, 1, 1);
charsRead += surrogateCharsRead;
if (surrogateCharsRead == -1) {
//this is the last chunk, so just drop the unpaired surrogate
endOfReaderReached = true;
return;
@ -352,17 +352,32 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
//cleanup any invalid utf-16 sequences
StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
//check for whitespace.
whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
//add read chars to the chunk and update the length.
currentChunk.append(sanitizedChunkSegment);
chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
//get the length in utf8 bytes of the read chars
int segmentSize = chunkSegment.getBytes(UTF_8).length;
// lower case the string and get its size. NOTE: lower casing can
// change the size of the string.
String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();
int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
//if it will not put us past maxBytes
if ((chunkSizeBytes + segmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
&& (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)) {
//add read chars to the chunk and update the length.
currentChunk.append(sanitizedChunkSegment);
chunkSizeBytes += segmentSize;
lowerCasedChunk.append(lowerCasedSegment);
lowerCasedChunkSizeBytes += lowerCasedSegment.getBytes(UTF_8).length;
lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
//check for whitespace.
whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
} else {
//unread it, and break out of read loop.
reader.unread(tempChunkBuf, 0, charsRead);
return;
}
}
}
}