mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-06 21:00:22 +00:00
Fixed a bug in 32k chunking
This commit is contained in:
parent
7756f5c7b7
commit
b0c66538ec
@ -185,9 +185,8 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
}
|
||||
|
||||
private static StringBuilder sanitize(String s) {
|
||||
String normStr = Normalizer.normalize(s, Normalizer.Form.NFC);
|
||||
String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);
|
||||
return sanitizeToUTF8(replaceInvalidUTF16(normStr));
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -336,8 +335,9 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
String chunkSegment;
|
||||
if (Character.isHighSurrogate(ch)) {
|
||||
//read another char into the buffer.
|
||||
charsRead = reader.read(tempChunkBuf, 1, 1);
|
||||
if (charsRead == -1) {
|
||||
int surrogateCharsRead = reader.read(tempChunkBuf, 1, 1);
|
||||
charsRead += surrogateCharsRead;
|
||||
if (surrogateCharsRead == -1) {
|
||||
//this is the last chunk, so just drop the unpaired surrogate
|
||||
endOfReaderReached = true;
|
||||
return;
|
||||
@ -352,17 +352,32 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
|
||||
//cleanup any invalid utf-16 sequences
|
||||
StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
|
||||
//check for whitespace.
|
||||
whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
|
||||
//add read chars to the chunk and update the length.
|
||||
currentChunk.append(sanitizedChunkSegment);
|
||||
chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
|
||||
//get the length in utf8 bytes of the read chars
|
||||
int segmentSize = chunkSegment.getBytes(UTF_8).length;
|
||||
|
||||
// lower case the string and get its size. NOTE: lower casing can
|
||||
// change the size of the string.
|
||||
String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();
|
||||
int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
|
||||
|
||||
//if it will not put us past maxBytes
|
||||
if ((chunkSizeBytes + segmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
|
||||
&& (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)) {
|
||||
|
||||
//add read chars to the chunk and update the length.
|
||||
currentChunk.append(sanitizedChunkSegment);
|
||||
chunkSizeBytes += segmentSize;
|
||||
|
||||
lowerCasedChunk.append(lowerCasedSegment);
|
||||
lowerCasedChunkSizeBytes += lowerCasedSegment.getBytes(UTF_8).length;
|
||||
lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
|
||||
|
||||
//check for whitespace.
|
||||
whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
|
||||
} else {
|
||||
//unread it, and break out of read loop.
|
||||
reader.unread(tempChunkBuf, 0, charsRead);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user