mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-06 21:00:22 +00:00
Fixed a bug in 32k chunking
This commit is contained in:
parent
7756f5c7b7
commit
b0c66538ec
@ -185,9 +185,8 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static StringBuilder sanitize(String s) {
|
private static StringBuilder sanitize(String s) {
|
||||||
String normStr = Normalizer.normalize(s, Normalizer.Form.NFC);
|
String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);
|
||||||
return sanitizeToUTF8(replaceInvalidUTF16(normStr));
|
return sanitizeToUTF8(replaceInvalidUTF16(normStr));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -336,8 +335,9 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
|||||||
String chunkSegment;
|
String chunkSegment;
|
||||||
if (Character.isHighSurrogate(ch)) {
|
if (Character.isHighSurrogate(ch)) {
|
||||||
//read another char into the buffer.
|
//read another char into the buffer.
|
||||||
charsRead = reader.read(tempChunkBuf, 1, 1);
|
int surrogateCharsRead = reader.read(tempChunkBuf, 1, 1);
|
||||||
if (charsRead == -1) {
|
charsRead += surrogateCharsRead;
|
||||||
|
if (surrogateCharsRead == -1) {
|
||||||
//this is the last chunk, so just drop the unpaired surrogate
|
//this is the last chunk, so just drop the unpaired surrogate
|
||||||
endOfReaderReached = true;
|
endOfReaderReached = true;
|
||||||
return;
|
return;
|
||||||
@ -352,17 +352,32 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
|||||||
|
|
||||||
//cleanup any invalid utf-16 sequences
|
//cleanup any invalid utf-16 sequences
|
||||||
StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
|
StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
|
||||||
//check for whitespace.
|
//get the length in utf8 bytes of the read chars
|
||||||
whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
|
int segmentSize = chunkSegment.getBytes(UTF_8).length;
|
||||||
//add read chars to the chunk and update the length.
|
|
||||||
currentChunk.append(sanitizedChunkSegment);
|
|
||||||
chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
|
|
||||||
|
|
||||||
// lower case the string and get its size. NOTE: lower casing can
|
// lower case the string and get its size. NOTE: lower casing can
|
||||||
// change the size of the string.
|
// change the size of the string.
|
||||||
String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();
|
String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();
|
||||||
lowerCasedChunk.append(lowerCasedSegment);
|
int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
|
||||||
lowerCasedChunkSizeBytes += lowerCasedSegment.getBytes(UTF_8).length;
|
|
||||||
|
//if it will not put us past maxBytes
|
||||||
|
if ((chunkSizeBytes + segmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
|
||||||
|
&& (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)) {
|
||||||
|
|
||||||
|
//add read chars to the chunk and update the length.
|
||||||
|
currentChunk.append(sanitizedChunkSegment);
|
||||||
|
chunkSizeBytes += segmentSize;
|
||||||
|
|
||||||
|
lowerCasedChunk.append(lowerCasedSegment);
|
||||||
|
lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
|
||||||
|
|
||||||
|
//check for whitespace.
|
||||||
|
whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
|
||||||
|
} else {
|
||||||
|
//unread it, and break out of read loop.
|
||||||
|
reader.unread(tempChunkBuf, 0, charsRead);
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user