mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-16 17:57:43 +00:00
first pass at sanitizing the utf-16 as we read it.
This commit is contained in:
parent
3f92ff0411
commit
5482fb4979
@ -21,6 +21,7 @@ package org.sleuthkit.autopsy.keywordsearch;
|
||||
import java.io.IOException;
|
||||
import java.io.PushbackReader;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
@ -38,6 +39,9 @@ import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
|
||||
@NotThreadSafe
|
||||
class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
|
||||
private static final Charset UTF_16 = StandardCharsets.UTF_16;
|
||||
private static final Charset UTF_8 = StandardCharsets.UTF_8;
|
||||
|
||||
//Chunking algorithm paramaters-------------------------------------//
|
||||
/** the maximum size of a chunk, including the window. */
|
||||
private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
|
||||
@ -157,7 +161,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
//add the window text to the current chunk.
|
||||
currentChunk.append(currentWindow);
|
||||
//sanitize the text and return a Chunk object, that includes the base chunk length.
|
||||
return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
|
||||
return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars, chunkSizeBytes);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -216,10 +220,10 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
reader.unread(lastChar);
|
||||
}
|
||||
|
||||
String chunkSegment = new String(tempChunkBuf, 0, charsRead);
|
||||
String chunkSegment = stripInvalidUTF16(new String(tempChunkBuf, 0, charsRead));
|
||||
|
||||
//get the length in bytes of the read chars
|
||||
int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length;
|
||||
int segmentSize = chunkSegment.getBytes(UTF_8).length;
|
||||
|
||||
//if it will not put us past maxBytes
|
||||
if (chunkSizeBytes + segmentSize < maxBytes) {
|
||||
@ -235,15 +239,19 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
}
|
||||
}
|
||||
|
||||
private static String stripInvalidUTF16(String chunkSegment) {
|
||||
return UTF_16.decode(UTF_16.encode(chunkSegment)).toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Read until the maxBytes reached, whitespace, or end of reader.
|
||||
*
|
||||
* @param maxBytes
|
||||
* @param currentSegment
|
||||
* @param currentChunk
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
|
||||
private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException {
|
||||
int charsRead = 0;
|
||||
boolean whitespaceFound = false;
|
||||
//read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
|
||||
@ -262,9 +270,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
if (Character.isHighSurrogate(ch)) {
|
||||
charsRead = reader.read(tempChunkBuf, 1, 1);
|
||||
if (charsRead == -1) {
|
||||
//this is the last chunk, so include the unpaired surrogate
|
||||
currentSegment.append(ch);
|
||||
chunkSizeBytes += new Character(ch).toString().getBytes(StandardCharsets.UTF_8).length;
|
||||
//this is the last chunk, so drop the unpaired surrogate
|
||||
endOfReaderReached = true;
|
||||
return;
|
||||
} else {
|
||||
@ -275,11 +281,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
//one char
|
||||
chunkSegment = new String(tempChunkBuf, 0, 1);
|
||||
}
|
||||
chunkSegment = stripInvalidUTF16(chunkSegment);
|
||||
//check for whitespace.
|
||||
whitespaceFound = Character.isWhitespace(chunkSegment.codePointAt(0));
|
||||
//add read chars to the chunk and update the length.
|
||||
currentSegment.append(chunkSegment);
|
||||
chunkSizeBytes += chunkSegment.getBytes(StandardCharsets.UTF_8).length;
|
||||
currentChunk.append(chunkSegment);
|
||||
chunkSizeBytes += chunkSegment.getBytes(UTF_8).length;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -291,20 +298,41 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
static class Chunk {
|
||||
|
||||
private final StringBuilder sb;
|
||||
private final int chunksize;
|
||||
private final int baseChunkSizeChars;
|
||||
private final int chunkSizeBytes;
|
||||
|
||||
Chunk(StringBuilder sb, int baseChunkLength) {
|
||||
Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) {
|
||||
this.sb = sb;
|
||||
this.chunksize = baseChunkLength;
|
||||
this.baseChunkSizeChars = baseChunkSizeChars;
|
||||
this.chunkSizeBytes = chunkSizeBytes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the content of the chunk.
|
||||
*
|
||||
* @return The content of the chunk.
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the size in bytes of the utf-8 encoding of the entire chunk.
|
||||
*
|
||||
* @return the size in bytes of the utf-8 encoding of the entire chunk
|
||||
*/
|
||||
public int getChunkSizeBytes() {
|
||||
return chunkSizeBytes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the length of the base chunk in java chars.
|
||||
*
|
||||
* @return the length of the base chunk in java chars.
|
||||
*/
|
||||
int getBaseChunkLength() {
|
||||
return chunksize;
|
||||
return baseChunkSizeChars;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -178,6 +178,7 @@ class Ingester {
|
||||
//after all chunks, index just the meta data, including the numChunks, of the parent file
|
||||
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
|
||||
fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
|
||||
fields.remove(Server.Schema.CHUNK_SIZE.toString());
|
||||
indexChunk(null, sourceName, fields);
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user