mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-16 17:57:43 +00:00
cleanup and comments in Chunker
This commit is contained in:
parent
990433fc36
commit
161ba2098c
@ -374,30 +374,51 @@ class Ingester {
|
||||
}
|
||||
|
||||
/**
|
||||
* Encapsulates the content chunking algorithm in implementation of the Iterator
|
||||
* interface.
|
||||
* Encapsulates the content chunking algorithm in an implementation of the
|
||||
* Iterator interface. Also implements Iterable so it can be used directly in a
|
||||
* for loop. The base chunk is the part of the chunk before the overlapping
|
||||
* window. The window will be included at the end of the current chunk as well
|
||||
* as at the beginning of the next chunk.
|
||||
*/
|
||||
@NotThreadSafe
|
||||
class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
|
||||
//Chunking algorithm paramaters-------------------------------------//
|
||||
/** the maximum size of a chunk, including the window. */
|
||||
private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes
|
||||
/** the minimum to read before we start the process of looking for
|
||||
* whitespace to break at and creating an overlapping window. */
|
||||
private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
|
||||
/** The maximum size of the chunk, before the overlapping window, even if we
|
||||
* couldn't find whitespace to break at. */
|
||||
private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
|
||||
/** The amount of text we will read through before we give up on finding
|
||||
* whitespace to break the chunk/window at. */
|
||||
private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
|
||||
private static final int INITIAL_WINDOW_END = MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE;
|
||||
/** The number of characters to read in one go from the Reader. */
|
||||
private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
|
||||
private final BufferedReader reader;
|
||||
|
||||
////chunker state--------------------------------------------///
|
||||
/** The Reader that this chunk reads from, and divides into chunks. It must
|
||||
* be a buffered reader to ensure that mark/reset are supported. */
|
||||
private final BufferedReader reader;
|
||||
/** The local buffer of characters read from the Reader. */
|
||||
private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
|
||||
private int charsRead = 0; // number of chars read in the most recent read operation
|
||||
/** number of chars read in the most recent read operation. */
|
||||
private int charsRead = 0;
|
||||
|
||||
/** The text of the current chunk (so far). */
|
||||
private StringBuilder currentChunk;
|
||||
private int chunkSizeBytes = 0; // the size in bytes of the base chunk (so far)
|
||||
private int chunkSizeChars;// the size in chars of the base chunk (so far)
|
||||
/** the size in bytes of the chunk (so far). */
|
||||
private int chunkSizeBytes = 0;
|
||||
/** the size in chars of the (base) chunk (so far). */
|
||||
private int baseChunkSizeChars;
|
||||
|
||||
private boolean whitespace = false;
|
||||
private boolean endOfContent = false;
|
||||
/** has the chunker found whitespace to break on? */
|
||||
private boolean whitespaceFound = false;
|
||||
/** has the chunker reached the end of the Reader? If so, there are no more
|
||||
* chunks, and the current chunk does not need a window. */
|
||||
private boolean endOfReaderReached = false;
|
||||
|
||||
/**
|
||||
* Create a Chunker that will chunk the content of the given Reader.
|
||||
@ -413,15 +434,9 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are there any more chunks available from this chunker?
|
||||
*
|
||||
*
|
||||
* @return true if there are more chunks available.
|
||||
*/
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return endOfContent == false;
|
||||
return endOfReaderReached == false;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -435,12 +450,9 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
*/
|
||||
private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
|
||||
final int length = sb.length();
|
||||
|
||||
// Sanitize by replacing non-UTF-8 characters with caret '^'
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
|
||||
sb.replace(i, i + 1, "^");
|
||||
|
||||
}
|
||||
}
|
||||
return sb;
|
||||
@ -448,77 +460,107 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||
|
||||
@Override
|
||||
public Chunk next() {
|
||||
if (hasNext()) {
|
||||
currentChunk = new StringBuilder();
|
||||
chunkSizeBytes = 0;
|
||||
|
||||
try {
|
||||
readBaseChunk();
|
||||
reader.mark(2048);
|
||||
readWindow();
|
||||
} catch (IOException ioEx) {
|
||||
throw new RuntimeException("IOException while attempting to read chunk.", ioEx);
|
||||
}
|
||||
try {
|
||||
reader.reset();
|
||||
} catch (IOException ex) {
|
||||
throw new RuntimeException("IOException while attempting to reset chunk reader.", ex);
|
||||
}
|
||||
|
||||
if (endOfContent) {
|
||||
//if we have reached the end of the content, don't make another overlapping chunk.
|
||||
chunkSizeChars = currentChunk.length();
|
||||
}
|
||||
return new Chunk(sanitizeToUTF8(currentChunk), chunkSizeChars);
|
||||
} else {
|
||||
if (endOfReaderReached) {
|
||||
throw new NoSuchElementException("There are no more chunks.");
|
||||
}
|
||||
//reset state for the next chunk
|
||||
currentChunk = new StringBuilder();
|
||||
chunkSizeBytes = 0;
|
||||
baseChunkSizeChars = 0;
|
||||
|
||||
try {
|
||||
readBaseChunk();
|
||||
baseChunkSizeChars = currentChunk.length();
|
||||
reader.mark(2048); //mark the reader so we can rewind the reader here to begin the next chunk
|
||||
readWindow();
|
||||
} catch (IOException ioEx) {
|
||||
throw new RuntimeException("IOException while reading chunk.", ioEx);
|
||||
}
|
||||
try {
|
||||
reader.reset(); //reset the reader the so the next chunk can begin at the position marked above
|
||||
} catch (IOException ex) {
|
||||
throw new RuntimeException("IOException while resetting chunk reader.", ex);
|
||||
}
|
||||
|
||||
if (endOfReaderReached) {
|
||||
/* if we have reached the end of the content,we won't make another
|
||||
* overlapping chunk, so the base chunk can be extended to the end. */
|
||||
baseChunkSizeChars = currentChunk.length();
|
||||
}
|
||||
//sanitize the text and return a Chunk object, that includes the base chunk length.
|
||||
return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the base chunk from the reader, and attempt to break at whitespace.
|
||||
*
|
||||
* @throws IOException if there is a problem reading from the reader.
|
||||
*/
|
||||
private void readBaseChunk() throws IOException {
|
||||
//read the chunk until the minimum size
|
||||
readHelper(MINIMUM_BASE_CHUNK_SIZE, true, false);
|
||||
//keep reading until the maximum or white space is reached.
|
||||
whitespace = false;
|
||||
readHelper(MAXIMUM_BASE_CHUNK_SIZE, true, true);
|
||||
//read the chunk until the minimum base chunk size
|
||||
readHelper(MINIMUM_BASE_CHUNK_SIZE, false);
|
||||
//keep reading until the maximum base chunk size or white space is reached.
|
||||
whitespaceFound = false;
|
||||
readHelper(MAXIMUM_BASE_CHUNK_SIZE, true);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the window from the reader, and attempt to break at whitespace.
|
||||
*
|
||||
* @throws IOException if there is a problem reading from the reader.
|
||||
*/
|
||||
private void readWindow() throws IOException {
|
||||
//read the window until
|
||||
int WINDOW_END = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, chunkSizeBytes + 1024);
|
||||
readHelper(WINDOW_END, false, false);
|
||||
whitespace = false;
|
||||
readHelper(MAX_TOTAL_CHUNK_SIZE, false, true);
|
||||
//read the window, leaving some room to look for white space to break at.
|
||||
int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, chunkSizeBytes + 1024);
|
||||
readHelper(windowEnd, false);
|
||||
whitespaceFound = false;
|
||||
//keep reading until the max chunk size, or until whitespace is reached.
|
||||
windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE, chunkSizeBytes + 1024);
|
||||
readHelper(windowEnd, true);
|
||||
}
|
||||
|
||||
private void readHelper(int maxBytes, boolean inBaseChunk, boolean inWhiteSpaceBuffer) throws IOException {
|
||||
/** Helper method that implements reading in a loop.
|
||||
*
|
||||
* @param maxBytes The max cummulative length of the content,in
|
||||
* bytes, to read from the Reader. That is, when
|
||||
* chunkSizeBytes >= maxBytes stop reading.
|
||||
* @param inWhiteSpaceBuffer Should the current read stop once whitespace is
|
||||
* found?
|
||||
*
|
||||
* @throws IOException If there is a problem reading from the Reader.
|
||||
*/
|
||||
private void readHelper(int maxBytes, boolean inWhiteSpaceBuffer) throws IOException {
|
||||
//only read one character at a time if we are looking for whitespace.
|
||||
final int readSize = inWhiteSpaceBuffer ? 1 : READ_CHARS_BUFFER_SIZE;
|
||||
|
||||
//read chars up to initial chunk size
|
||||
while (chunkSizeBytes < maxBytes
|
||||
&& (false == (inWhiteSpaceBuffer && whitespace))
|
||||
&& endOfContent == false) {
|
||||
//read chars up to maxBytes, whitespaceFound if also inWhiteSpaceBuffer, or we reach the end of the reader.
|
||||
while ((chunkSizeBytes < maxBytes)
|
||||
&& (false == (inWhiteSpaceBuffer && whitespaceFound))
|
||||
&& (endOfReaderReached == false)) {
|
||||
charsRead = reader.read(tempChunkBuf, 0, readSize);
|
||||
if (-1 == charsRead) {
|
||||
//this is the last chunk
|
||||
endOfContent = true;
|
||||
endOfReaderReached = true;
|
||||
} else {
|
||||
if (inWhiteSpaceBuffer) {
|
||||
whitespace = Character.isWhitespace(tempChunkBuf[0]);
|
||||
//chec for whitespace.
|
||||
whitespaceFound = Character.isWhitespace(tempChunkBuf[0]);
|
||||
}
|
||||
|
||||
//add read chars to the chunk and update the length.
|
||||
String chunkSegment = new String(tempChunkBuf, 0, charsRead);
|
||||
chunkSizeBytes += Utf8.encodedLength(chunkSegment);
|
||||
currentChunk.append(chunkSegment);
|
||||
|
||||
if (inBaseChunk) {
|
||||
chunkSizeChars = chunkSegment.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents one chunk as the text in it and the length of the base chunk, in
|
||||
* chars.
|
||||
*/
|
||||
class Chunk {
|
||||
|
||||
private final StringBuilder sb;
|
||||
|
Loading…
x
Reference in New Issue
Block a user