Bug fix in KWS chunking

This commit is contained in:
Eugene Livis 2019-10-03 12:48:44 -04:00
parent c36620e876
commit 939d2dc4e3
2 changed files with 103 additions and 53 deletions

View File

@ -45,33 +45,69 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
private static final Charset UTF_8 = StandardCharsets.UTF_8; private static final Charset UTF_8 = StandardCharsets.UTF_8;
//Chunking algorithm parameters-------------------------------------// //Chunking algorithm parameters-------------------------------------//
/** the maximum size of a chunk, including the window. */ /**
* the maximum size of a chunk, including the window.
*/
private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
/** the minimum to read before we start the process of looking for /**
* whitespace to break at and creating an overlapping window. */ * the minimum to read before we start the process of looking for whitespace
* to break at and creating an overlapping window.
*/
private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
/** The maximum size of the chunk, before the overlapping window, even if we /**
* couldn't find whitespace to break at. */ * The maximum size of the chunk, before the overlapping window, even if we
* couldn't find whitespace to break at.
*/
private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
/** The amount of text we will read through before we give up on finding /**
* whitespace to break the chunk/window at. */ * The amount of text we will read through before we give up on finding
* whitespace to break the chunk/window at.
*/
private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
/** The number of characters to read in one go from the Reader. */ /**
* The number of characters to read in one go from the Reader.
*/
private static final int READ_CHARS_BUFFER_SIZE = 512; //chars private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
/**
* When toLowerCase() is called on a character, the lower cased output
* can be different in size than the original input. I have seen a single
* input character turn into 3 characters (and 5 bytes) after lowercasing.
* I could not find any info as to what is the upper limit of how much a
* character can "increase in size" during lower casing. I'm guesstimating
* and setting that limit at 10 bytes.
*/
private static final int MAX_CHAR_SIZE_INCREASE_IN_BYTES = 10; //bytes
////chunker state--------------------------------------------/// ////chunker state--------------------------------------------///
/** The Reader that this chunk reads from, and divides into chunks. It must /**
* be a buffered reader to ensure that mark/reset are supported. */ * The Reader that this chunk reads from, and divides into chunks. It must
* be a buffered reader to ensure that mark/reset are supported.
*/
private final PushbackReader reader; private final PushbackReader reader;
/** The local buffer of characters read from the Reader. */ /**
* The local buffer of characters read from the Reader.
*/
private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE]; private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
/** the size in bytes of the chunk (so far). */ /**
* the size in bytes of the chunk (so far).
*/
private int chunkSizeBytes = 0; private int chunkSizeBytes = 0;
/** Has the chunker reached the end of the Reader? If so, there are no more
* chunks, and the current chunk does not need a window. */ /**
* the size in bytes of the lowercased chunk (so far). Note that lowercasing
* in Java can change the size of the string so we need to make sure the
* lowercased string also fits in MAX_TOTAL_CHUNK_SIZE.
*/
private int lowerCasedChunkSizeBytes = 0;
/**
* Has the chunker reached the end of the Reader? If so, there are no more
* chunks, and the current chunk does not need a window.
*/
private boolean endOfReaderReached = false; private boolean endOfReaderReached = false;
/** Store any exception encountered reading from the Reader. */ /**
* Store any exception encountered reading from the Reader.
*/
private Exception ex; private Exception ex;
/** /**
@ -140,7 +176,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
* @param s The string to cleanup. * @param s The string to cleanup.
* *
* @return A StringBuilder with the same content as s but where all invalid * @return A StringBuilder with the same content as s but where all invalid
* code points have been replaced. * code points have been replaced.
*/ */
private static StringBuilder replaceInvalidUTF16(String s) { private static StringBuilder replaceInvalidUTF16(String s) {
/* encode the string to UTF-16 which does the replacement, see /* encode the string to UTF-16 which does the replacement, see
@ -162,16 +198,18 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
//reset state for the next chunk //reset state for the next chunk
chunkSizeBytes = 0; chunkSizeBytes = 0;
lowerCasedChunkSizeBytes = 0;
int baseChunkSizeChars = 0; int baseChunkSizeChars = 0;
StringBuilder currentChunk = new StringBuilder(); StringBuilder currentChunk = new StringBuilder();
StringBuilder currentWindow = new StringBuilder(); StringBuilder currentWindow = new StringBuilder();
StringBuilder lowerCasedChunk = new StringBuilder();
try { try {
currentChunk.append(readBaseChunk()); readBaseChunk(currentChunk, lowerCasedChunk);
baseChunkSizeChars = currentChunk.length(); //save the base chunk length baseChunkSizeChars = currentChunk.length(); //save the base chunk length
currentWindow.append(readWindow()); readWindow(currentWindow, lowerCasedChunk);
//add the window text to the current chunk. //add the window text to the current chunk.
currentChunk.append(currentWindow); currentChunk.append(currentWindow);
if (endOfReaderReached) { if (endOfReaderReached) {
/* if we have reached the end of the content, we won't make /* if we have reached the end of the content, we won't make
* another overlapping chunk, so the length of the base chunk * another overlapping chunk, so the length of the base chunk
@ -188,7 +226,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
} }
//sanitize the text and return a Chunk object, that includes the base chunk length. //sanitize the text and return a Chunk object, that includes the base chunk length.
return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes); return new Chunk(currentChunk, baseChunkSizeChars, lowerCasedChunk);
} }
/** /**
@ -196,14 +234,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
* *
* @throws IOException if there is a problem reading from the reader. * @throws IOException if there is a problem reading from the reader.
*/ */
private StringBuilder readBaseChunk() throws IOException { private void readBaseChunk(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
StringBuilder currentChunk = new StringBuilder();
//read the chunk until the minimum base chunk size //read the chunk until the minimum base chunk size
readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk); readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
//keep reading until the maximum base chunk size or white space is reached. //keep reading until the maximum base chunk size or white space is reached.
readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk); readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
return currentChunk;
} }
/** /**
@ -211,14 +247,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
* *
* @throws IOException if there is a problem reading from the reader. * @throws IOException if there is a problem reading from the reader.
*/ */
private StringBuilder readWindow() throws IOException { private void readWindow(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
StringBuilder currentWindow = new StringBuilder();
//read the window, leaving some room to look for white space to break at. //read the window, leaving some room to look for white space to break at.
readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow); readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentChunk, lowerCasedChunk);
//keep reading until the max chunk size, or until whitespace is reached. //keep reading until the max chunk size, or until whitespace is reached.
readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow); readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentChunk, lowerCasedChunk);
return currentWindow;
} }
/** /**
@ -229,10 +263,10 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
* *
* @throws IOException * @throws IOException
*/ */
private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException { private void readHelper(int maxBytes, StringBuilder currentSegment, StringBuilder currentLowerCasedSegment) throws IOException {
int charsRead = 0; int charsRead = 0;
//read chars up to maxBytes, or the end of the reader. //read chars up to maxBytes, or the end of the reader.
while ((chunkSizeBytes < maxBytes) while ((chunkSizeBytes < maxBytes) && (lowerCasedChunkSizeBytes < maxBytes)
&& (endOfReaderReached == false)) { && (endOfReaderReached == false)) {
charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE); charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
if (-1 == charsRead) { if (-1 == charsRead) {
@ -253,11 +287,19 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
//get the length in utf8 bytes of the read chars //get the length in utf8 bytes of the read chars
int segmentSize = chunkSegment.toString().getBytes(UTF_8).length; int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
// lower case the string and get its size. NOTE: lower casing can
// change the size of the string!
String lowerCasedSegment = chunkSegment.toString().toLowerCase();
int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
//if it will not put us past maxBytes //if it will not put us past maxBytes
if (chunkSizeBytes + segmentSize < maxBytes) { if ((chunkSizeBytes + segmentSize < maxBytes) && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes)) {
//add it to the chunk //add it to the chunk
currentSegment.append(chunkSegment); currentSegment.append(chunkSegment);
chunkSizeBytes += segmentSize; chunkSizeBytes += segmentSize;
currentLowerCasedSegment.append(lowerCasedSegment);
lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
} else { } else {
//unread it, and break out of read loop. //unread it, and break out of read loop.
reader.unread(tempChunkBuf, 0, charsRead); reader.unread(tempChunkBuf, 0, charsRead);
@ -275,11 +317,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
* *
* @throws IOException * @throws IOException
*/ */
private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException { private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
int charsRead = 0; int charsRead = 0;
boolean whitespaceFound = false; boolean whitespaceFound = false;
//read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader. //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
while ((chunkSizeBytes < maxBytes) while ((chunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
&& (lowerCasedChunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
&& (whitespaceFound == false) && (whitespaceFound == false)
&& (endOfReaderReached == false)) { && (endOfReaderReached == false)) {
charsRead = reader.read(tempChunkBuf, 0, 1); charsRead = reader.read(tempChunkBuf, 0, 1);
@ -314,6 +357,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
//add read chars to the chunk and update the length. //add read chars to the chunk and update the length.
currentChunk.append(sanitizedChunkSegment); currentChunk.append(sanitizedChunkSegment);
chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length; chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
// lower case the string and get its size. NOTE: lower casing can
// change the size of the string.
String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();
lowerCasedChunk.append(lowerCasedSegment);
lowerCasedChunkSizeBytes += lowerCasedSegment.getBytes(UTF_8).length;
} }
} }
} }
@ -326,16 +375,16 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
private final StringBuilder sb; private final StringBuilder sb;
private final int baseChunkSizeChars; private final int baseChunkSizeChars;
private final int chunkSizeBytes; private final StringBuilder lowerCasedChunk;
Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) { Chunk(StringBuilder sb, int baseChunkSizeChars, StringBuilder lowerCasedChunk) {
this.sb = sb; this.sb = sb;
this.baseChunkSizeChars = baseChunkSizeChars; this.baseChunkSizeChars = baseChunkSizeChars;
this.chunkSizeBytes = chunkSizeBytes; this.lowerCasedChunk = lowerCasedChunk;
} }
/** /**
* Get the content of the chunk. * Get the content of the original (non-lower cased) chunk.
* *
* @return The content of the chunk. * @return The content of the chunk.
*/ */
@ -345,16 +394,16 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
} }
/** /**
* Get the size in bytes of the utf-8 encoding of the entire chunk. * Get the content of the lower cased chunk.
* *
* @return the size in bytes of the utf-8 encoding of the entire chunk * @return The content of the chunk.
*/ */
public int getChunkSizeBytes() { public String geLowerCasedChunk() {
return chunkSizeBytes; return lowerCasedChunk.toString();
} }
/** /**
* Get the length of the base chunk in java chars. * Get the length of the original (non-lower cased) base chunk in java chars.
* *
* @return the length of the base chunk in java chars. * @return the length of the base chunk in java chars.
*/ */

View File

@ -97,7 +97,7 @@ class Ingester {
* file, but the Solr server is probably fine. * file, but the Solr server is probably fine.
*/ */
void indexMetaDataOnly(AbstractFile file) throws IngesterException { void indexMetaDataOnly(AbstractFile file) throws IngesterException {
indexChunk("", file.getName().toLowerCase(), new HashMap<>(getContentFields(file))); indexChunk("", "", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
} }
/** /**
@ -111,7 +111,7 @@ class Ingester {
* artifact, but the Solr server is probably fine. * artifact, but the Solr server is probably fine.
*/ */
void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException { void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
indexChunk("", sourceName, new HashMap<>(getContentFields(artifact))); indexChunk("", "", sourceName, new HashMap<>(getContentFields(artifact)));
} }
/** /**
@ -166,7 +166,7 @@ class Ingester {
language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang)); language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
try { try {
//add the chunk text to Solr index //add the chunk text to Solr index
indexChunk(chunk.toString(), sourceName, fields); indexChunk(chunk.toString(), chunk.geLowerCasedChunk(), sourceName, fields);
// add mini chunk when there's a language specific field // add mini chunk when there's a language specific field
if (chunker.hasNext() && language.isPresent()) { if (chunker.hasNext() && language.isPresent()) {
languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get()); languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
@ -197,7 +197,7 @@ class Ingester {
fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
//"parent" docs don't have chunk_size //"parent" docs don't have chunk_size
fields.remove(Server.Schema.CHUNK_SIZE.toString()); fields.remove(Server.Schema.CHUNK_SIZE.toString());
indexChunk(null, sourceName, fields); indexChunk(null, null, sourceName, fields);
} }
} }
return true; return true;
@ -211,12 +211,13 @@ class Ingester {
* 4.0.0), see if possible to stream with UpdateRequestHandler * 4.0.0), see if possible to stream with UpdateRequestHandler
* *
* @param chunk The chunk content as a string, or null for metadata only * @param chunk The chunk content as a string, or null for metadata only
* @param lowerCasedChunk The lower cased chunk content as a string, or null for metadata only
* @param fields * @param fields
* @param size * @param size
* *
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/ */
private void indexChunk(String chunk, String sourceName, Map<String, Object> fields) throws IngesterException { private void indexChunk(String chunk, String lowerCasedChunk, String sourceName, Map<String, Object> fields) throws IngesterException {
if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) { if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
//JMTODO: actually if we couldn't get the image id it is set to -1, //JMTODO: actually if we couldn't get the image id it is set to -1,
// but does this really mean we don't want to index it? // but does this really mean we don't want to index it?
@ -245,7 +246,7 @@ class Ingester {
// insensitive substring/regular expression search. // insensitive substring/regular expression search.
double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion()); double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
if (indexSchemaVersion >= 2.1) { if (indexSchemaVersion >= 2.1) {
updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : chunk.toLowerCase())); updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : lowerCasedChunk));
} }
TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk"); TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");