From 939d2dc4e3a37151d691d8364d44f9817556fd64 Mon Sep 17 00:00:00 2001 From: Eugene Livis Date: Thu, 3 Oct 2019 12:48:44 -0400 Subject: [PATCH] Bug fix in KWS chunking --- .../autopsy/keywordsearch/Chunker.java | 141 ++++++++++++------ .../autopsy/keywordsearch/Ingester.java | 15 +- 2 files changed, 103 insertions(+), 53 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java index 82494f2f0d..08ca0ab511 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java @@ -45,33 +45,69 @@ class Chunker implements Iterator, Iterable { private static final Charset UTF_8 = StandardCharsets.UTF_8; //Chunking algorithm paramaters-------------------------------------// - /** the maximum size of a chunk, including the window. */ + /** + * the maximum size of a chunk, including the window. + */ private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes - /** the minimum to read before we start the process of looking for - * whitespace to break at and creating an overlapping window. */ + /** + * the minimum to read before we start the process of looking for whitespace + * to break at and creating an overlapping window. + */ private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes - /** The maximum size of the chunk, before the overlapping window, even if we - * couldn't find whitespace to break at. */ + /** + * The maximum size of the chunk, before the overlapping window, even if we + * couldn't find whitespace to break at. + */ private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes - /** The amount of text we will read through before we give up on finding - * whitespace to break the chunk/window at. */ + /** + * The amount of text we will read through before we give up on finding + * whitespace to break the chunk/window at. 
+ */ private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes - /** The number of characters to read in one go from the Reader. */ + /** + * The number of characters to read in one go from the Reader. + */ private static final int READ_CHARS_BUFFER_SIZE = 512; //chars + /** + * When toLowerCase() is called on a character, the lower cased output + * can be different in size than the original input. I have seen a single + * input character turn into 3 characters (and 5 bytes) after lowercasing. + * I could not find any info as to what is the upper limit of how much a + * character can "increase in size" during lower casing. I'm guesstimating + * and setting that limit at 10 bytes. + */ + private static final int MAX_CHAR_SIZE_INCREASE_IN_BYTES = 10; //bytes ////chunker state--------------------------------------------/// - /** The Reader that this chunk reads from, and divides into chunks. It must - * be a buffered reader to ensure that mark/reset are supported. */ + /** + * The Reader that this chunk reads from, and divides into chunks. It must + * be a buffered reader to ensure that mark/reset are supported. + */ private final PushbackReader reader; - /** The local buffer of characters read from the Reader. */ + /** + * The local buffer of characters read from the Reader. + */ private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE]; - /** the size in bytes of the chunk (so far). */ + /** + * the size in bytes of the chunk (so far). + */ private int chunkSizeBytes = 0; - /** Has the chunker reached the end of the Reader? If so, there are no more - * chunks, and the current chunk does not need a window. */ + + /** + * the size in bytes of the lowercased chunk (so far). Note that lowercasing + * in Java can change the size of the string so we need to make sure the + * lowercased string also fits in MAX_TOTAL_CHUNK_SIZE. + */ + private int lowerCasedChunkSizeBytes = 0; + /** + * Has the chunker reached the end of the Reader? 
If so, there are no more + * chunks, and the current chunk does not need a window. + */ private boolean endOfReaderReached = false; - /** Store any exception encountered reading from the Reader. */ + /** + * Store any exception encountered reading from the Reader. + */ private Exception ex; /** @@ -140,7 +176,7 @@ class Chunker implements Iterator, Iterable { * @param s The string to cleanup. * * @return A StringBuilder with the same content as s but where all invalid - * code * points have been replaced. + * code * points have been replaced. */ private static StringBuilder replaceInvalidUTF16(String s) { /* encode the string to UTF-16 which does the replcement, see @@ -162,16 +198,18 @@ class Chunker implements Iterator, Iterable { //reset state for the next chunk chunkSizeBytes = 0; + lowerCasedChunkSizeBytes = 0; int baseChunkSizeChars = 0; StringBuilder currentChunk = new StringBuilder(); StringBuilder currentWindow = new StringBuilder(); + StringBuilder lowerCasedChunk = new StringBuilder(); try { - currentChunk.append(readBaseChunk()); + readBaseChunk(currentChunk, lowerCasedChunk); baseChunkSizeChars = currentChunk.length(); //save the base chunk length - currentWindow.append(readWindow()); - //add the window text to the current chunk. - currentChunk.append(currentWindow); + readWindow(currentWindow, lowerCasedChunk); + //add the window text to the current chunk. + currentChunk.append(currentWindow); if (endOfReaderReached) { /* if we have reached the end of the content,we won't make * another overlapping chunk, so the length of the base chunk @@ -186,9 +224,9 @@ class Chunker implements Iterator, Iterable { * and break any chunking loop in client code. */ ex = ioEx; } - + //sanitize the text and return a Chunk object, that includes the base chunk length. 
- return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes); + return new Chunk(currentChunk, baseChunkSizeChars, lowerCasedChunk); } /** @@ -196,14 +234,12 @@ class Chunker implements Iterator, Iterable { * * @throws IOException if there is a problem reading from the reader. */ - private StringBuilder readBaseChunk() throws IOException { - StringBuilder currentChunk = new StringBuilder(); + private void readBaseChunk(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException { //read the chunk until the minimum base chunk size - readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk); + readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk); //keep reading until the maximum base chunk size or white space is reached. - readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk); - return currentChunk; + readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk); } /** @@ -211,14 +247,12 @@ class Chunker implements Iterator, Iterable { * * @throws IOException if there is a problem reading from the reader. */ - private StringBuilder readWindow() throws IOException { - StringBuilder currentWindow = new StringBuilder(); + private void readWindow(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException { //read the window, leaving some room to look for white space to break at. - readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow); + readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentChunk, lowerCasedChunk); //keep reading until the max chunk size, or until whitespace is reached. 
- readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow); - return currentWindow; + readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentChunk, lowerCasedChunk); } /** @@ -229,10 +263,10 @@ class Chunker implements Iterator, Iterable { * * @throws IOException */ - private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException { + private void readHelper(int maxBytes, StringBuilder currentSegment, StringBuilder currentLowerCasedSegment) throws IOException { int charsRead = 0; //read chars up to maxBytes, or the end of the reader. - while ((chunkSizeBytes < maxBytes) + while ((chunkSizeBytes < maxBytes) && (lowerCasedChunkSizeBytes < maxBytes) && (endOfReaderReached == false)) { charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE); if (-1 == charsRead) { @@ -253,11 +287,19 @@ class Chunker implements Iterator, Iterable { //get the length in utf8 bytes of the read chars int segmentSize = chunkSegment.toString().getBytes(UTF_8).length; + // lower case the string and get its size. NOTE: lower casing can + // change the size of the string! + String lowerCasedSegment = chunkSegment.toString().toLowerCase(); + int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length; + //if it will not put us past maxBytes - if (chunkSizeBytes + segmentSize < maxBytes) { + if ((chunkSizeBytes + segmentSize < maxBytes) && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes)) { //add it to the chunk currentSegment.append(chunkSegment); chunkSizeBytes += segmentSize; + + currentLowerCasedSegment.append(lowerCasedSegment); + lowerCasedChunkSizeBytes += lowerCasedSegmentSize; } else { //unread it, and break out of read loop. 
reader.unread(tempChunkBuf, 0, charsRead); @@ -275,11 +317,12 @@ class Chunker implements Iterator, Iterable { * * @throws IOException */ - private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException { + private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException { int charsRead = 0; boolean whitespaceFound = false; //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader. - while ((chunkSizeBytes < maxBytes) + while ((chunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES) + && (lowerCasedChunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES) && (whitespaceFound == false) && (endOfReaderReached == false)) { charsRead = reader.read(tempChunkBuf, 0, 1); @@ -314,6 +357,12 @@ class Chunker implements Iterator, Iterable { //add read chars to the chunk and update the length. currentChunk.append(sanitizedChunkSegment); chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length; + + // lower case the string and get its size. NOTE: lower casing can + // change the size of the string. + String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase(); + lowerCasedChunk.append(lowerCasedSegment); + lowerCasedChunkSizeBytes += lowerCasedSegment.getBytes(UTF_8).length; } } } @@ -326,16 +375,16 @@ class Chunker implements Iterator, Iterable { private final StringBuilder sb; private final int baseChunkSizeChars; - private final int chunkSizeBytes; + private final StringBuilder lowerCasedChunk; - Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) { + Chunk(StringBuilder sb, int baseChunkSizeChars, StringBuilder lowerCasedChunk) { this.sb = sb; this.baseChunkSizeChars = baseChunkSizeChars; - this.chunkSizeBytes = chunkSizeBytes; + this.lowerCasedChunk = lowerCasedChunk; } /** - * Get the content of the chunk. + * Get the content of the original (non-lower cased) chunk. 
* * @return The content of the chunk. */ @@ -345,16 +394,16 @@ class Chunker implements Iterator, Iterable { } /** - * Get the size in bytes of the utf-8 encoding of the entire chunk. + * Get the content of the lower cased chunk. * - * @return the size in bytes of the utf-8 encoding of the entire chunk + * @return The content of the lower cased chunk. */ - public int getChunkSizeBytes() { - return chunkSizeBytes; + public String geLowerCasedChunk() { + return lowerCasedChunk.toString(); } /** - * Get the length of the base chunk in java chars. + * Get the length of the original (non-lower cased) base chunk in java chars. * * @return the length of the base chunk in java chars. */ diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java index be0b93088d..576b65d581 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java @@ -97,7 +97,7 @@ class Ingester { * file, but the Solr server is probably fine. */ void indexMetaDataOnly(AbstractFile file) throws IngesterException { - indexChunk("", file.getName().toLowerCase(), new HashMap<>(getContentFields(file))); + indexChunk("", "", file.getName().toLowerCase(), new HashMap<>(getContentFields(file))); } /** @@ -111,7 +111,7 @@ class Ingester { * artifact, but the Solr server is probably fine. */ void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException { - indexChunk("", sourceName, new HashMap<>(getContentFields(artifact))); + indexChunk("", "", sourceName, new HashMap<>(getContentFields(artifact))); } /** @@ -156,7 +156,7 @@ class Ingester { logger.log(Level.INFO, "File ingest cancelled. 
Cancelling keyword search indexing of {0}", sourceName); return false; } - + Chunk chunk = chunker.next(); Map fields = new HashMap<>(contentFields); String chunkId = Server.getChunkIdString(sourceID, numChunks + 1); @@ -166,7 +166,7 @@ class Ingester { language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang)); try { //add the chunk text to Solr index - indexChunk(chunk.toString(), sourceName, fields); + indexChunk(chunk.toString(), chunk.geLowerCasedChunk(), sourceName, fields); // add mini chunk when there's a language specific field if (chunker.hasNext() && language.isPresent()) { languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get()); @@ -197,7 +197,7 @@ class Ingester { fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //"parent" docs don't have chunk_size fields.remove(Server.Schema.CHUNK_SIZE.toString()); - indexChunk(null, sourceName, fields); + indexChunk(null, null, sourceName, fields); } } return true; @@ -211,12 +211,13 @@ class Ingester { * 4.0.0), see if possible to stream with UpdateRequestHandler * * @param chunk The chunk content as a string, or null for metadata only + * @param lowerCasedChunk The lower cased chunk content as a string, or null for metadata only * @param fields * @param size * * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException */ - private void indexChunk(String chunk, String sourceName, Map fields) throws IngesterException { + private void indexChunk(String chunk, String lowerCasedChunk, String sourceName, Map fields) throws IngesterException { if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) { //JMTODO: actually if the we couldn't get the image id it is set to -1, // but does this really mean we don't want to index it? @@ -245,7 +246,7 @@ class Ingester { // insensitive substring/regular expression search. 
double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion()); if (indexSchemaVersion >= 2.1) { - updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : chunk.toLowerCase())); + updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : lowerCasedChunk)); } TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");