first pass at overlapping chunks

2025-07-12 07:56:16 +00:00 · 2016-12-19 11:18:24 +01:00 · 2016-12-19 11:18:24 +01:00 · 94e136b451
commit 94e136b451
parent d14c15fbdb
3 changed files with 136 additions and 88 deletions
--- a/KeywordSearch/release/solr/solr/configsets/AutopsyConfig/conf/schema.xml
+++ b/KeywordSearch/release/solr/solr/configsets/AutopsyConfig/conf/schema.xml
@ -526,6 +526,7 @@
   <!-- file chunk-specific fields (optional for others) -->
   <!-- for a parent file with no content, number of chunks are specified -->
   <field name="num_chunks" type="int" indexed="true" stored="true" required="false" />
+   <field name="chunk_size" type="int" indexed="true" stored="true" required="false" />
   
   <!-- Common metadata fields, named specifically to match up with
     SolrCell metadata when parsing rich documents such as Word, PDF.
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@ -58,7 +58,6 @@ class Ingester {
    private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
    private static Ingester instance;

-
    private Ingester() {
    }

@ -160,12 +159,13 @@ class Ingester {

            Chunker chunker = new Chunker(reader);

-            for (Chunk chunk : chunker) {
+            for (StringBuilder chunk : chunker) {
                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                fields.put(Server.Schema.ID.toString(), chunkId);
+                fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.length()));
                try {
                    //add the chunk text to Solr index
-                    indexChunk(chunk.getText().toString(), sourceName, fields);
+                    indexChunk(chunk.toString(), sourceName, fields);
                    numChunks++;
                } catch (Ingester.IngesterException ingEx) {
                    extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
@ -370,41 +370,25 @@ class Ingester {
            super(message);
        }
    }
-
-}
-
-class Chunk {
-    private final StringBuilder sb;
-    private final int chunksize;
-
-    Chunk(StringBuilder sb, int chunksize) {
-        this.sb = sb;
-        this.chunksize = chunksize;
-    }
-
-    StringBuilder getText() {
-        return sb;
-    }
-
-    int getSize() {
-        return chunksize;
-    }
 }

 /**
 * Encapsulates the content chunking algorithm in implementation of the Iterator
 * interface.
 */
-class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
+class Chunker implements Iterator<StringBuilder>, Iterable<StringBuilder> {

-    private static final int INITIAL_CHUNK_SIZE = 32 * 1024; //bytes
+    private static final int INITIAL_CHUNK_SIZE = 30 * 1024; //bytes
+    private static final int WHITE_SPACE_BUFFER_SIZE = 900; //bytes
+    private static final int MAX_WINDOW_SIZE = 1024; //bytes
    private static final int SINGLE_READ_CHARS = 1024;

-    private int chunkSizeBytes = 0;  // the size in bytes of chunk (so far)
+    private int windowSizeBytes = 0;
+    private int chunkSizeBytes = 0;  // the size in bytes of the base chunk (so far)
    private int charsRead = 0;  // number of chars read in the most recent read operation
    private boolean whitespace = false;
    private char[] tempChunkBuf;
-    private StringBuilder chunkText;
+    private StringBuilder chunk;
    private boolean endOfContent = false;
    private final Reader reader;

@ -418,7 +402,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
    }

    @Override
-    public Iterator<Chunk> iterator() {
+    public Iterator<StringBuilder> iterator() {
        return this;
    }

@ -433,64 +417,6 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
        return endOfContent == false;
    }

-    @Override
-    public Chunk next() {
-        if (hasNext()) {
-            chunkText = new StringBuilder();
-            tempChunkBuf = new char[SINGLE_READ_CHARS];
-            chunkSizeBytes = 0;
-            //read chars up to initial chunk size
-            while (chunkSizeBytes < INITIAL_CHUNK_SIZE && endOfContent == false) {
-                try {
-                    charsRead = reader.read(tempChunkBuf, 0, SINGLE_READ_CHARS);
-                } catch (IOException ex) {
-                    throw new RuntimeException("IOException while attempting to read chunk.", ex);
-                }
-                    if (-1 == charsRead) {
-                        //this is the last chunk
-                        endOfContent = true;
-                    } else {
-                        String chunkSegment = new String(tempChunkBuf, 0, charsRead);
-                        chunkSizeBytes += Utf8.encodedLength(chunkSegment);
-                        chunkText.append(chunkSegment);
-                    }
-
-            }
-            if (false == endOfContent) {
-                endOfContent = readChunkUntilWhiteSpace();
-            }
-            return new Chunk(sanitizeToUTF8(chunkText), chunkSizeBytes);
-        } else {
-            throw new NoSuchElementException("There are no more chunks.");
-        }
-    }
-
-
-    private boolean readChunkUntilWhiteSpace() {
-        charsRead = 0;
-        whitespace = false;
-        //if we haven't reached the end of the file,
-        //try to read char-by-char until whitespace to not break words
-        while ((chunkSizeBytes < INITIAL_CHUNK_SIZE)
-                && (false == whitespace)) {
-            try {
-                charsRead = reader.read(tempChunkBuf, 0, 1);
-            } catch (IOException ex) {
-                throw new RuntimeException("IOException while attempting to read chunk until whitespace.", ex);
-            }
-            if (-1 == charsRead) {
-                //this is the last chunk
-                return true;
-            } else {
-                whitespace = Character.isWhitespace(tempChunkBuf[0]);
-                String chunkSegment = new String(tempChunkBuf, 0, 1);
-                chunkSizeBytes += Utf8.encodedLength(chunkSegment);
-                chunkText.append(chunkSegment);
-            }
-        }
-        return false;
-    }
-
    /**
     * Sanitize the given StringBuilder by replacing non-UTF-8 characters with
     * caret '^'
@ -512,4 +438,121 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
        }
        return sb;
    }
+
+
+    @Override
+    public StringBuilder next() {
+        if (hasNext()) {
+            chunk = new StringBuilder();
+            tempChunkBuf = new char[SINGLE_READ_CHARS];
+            chunkSizeBytes = 0;
+            windowSizeBytes = 0;
+            //read chars up to initial chunk size
+            while (chunkSizeBytes < INITIAL_CHUNK_SIZE && endOfContent == false) {
+                try {
+                    charsRead = reader.read(tempChunkBuf, 0, SINGLE_READ_CHARS);
+                } catch (IOException ex) {
+                    throw new RuntimeException("IOException while attempting to read chunk.", ex);
+                }
+                if (-1 == charsRead) {
+                    //this is the last chunk
+                    endOfContent = true;
+                } else {
+                    String chunkSegment = new String(tempChunkBuf, 0, charsRead);
+                    chunkSizeBytes += Utf8.encodedLength(chunkSegment);
+                    chunk.append(chunkSegment);
+                }
+            }
+            if (false == endOfContent) {
+                try {
+                    endOfContent = readChunkUntilWhiteSpace();
+                } catch (IOException ex) {
+                    throw new RuntimeException("IOException while attempting to read chunk to white space.", ex);
+                }
+            }
+            return sanitizeToUTF8(chunk);
+        } else {
+            throw new NoSuchElementException("There are no more chunks.");
+        }
+    }
+
+    private boolean readWindow() throws IOException {
+        tempChunkBuf = new char[MAX_WINDOW_SIZE];
+        charsRead = 0;
+        while (windowSizeBytes < MAX_WINDOW_SIZE) {
+            charsRead = reader.read(tempChunkBuf, 0, SINGLE_READ_CHARS);
+            if (-1 == charsRead) {
+                //this is the last chunk
+                return true;
+            } else {
+                String windowSegment = new String(tempChunkBuf, 0, charsRead);
+                windowSizeBytes += Utf8.encodedLength(windowSegment);
+                chunk.append(windowSegment);
+            }
+        }
+        return readWindowUntilWhiteSpace();
+    }
+
+    private boolean readChunkUntilWhiteSpace() throws IOException {
+        charsRead = 0;
+        whitespace = false;
+        //if we haven't reached the end of the file,
+        //try to read char-by-char until whitespace to not break words
+        while ((chunkSizeBytes < INITIAL_CHUNK_SIZE + WHITE_SPACE_BUFFER_SIZE)
+                && (false == whitespace)) {
+
+            charsRead = reader.read(tempChunkBuf, 0, 1);
+            if (-1 == charsRead) {
+                //this is the last chunk
+                return true;
+            } else {
+                whitespace = Character.isWhitespace(tempChunkBuf[0]);
+                String chunkSegment = new String(tempChunkBuf, 0, 1);
+                chunkSizeBytes += Utf8.encodedLength(chunkSegment);
+                chunk.append(chunkSegment);
+            }
+        }
+        return readWindow();
+    }
+
+    private boolean readWindowUntilWhiteSpace() throws IOException {
+        tempChunkBuf = new char[1];
+        charsRead = 0;
+        whitespace = false;
+        //if we haven't reached the end of the file,
+        //try to read char-by-char until whitespace to not break words
+        while ((windowSizeBytes < MAX_WINDOW_SIZE)
+                && (false == whitespace)) {
+            charsRead = reader.read(tempChunkBuf, 0, 1);
+            if (-1 == charsRead) {
+                //this is the last chunk
+                return true;
+            } else {
+                whitespace = Character.isWhitespace(tempChunkBuf[0]);
+                String windowSegment = new String(tempChunkBuf, 0, 1);
+                windowSizeBytes += Utf8.encodedLength(windowSegment);
+                chunk.append(windowSegment);
+            }
+        }
+        return false;
+    }
+}
+
+class Chunk {
+
+    private final StringBuilder sb;
+    private final int chunksize;
+
+    Chunk(StringBuilder sb, int chunksize) {
+        this.sb = sb;
+        this.chunksize = chunksize;
+    }
+
+    StringBuilder getText() {
+        return sb;
+    }
+
+    int getSize() {
+        return chunksize;
+    }
 }
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java
@ -47,7 +47,6 @@ import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.HttpSolrServer;
 import org.apache.solr.client.solrj.impl.XMLResponseParser;
 import org.apache.solr.client.solrj.request.CoreAdminRequest;
-import org.apache.solr.client.solrj.response.CoreAdminResponse;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.client.solrj.response.TermsResponse;
 import org.apache.solr.common.SolrDocument;
@ -148,6 +147,11 @@ public class Server {
            public String toString() {
                return "num_chunks"; //NON-NLS
            }
+        },
+        CHUNK_SIZE {
+            public String toString() {
+                return "chunk_size"; //NON-NLS
+            }
        }
    };

@ -731,7 +735,7 @@ public class Server {
        }
    }

-    NamedList<Object> request(SolrRequest request) throws SolrServerException, NoOpenCoreException {
+    NamedList<Object> request(SolrRequest<?> request) throws SolrServerException, NoOpenCoreException {
        currentCoreLock.readLock().lock();
        try {
            if (null == currentCore) {
@ -1222,7 +1226,7 @@ public class Server {
            return solrCore.query(sq);
        }

-        private NamedList<Object> request(SolrRequest request) throws SolrServerException {
+        private NamedList<Object> request(SolrRequest<?> request) throws SolrServerException {
            try {
                return solrCore.request(request);
            } catch (IOException e) {