Merge remote-tracking branch 'upstream/search_improvements' into open_case_resources

2025-07-16 17:57:43 +00:00 · 2017-01-23 11:59:34 -05:00 · 2017-01-23 11:59:34 -05:00 · 938edeb0ab
commit 938edeb0ab
parent 0158622631 64b07cb4d2
8 changed files with 357 additions and 250 deletions
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
@ -39,6 +39,7 @@ import org.sleuthkit.datamodel.TskCoreException;
 * artifact's attributes.
 */
 class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
+
    static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());

    /**
@ -82,26 +83,31 @@ class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
    }

    @Override
-     public boolean isDisabled() {
+    public boolean isDisabled() {
        return false;
-     }
+    }

-     @Override
-     public void logWarning(final String msg, Exception ex) {
+    @Override
+    public void logWarning(final String msg, Exception ex) {
        logger.log(Level.WARNING, msg, ex); //NON-NLS  }
    }

-    private InputStream getInputStream(BlackboardArtifact artifact) {
+    private InputStream getInputStream(BlackboardArtifact artifact) throws TextExtractorException {
        // Concatenate the string values of all attributes into a single
        // "content" string to be indexed.
        StringBuilder artifactContents = new StringBuilder();

+        Content dataSource = null;
        try {
-            Content dataSource = getDataSource(artifact);
-            if (dataSource == null) {
-                return null;
-            }
+            dataSource = getDataSource(artifact);
+        } catch (TskCoreException tskCoreException) {
+            throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
+        }
+        if (dataSource == null) {
+            throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString());
+        }

+        try {
            for (BlackboardAttribute attribute : artifact.getAttributes()) {
                artifactContents.append(attribute.getAttributeType().getDisplayName());
                artifactContents.append(" : ");
@ -119,18 +125,15 @@ class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
                }
                artifactContents.append(System.lineSeparator());
            }
-        } catch (TskCoreException ex) {
-            logger.log(Level.SEVERE, "There was a problem getting the atributes for artifact " + artifact.getArtifactID(), ex);
-            return null;
-        }
-        if (artifactContents.length() == 0) {
-            return null;
+        } catch (TskCoreException tskCoreException) {
+            throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
        }
+
        return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
    }

    @Override
-    public Reader getReader(BlackboardArtifact source) throws Ingester.IngesterException {
+    public Reader getReader(BlackboardArtifact source) throws TextExtractorException {
        return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
    }

--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
@ -0,0 +1,310 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2011-2016 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.io.IOException;
+import java.io.PushbackReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import javax.annotation.concurrent.NotThreadSafe;
+import org.sleuthkit.autopsy.coreutils.TextUtil;
+import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
+
+/**
+ * Encapsulates the content chunking algorithm in an implementation of the
+ * Iterator interface. Also implements Iterable so it can be used directly in a
+ * for loop. The base chunk is the part of the chunk before the overlapping
+ * window. The window will be included at the end of the current chunk as well
+ * as at the beginning of the next chunk.
+ */
+@NotThreadSafe
+class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
+
+    //Chunking algorithm parameters-------------------------------------//
+    /** the maximum size of a chunk, including the window. */
+    private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes
+    /** the minimum to read before we start the process of looking for
+     * whitespace to break at and creating an overlapping window. */
+    private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
+    /** The maximum size of the chunk, before the overlapping window, even if we
+     * couldn't find whitespace to break at. */
+    private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
+    /** The amount of text we will read through before we give up on finding
+     * whitespace to break the chunk/window at. */
+    private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
+    /** The number of characters to read in one go from the Reader. */
+    private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
+
+    ////chunker state--------------------------------------------///
+    /** The Reader that this chunker reads from, and divides into chunks. It is
+     * a PushbackReader so that characters already read can be unread. */
+    private final PushbackReader reader;
+    /** The local buffer of characters read from the Reader. */
+    private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
+
+    /** the size in bytes of the chunk (so far). */
+    private int chunkSizeBytes = 0;
+    /** Has the chunker reached the end of the Reader? If so, there are no more
+     * chunks, and the current chunk does not need a window. */
+    private boolean endOfReaderReached = false;
+    /** Store any exception encountered reading from the Reader. */
+    private Exception ex;
+
+    /**
+     * Create a Chunker that will chunk the content of the given Reader.
+     *
+     * The given Reader is wrapped in a PushbackReader, with a pushback buffer
+     * of MAX_TOTAL_CHUNK_SIZE chars, so that text which has already been read
+     * can be unread and then read again.
+     *
+     * @param reader The content to chunk.
+     */
+    Chunker(Reader reader) {
+        //Using MAX_TOTAL_CHUNK_SIZE is safe but probably overkill.
+        this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
+    }
+
+    /**
+     * Get an Iterator over the chunks. This Chunker is its own Iterator, so
+     * every call returns the same object and shares the same read position.
+     *
+     * @return This Chunker.
+     */
+    @Override
+    public Iterator<Chunk> iterator() {
+        return this;
+    }
+
+    /**
+     * Has this Chunker encountered an exception reading from the Reader.
+     *
+     * @return True if an exception has been stored, false otherwise.
+     */
+    boolean hasException() {
+        return ex != null;
+    }
+
+    /**
+     * Get the exception encountered reading from the Reader. The exception is
+     * stored by next() instead of being thrown from it.
+     *
+     * @return The exception, or null if no exception was encountered.
+     */
+    public Exception getException() {
+        return ex;
+    }
+
+    /**
+     * Are there more chunks to be read?
+     *
+     * @return False once an exception has been stored or the end of the Reader
+     *         has been reached, true otherwise.
+     */
+    @Override
+    public boolean hasNext() {
+        final boolean failed = (ex != null);
+        return !failed && !endOfReaderReached;
+    }
+
+    /**
+     * Sanitize the given StringBuilder, in place, by overwriting every char
+     * that TextUtil.isValidSolrUTF8() rejects with a caret '^'.
+     *
+     * @param sb the StringBuilder to sanitize
+     *
+     * @return The same StringBuilder that was passed in.
+     *
+     * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
+     * function?
+     */
+    private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
+        //a one-for-one substitution never changes the length, so it is read once.
+        final int charCount = sb.length();
+        for (int index = 0; index < charCount; index++) {
+            final char candidate = sb.charAt(index);
+            if (!TextUtil.isValidSolrUTF8(candidate)) {
+                sb.setCharAt(index, '^');
+            }
+        }
+        return sb;
+    }
+
+    /**
+     * Read the next chunk: a base chunk followed by an overlapping window.
+     * Unless the end of the Reader was reached, the window is unread so that
+     * it is read again at the beginning of the following chunk.
+     *
+     * An exception thrown while reading is not propagated. It is stored (see
+     * hasException() / getException()) and whatever text was assembled before
+     * the failure is returned.
+     *
+     * @return The next Chunk, with the length in chars of its base chunk.
+     *
+     * @throws NoSuchElementException if hasNext() is false.
+     */
+    @Override
+    public Chunk next() {
+        if (hasNext() == false) {
+            throw new NoSuchElementException("There are no more chunks.");
+        }
+        //reset state for the next chunk
+        chunkSizeBytes = 0;
+        int baseChunkSizeChars = 0;
+        StringBuilder currentChunk = new StringBuilder();
+
+        try {
+            currentChunk.append(readBaseChunk());
+            baseChunkSizeChars = currentChunk.length(); //save the base chunk length
+            StringBuilder currentWindow = readWindow();
+            /* Add the window text to the chunk BEFORE the end-of-reader check
+             * below, so that extending the base chunk "to the end" really
+             * includes the window. */
+            currentChunk.append(currentWindow);
+            if (endOfReaderReached) {
+                /* if we have reached the end of the content,we won't make
+                 * another overlapping chunk, so the length of the base chunk
+                 * can be extended to the end. */
+                baseChunkSizeChars = currentChunk.length();
+            } else {
+                /* otherwise we will make another chunk, so unread the window */
+                reader.unread(currentWindow.toString().toCharArray());
+            }
+        } catch (Exception ioEx) {
+            /* Save the exception, which will cause hasNext() to return false,
+             * and break any chunking loop in client code. */
+            ex = ioEx;
+        }
+        //sanitize the text and return a Chunk object, that includes the base chunk length.
+        return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
+    }
+
+    /**
+     * Read the base chunk from the reader: first in bulk up to the minimum
+     * base chunk size, then one char at a time until whitespace is found or
+     * the maximum base chunk size is reached.
+     *
+     * @return The text of the base chunk.
+     *
+     * @throws IOException if there is a problem reading from the reader.
+     */
+    private StringBuilder readBaseChunk() throws IOException {
+        final StringBuilder baseChunk = new StringBuilder();
+        //bulk read up to the minimum base chunk size...
+        readHelper(MINIMUM_BASE_CHUNK_SIZE, baseChunk);
+        //...then look for whitespace to break at, up to the maximum base chunk size.
+        readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, baseChunk);
+        return baseChunk;
+    }
+
+    /**
+     * Read the overlapping window from the reader: first in bulk, leaving
+     * WHITE_SPACE_BUFFER_SIZE bytes of room, then one char at a time until
+     * whitespace is found or the maximum total chunk size is reached.
+     *
+     * @return The text of the window.
+     *
+     * @throws IOException if there is a problem reading from the reader.
+     */
+    private StringBuilder readWindow() throws IOException {
+        final StringBuilder window = new StringBuilder();
+        //bulk read, stopping short of the limit to leave room to look for white space.
+        final int bulkReadLimit = MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE;
+        readHelper(bulkReadLimit, window);
+        //then look for whitespace to break at, up to the max chunk size.
+        readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, window);
+        return window;
+    }
+
+    /**
+     * Read blocks of chars until the chunk size reaches maxBytes, the next
+     * block would not fit, or the end of the reader is reached. A block that
+     * would not fit is unread so that it can be read again later.
+     *
+     * A surrogate pair is never split across two blocks: a high surrogate at
+     * the end of a block is unread so that it is read together with its
+     * partner. If the high surrogate is the only char that was read, its
+     * partner is read immediately instead; unreading it would make no
+     * progress and, if it is the very last char of the content, would loop
+     * forever.
+     *
+     * @param maxBytes       The cumulative size, in bytes of UTF-8, at which
+     *                       to stop reading.
+     * @param currentSegment The StringBuilder that read text is appended to.
+     *
+     * @throws IOException If there is a problem reading from the reader.
+     */
+    private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
+        //read chars up to maxBytes, or the end of the reader.
+        while ((chunkSizeBytes < maxBytes)
+                && (endOfReaderReached == false)) {
+            int charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
+            if (-1 == charsRead) {
+                //this is the last chunk
+                endOfReaderReached = true;
+                return;
+            }
+
+            boolean unpairedSurrogateAtEnd = false;
+            final char lastChar = tempChunkBuf[charsRead - 1];
+            if (Character.isHighSurrogate(lastChar)) {
+                if (charsRead > 1) {
+                    //the last char might be part of a surrogate pair, so unread it.
+                    charsRead--;
+                    reader.unread(lastChar);
+                } else if (-1 == reader.read(tempChunkBuf, 1, 1)) {
+                    //the content ends with an unpaired surrogate, keep it as is.
+                    unpairedSurrogateAtEnd = true;
+                } else {
+                    //the partner char was read, so the segment is the whole pair.
+                    charsRead = 2;
+                }
+            }
+
+            String chunkSegment = new String(tempChunkBuf, 0, charsRead);
+
+            //get the length in bytes of the read chars
+            int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length;
+
+            //if it will not put us past maxBytes
+            if (chunkSizeBytes + segmentSize < maxBytes) {
+                //add it to the chunk
+                currentSegment.append(chunkSegment);
+                chunkSizeBytes += segmentSize;
+                if (unpairedSurrogateAtEnd) {
+                    //nothing is left to read, this is the last chunk
+                    endOfReaderReached = true;
+                    return;
+                }
+            } else {
+                //unread it, and break out of read loop.
+                reader.unread(tempChunkBuf, 0, charsRead);
+                return;
+            }
+        }
+    }
+
+    /**
+     * Read one char (or one surrogate pair) at a time until the chunk size
+     * reaches maxBytes, whitespace has been read, or the end of the reader is
+     * reached. The whitespace that ends the read is included in the segment.
+     *
+     * Text that would take the chunk past maxBytes is unread instead of
+     * appended, so the chunk never exceeds maxBytes.
+     *
+     * @param maxBytes       The cumulative size, in bytes of UTF-8, at which
+     *                       to stop reading.
+     * @param currentSegment The StringBuilder that read text is appended to.
+     *
+     * @throws IOException If there is a problem reading from the reader.
+     */
+    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
+        boolean whitespaceFound = false;
+        //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
+        while ((chunkSizeBytes < maxBytes)
+                && (whitespaceFound == false)
+                && (endOfReaderReached == false)) {
+            if (-1 == reader.read(tempChunkBuf, 0, 1)) {
+                //this is the last chunk
+                endOfReaderReached = true;
+                return;
+            }
+
+            int segmentChars = 1;
+            boolean unpairedSurrogateAtEnd = false;
+            //if the char might be part of a surrogate pair, read another char
+            if (Character.isHighSurrogate(tempChunkBuf[0])) {
+                if (-1 == reader.read(tempChunkBuf, 1, 1)) {
+                    //this is the last chunk, so include the unpaired surrogate
+                    unpairedSurrogateAtEnd = true;
+                } else {
+                    //use the surrogate pair in place of the unpaired surrogate.
+                    segmentChars = 2;
+                }
+            }
+
+            String chunkSegment = new String(tempChunkBuf, 0, segmentChars);
+            int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length;
+            if (chunkSizeBytes + segmentSize > maxBytes) {
+                //it does not fit, so unread it and leave it for the next read.
+                reader.unread(tempChunkBuf, 0, segmentChars);
+                return;
+            }
+
+            //add read chars to the chunk and update the length.
+            currentSegment.append(chunkSegment);
+            chunkSizeBytes += segmentSize;
+            if (unpairedSurrogateAtEnd) {
+                endOfReaderReached = true;
+                return;
+            }
+            //check for whitespace.
+            whitespaceFound = Character.isWhitespace(chunkSegment.codePointAt(0));
+        }
+    }
+
+    /**
+     * One chunk of content: the full text of the chunk, plus the length in
+     * chars of the base chunk within it.
+     */
+    static class Chunk {
+
+        /** The text of the chunk. */
+        private final StringBuilder text;
+        /** The length, in chars, of the base chunk. */
+        private final int baseChunkLength;
+
+        /**
+         * @param sb              The text of the chunk.
+         * @param baseChunkLength The length, in chars, of the base chunk.
+         */
+        Chunk(StringBuilder sb, int baseChunkLength) {
+            this.text = sb;
+            this.baseChunkLength = baseChunkLength;
+        }
+
+        /**
+         * @return The text of the chunk.
+         */
+        @Override
+        public String toString() {
+            return text.toString();
+        }
+
+        /**
+         * @return The length, in chars, of the base chunk.
+         */
+        int getBaseChunkLength() {
+            return baseChunkLength;
+        }
+    }
+}
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java
@ -95,7 +95,7 @@ abstract class FileTextExtractor implements TextExtractor< AbstractFile> {
    abstract boolean isSupported(AbstractFile file, String detectedFormat);

    @Override
-    public abstract Reader getReader(AbstractFile source) throws Ingester.IngesterException;
+    public abstract Reader getReader(AbstractFile source) throws TextExtractorException;

    @Override
    public long getID(AbstractFile source) {
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
@ -63,7 +63,7 @@ class HtmlTextExtractor extends FileTextExtractor {
    }

    @Override
-    public Reader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
+    public Reader getReader(AbstractFile sourceFile) throws TextExtractorException {
        ReadContentInputStream stream = new ReadContentInputStream(sourceFile);

        //Parse the stream with Jericho and put the results in a Reader
@ -157,7 +157,7 @@ class HtmlTextExtractor extends FileTextExtractor {
            // All done, now make it a reader
            return new StringReader(stringBuilder.toString());
        } catch (IOException ex) {
-            throw new Ingester.IngesterException("Error extracting HTML from content.", ex);
+            throw new TextExtractorException("Error extracting HTML from content.", ex);
        }
    }

--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@ -18,22 +18,17 @@
 */
 package org.sleuthkit.autopsy.keywordsearch;

-import com.google.common.base.Utf8;
 import java.io.BufferedReader;
-import java.io.IOException;
 import java.util.HashMap;
-import java.util.Iterator;
 import java.util.Map;
-import java.util.NoSuchElementException;
 import java.util.logging.Level;
-import javax.annotation.concurrent.NotThreadSafe;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.common.SolrInputDocument;
 import org.openide.util.NbBundle;
 import org.sleuthkit.autopsy.coreutils.Logger;
-import org.sleuthkit.autopsy.coreutils.TextUtil;
 import org.sleuthkit.autopsy.datamodel.ContentUtils;
 import org.sleuthkit.autopsy.ingest.IngestJobContext;
+import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.DerivedFile;
@ -148,8 +143,8 @@ class Ingester {
        int numChunks = 0; //unknown until chunking is done

        if (extractor.isDisabled()) {
-            /* some Extrctors, notable the strings extractor, have options which
-             * can be configured such that no extraction should be done */
+            /* some Extractors, notable the strings extractor, have options
+             * which can be configured such that no extraction should be done */
            return true;
        }

@ -170,13 +165,12 @@ class Ingester {
                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS

                    throw ingEx; //need to rethrow to signal error and move on
-                } catch (Exception ex) {
-                    throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
                }
            }
-        } catch (IOException ex) {
-            extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
-            return false;
+            if (chunker.hasException()) {
+                extractor.logWarning("Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
+                return false;
+            }
        } catch (Exception ex) {
            extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
@ -191,7 +185,7 @@ class Ingester {
    }

    /**
-     * Add one chunk as to the Solr index as a seperate sold document.
+     * Add one chunk as to the Solr index as a separate Solr document.
     *
     * TODO see if can use a byte or string streaming way to add content to
     * /update handler e.g. with XMLUpdateRequestHandler (deprecated in SOlr
@ -231,7 +225,7 @@ class Ingester {
            uncommitedIngests = true;

        } catch (KeywordSearchModuleException ex) {
-            //JMTODO: does this need to ne internationalized?
+            //JMTODO: does this need to be internationalized?
            throw new IngesterException(
                    NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
        }
@ -369,211 +363,3 @@ class Ingester {
        }
    }
 }
-
-/**
- * Encapsulates the content chunking algorithm in an implementation of the
- * Iterator interface. Also implements Iterable so it can be used directly in a
- * for loop. The base chunk is the part of the chunk before the overlapping
- * window. The window will be included at the end of the current chunk as well
- * as at the beginning of the next chunk.
- */
-@NotThreadSafe
-class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
-
-    //Chunking algorithm paramaters-------------------------------------//
-    /** the maximum size of a chunk, including the window. */
-    private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes
-    /** the minimum to read before we start the process of looking for
-     * whitespace to break at and creating an overlapping window. */
-    private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
-    /** The maximum size of the chunk, before the overlapping window, even if we
-     * couldn't find whitespace to break at. */
-    private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
-    /** The amount of text we will read through before we give up on finding
-     * whitespace to break the chunk/window at. */
-    private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
-    /** The number of characters to read in one go from the Reader. */
-    private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
-
-    ////chunker state--------------------------------------------///
-    /** The Reader that this chunk reads from, and divides into chunks. It must
-     * be a buffered reader to ensure that mark/reset are supported. */
-    private final BufferedReader reader;
-    /** The local buffer of characters read from the Reader. */
-    private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
-    /** number of chars read in the most recent read operation. */
-    private int charsRead = 0;
-
-    /** The text of the current chunk (so far). */
-    private StringBuilder currentChunk;
-    /** the size in bytes of the chunk (so far). */
-    private int chunkSizeBytes = 0;
-    /** the size in chars of the (base) chunk (so far). */
-    private int baseChunkSizeChars;
-
-    /** has the chunker found whitespace to break on? */
-    private boolean whitespaceFound = false;
-    /** has the chunker reached the end of the Reader? If so, there are no more
-     * chunks, and the current chunk does not need a window. */
-    private boolean endOfReaderReached = false;
-
-    /**
-     * Create a Chunker that will chunk the content of the given Reader.
-     *
-     * @param reader The content to chunk.
-     */
-    Chunker(BufferedReader reader) {
-        this.reader = reader;
-    }
-
-    @Override
-    public Iterator<Chunk> iterator() {
-        return this;
-    }
-
-    @Override
-    public boolean hasNext() {
-        return endOfReaderReached == false;
-    }
-
-    /**
-     * Sanitize the given StringBuilder by replacing non-UTF-8 characters with
-     * caret '^'
-     *
-     * @param sb the StringBuilder to sanitize
-     *
-     * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
-     * function?
-     */
-    private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
-        final int length = sb.length();
-        for (int i = 0; i < length; i++) {
-            if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
-                sb.replace(i, i + 1, "^");
-            }
-        }
-        return sb;
-    }
-
-    @Override
-    public Chunk next() {
-        if (endOfReaderReached) {
-            throw new NoSuchElementException("There are no more chunks.");
-        }
-        //reset state for the next chunk
-        currentChunk = new StringBuilder();
-        chunkSizeBytes = 0;
-        baseChunkSizeChars = 0;
-
-        try {
-            readBaseChunk();
-            baseChunkSizeChars = currentChunk.length();
-            reader.mark(2048); //mark the reader so we can rewind the reader here to begin the next chunk
-            readWindow();
-        } catch (IOException ioEx) {
-            throw new RuntimeException("IOException while reading chunk.", ioEx);
-        }
-        try {
-            reader.reset(); //reset the reader the so the next chunk can begin at the position marked above
-        } catch (IOException ex) {
-            throw new RuntimeException("IOException while resetting chunk reader.", ex);
-        }
-
-        if (endOfReaderReached) {
-            /* if we have reached the end of the content,we won't make another
-             * overlapping chunk, so the base chunk can be extended to the end. */
-            baseChunkSizeChars = currentChunk.length();
-        }
-        //sanitize the text and return a Chunk object, that includes the base chunk length.
-        return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
-    }
-
-    /**
-     * Read the base chunk from the reader, and attempt to break at whitespace.
-     *
-     * @throws IOException if there is a problem reading from the reader.
-     */
-    private void readBaseChunk() throws IOException {
-        //read the chunk until the minimum base chunk size
-        readHelper(MINIMUM_BASE_CHUNK_SIZE, false);
-        //keep reading until the maximum base chunk size or white space is reached.
-        whitespaceFound = false;
-        readHelper(MAXIMUM_BASE_CHUNK_SIZE, true);
-
-    }
-
-    /**
-     * Read the window from the reader, and attempt to break at whitespace.
-     *
-     * @throws IOException if there is a problem reading from the reader.
-     */
-    private void readWindow() throws IOException {
-        //read the window, leaving some room to look for white space to break at.
-        int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, chunkSizeBytes + 1024);
-        readHelper(windowEnd, false);
-        whitespaceFound = false;
-        //keep reading until the max chunk size, or until whitespace is reached.
-        windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE, chunkSizeBytes + 1024);
-        readHelper(windowEnd, true);
-    }
-
-    /** Helper method that implements reading in a loop.
-     *
-     * @param maxBytes           The max cummulative length of the content,in
-     *                           bytes, to read from the Reader. That is, when
-     *                           chunkSizeBytes >= maxBytes stop reading.
-     * @param inWhiteSpaceBuffer Should the current read stop once whitespace is
-     *                           found?
-     *
-     * @throws IOException If there is a problem reading from the Reader.
-     */
-    private void readHelper(int maxBytes, boolean inWhiteSpaceBuffer) throws IOException {
-        //only read one character at a time if we are looking for whitespace.
-        final int readSize = inWhiteSpaceBuffer ? 1 : READ_CHARS_BUFFER_SIZE;
-
-        //read chars up to maxBytes, whitespaceFound if also inWhiteSpaceBuffer, or we reach the end of the reader.
-        while ((chunkSizeBytes < maxBytes)
-                && (false == (inWhiteSpaceBuffer && whitespaceFound))
-                && (endOfReaderReached == false)) {
-            charsRead = reader.read(tempChunkBuf, 0, readSize);
-            if (-1 == charsRead) {
-                //this is the last chunk
-                endOfReaderReached = true;
-            } else {
-                if (inWhiteSpaceBuffer) {
-                    //chec for whitespace.
-                    whitespaceFound = Character.isWhitespace(tempChunkBuf[0]);
-                }
-
-                //add read chars to the chunk and update the length.
-                String chunkSegment = new String(tempChunkBuf, 0, charsRead);
-                chunkSizeBytes += Utf8.encodedLength(chunkSegment);
-                currentChunk.append(chunkSegment);
-            }
-        }
-    }
-}
-
-/**
- * Represents one chunk as the text in it and the length of the base chunk, in
- * chars.
- */
-class Chunk {
-
-    private final StringBuilder sb;
-    private final int chunksize;
-
-    Chunk(StringBuilder sb, int baseChunkLength) {
-        this.sb = sb;
-        this.chunksize = baseChunkLength;
-    }
-
-    @Override
-    public String toString() {
-        return sb.toString();
-    }
-
-    int getBaseChunkLength() {
-        return chunksize;
-    }
-}
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
@ -108,7 +108,7 @@ class StringsTextExtractor extends FileTextExtractor {
    }

    @Override
-    public InputStreamReader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
+    public InputStreamReader getReader(AbstractFile sourceFile) throws TextExtractorException {
        InputStream stringStream = getInputStream(sourceFile);
        return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
    }
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java
@ -30,7 +30,6 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 */
 interface TextExtractor< TextSource extends SleuthkitVisitableItem> {

-
    /**
     * Is this extractor configured such that no extraction will/should be done?
     *
@ -46,7 +45,6 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
     */
    abstract void logWarning(String msg, Exception ex);

-
    /**
     * Get a reader that over the text extracted from the given source.
     *
@ -57,7 +55,7 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
-    abstract Reader getReader(TextSource source) throws Ingester.IngesterException;
+    abstract Reader getReader(TextSource source) throws TextExtractorException;

    /**
     * Get the 'object' id of the given source.
@ -76,4 +74,15 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
     * @return
     */
    abstract String getName(TextSource source);
+
+    /**
+     * The exception declared by getReader() for failures to extract text
+     * from a source.
+     */
+    class TextExtractorException extends Exception {
+
+        private static final long serialVersionUID = 1L;
+
+        /**
+         * @param message A description of the problem.
+         */
+        public TextExtractorException(String message) {
+            super(message);
+        }
+
+        /**
+         * @param message A description of the problem.
+         * @param cause   The underlying cause of the problem.
+         */
+        public TextExtractorException(String message, Throwable cause) {
+            super(message, cause);
+        }
+    }
 }
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
@ -36,7 +36,6 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.openide.util.NbBundle;
 import org.sleuthkit.autopsy.coreutils.Logger;
-import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;

@ -62,7 +61,7 @@ class TikaTextExtractor extends FileTextExtractor {
    }

    @Override
-    public Reader getReader(AbstractFile sourceFile) throws IngesterException, MissingResourceException {
+    public Reader getReader(AbstractFile sourceFile) throws TextExtractorException, MissingResourceException {
        ReadContentInputStream stream = new ReadContentInputStream(sourceFile);

        Metadata metadata = new Metadata();
@ -76,12 +75,12 @@ class TikaTextExtractor extends FileTextExtractor {
        } catch (TimeoutException te) {
            final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
            logWarning(msg, te);
-            throw new IngesterException(msg);
+            throw new TextExtractorException(msg, te);
        } catch (Exception ex) {
            KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex.getCause()); //NON-NLS
            final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName());
            logWarning(msg, ex);
-            throw new IngesterException(msg, ex);
+            throw new TextExtractorException(msg, ex);
        }
    }