Merge branch 'develop' of https://github.com/sleuthkit/autopsy into 2197-ProfileOptionsPanel

2025-07-16 17:57:43 +00:00 · 2017-01-23 16:48:27 -05:00 · 2017-01-23 16:48:27 -05:00 · c1888c88c3
commit c1888c88c3
parent 96fb0438c7 5f13b2fe9f
8 changed files with 357 additions and 250 deletions
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
@ -39,6 +39,7 @@ import org.sleuthkit.datamodel.TskCoreException;
 * artifact's attributes.
 */
 class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
    static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
    /**
@ -91,17 +92,22 @@ class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
        logger.log(Level.WARNING, msg, ex); //NON-NLS  }
    }
-    private InputStream getInputStream(BlackboardArtifact artifact) {
+    private InputStream getInputStream(BlackboardArtifact artifact) throws TextExtractorException {
        // Concatenate the string values of all attributes into a single
        // "content" string to be indexed.
        StringBuilder artifactContents = new StringBuilder();
        Content dataSource = null;
        try {
-            Content dataSource = getDataSource(artifact);
+            dataSource = getDataSource(artifact);
        } catch (TskCoreException tskCoreException) {
            throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
        }
        if (dataSource == null) {
-                return null;
+            throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString());
        }
        try {
            for (BlackboardAttribute attribute : artifact.getAttributes()) {
                artifactContents.append(attribute.getAttributeType().getDisplayName());
                artifactContents.append(" : ");
@ -119,18 +125,15 @@ class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
                }
                artifactContents.append(System.lineSeparator());
            }
-        } catch (TskCoreException ex) {
+        } catch (TskCoreException tskCoreException) {
-            logger.log(Level.SEVERE, "There was a problem getting the atributes for artifact " + artifact.getArtifactID(), ex);
+            throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
            return null;
        }
        if (artifactContents.length() == 0) {
            return null;
        }
        return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
    }
    @Override
-    public Reader getReader(BlackboardArtifact source) throws Ingester.IngesterException {
+    public Reader getReader(BlackboardArtifact source) throws TextExtractorException {
        return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
    }
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java
@ -0,0 +1,310 @@
 /*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.sleuthkit.autopsy.keywordsearch;
 import java.io.IOException;
 import java.io.PushbackReader;
 import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
 import java.util.NoSuchElementException;
 import javax.annotation.concurrent.NotThreadSafe;
 import org.sleuthkit.autopsy.coreutils.TextUtil;
 import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
 /**
 * Encapsulates the content chunking algorithm in an implementation of the
 * Iterator interface. Also implements Iterable so it can be used directly in a
 * for loop. The base chunk is the part of the chunk before the overlapping
 * window. The window will be included at the end of the current chunk as well
 * as at the beginning of the next chunk.
 */
@NotThreadSafe
 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
    //Chunking algorithm parameters-------------------------------------//
    /** the maximum size of a chunk, including the window. */
    private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes
    /** the minimum to read before we start the process of looking for
     * whitespace to break at and creating an overlapping window. */
    private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
    /** The maximum size of the chunk, before the overlapping window, even if we
     * couldn't find whitespace to break at. */
    private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
    /** The amount of text we will read through before we give up on finding
     * whitespace to break the chunk/window at. */
    private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
    /** The number of characters to read in one go from the Reader. */
    private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
    ////chunker state--------------------------------------------///
    /** The Reader that this chunker reads from, and divides into chunks. It is
     * a PushbackReader so that text that has been read can be unread. */
    private final PushbackReader reader;
    /** The local buffer of characters read from the Reader. */
    private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
    /** the size in bytes of the chunk (so far). */
    private int chunkSizeBytes = 0;
    /** Has the chunker reached the end of the Reader? If so, there are no more
     * chunks, and the current chunk does not need a window. */
    private boolean endOfReaderReached = false;
    /** Store any exception encountered reading from the Reader. */
    private Exception ex;
    /**
     * Create a Chunker that will chunk the content of the given Reader.
     *
     * @param reader The content to chunk.
     */
    Chunker(Reader reader) {
        //Using MAX_TOTAL_CHUNK_SIZE is safe but probably overkill.
        this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
    }
    /**
     * Returns this Chunker itself as the Iterator, so every call hands back
     * the same iterator and the same reading position.
     *
     * @return This Chunker.
     */
    @Override
    public Iterator<Chunk> iterator() {
        return this;
    }
    /**
     * Has this Chunker encountered an exception reading from the Reader.
     *
     * @return True if an exception has been stored, false otherwise.
     */
    boolean hasException() {
        return ex != null;
    }
    /**
     * Get the exception encountered reading from the Reader. Once an exception
     * has been stored, hasNext() returns false.
     *
     * @return The exception, or null if no exception was encountered.
     */
    public Exception getException() {
        return ex;
    }
    /**
     * Reports whether another chunk can be produced: true only while no
     * exception has been stored and the end of the Reader has not been
     * reached.
     *
     * @return True if next() may be called, false otherwise.
     */
    @Override
    public boolean hasNext() {
        final boolean failed = (ex != null);
        return !failed && !endOfReaderReached;
    }
    /**
     * Replaces, in place, every char of the given StringBuilder that
     * TextUtil.isValidSolrUTF8 rejects with a caret '^'.
     *
     * @param sb the StringBuilder to sanitize
     *
     * @return the same StringBuilder instance that was passed in
     *
     * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
     * function?
     */
    private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
        //the replacement is always one char for one char, so the length is stable.
        final int end = sb.length();
        int index = 0;
        while (index < end) {
            final boolean valid = TextUtil.isValidSolrUTF8(sb.charAt(index));
            if (!valid) {
                sb.setCharAt(index, '^');
            }
            index++;
        }
        return sb;
    }
    /**
     * Reads and returns the next chunk: a base chunk followed by the
     * overlapping window. Unless the end of the Reader was reached, the window
     * is pushed back onto the reader so that it is read again at the start of
     * the following chunk.
     *
     * If reading fails, the exception is stored (see getException()) rather
     * than thrown, and whatever text was read so far is returned.
     *
     * @return The next Chunk, holding the sanitized text and the length of the
     *         base chunk in chars.
     *
     * @throws NoSuchElementException if hasNext() is false.
     */
    @Override
    public Chunk next() {
        if (hasNext() == false) {
            throw new NoSuchElementException("There are no more chunks.");
        }
        //reset state for the next chunk
        chunkSizeBytes = 0;
        int baseChunkSizeChars = 0;
        StringBuilder currentChunk = new StringBuilder();
        try {
            currentChunk.append(readBaseChunk());
            baseChunkSizeChars = currentChunk.length(); //save the base chunk length
            StringBuilder currentWindow = readWindow();
            /* add the window text to the current chunk BEFORE deciding on the
             * base chunk length, so that the end-of-reader case below really
             * measures the whole chunk, window included. */
            currentChunk.append(currentWindow);
            if (endOfReaderReached) {
                /* if we have reached the end of the content, we won't make
                 * another overlapping chunk, so the length of the base chunk
                 * is extended to the end. */
                baseChunkSizeChars = currentChunk.length();
            } else {
                /* otherwise we will make another chunk, so unread the window */
                reader.unread(currentWindow.toString().toCharArray());
            }
        } catch (Exception ioEx) {
            /* Save the exception, which will cause hasNext() to return false,
             * and break any chunking loop in client code. */
            ex = ioEx;
        }
        //sanitize the text and return a Chunk object, that includes the base chunk length.
        return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
    }
    /**
     * Reads the base chunk from the reader: first a bulk read up to
     * MINIMUM_BASE_CHUNK_SIZE, then a read that stops at whitespace or at
     * MAXIMUM_BASE_CHUNK_SIZE.
     *
     * @return The text of the base chunk.
     *
     * @throws IOException if there is a problem reading from the reader.
     */
    private StringBuilder readBaseChunk() throws IOException {
        final StringBuilder baseChunk = new StringBuilder();
        //bulk read up to the minimum base chunk size
        readHelper(MINIMUM_BASE_CHUNK_SIZE, baseChunk);
        //then look for whitespace to break at, up to the maximum base chunk size
        readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, baseChunk);
        return baseChunk;
    }
    /**
     * Reads the overlapping window from the reader: first a bulk read that
     * leaves WHITE_SPACE_BUFFER_SIZE bytes of room below MAX_TOTAL_CHUNK_SIZE,
     * then a read that stops at whitespace or at MAX_TOTAL_CHUNK_SIZE.
     *
     * @return The text of the window.
     *
     * @throws IOException if there is a problem reading from the reader.
     */
    private StringBuilder readWindow() throws IOException {
        final StringBuilder window = new StringBuilder();
        final int bulkReadLimit = MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE;
        //bulk read, leaving some room to look for white space to break at
        readHelper(bulkReadLimit, window);
        //then look for whitespace to break at, up to the max chunk size
        readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, window);
        return window;
    }
    /**
     * Read in blocks of up to READ_CHARS_BUFFER_SIZE chars until the chunk
     * size reaches maxBytes, a block would not fit below maxBytes (in which
     * case it is unread), or the end of the reader is reached.
     *
     * @param maxBytes       The cumulative chunk size, in bytes, at which to
     *                       stop reading.
     * @param currentSegment The StringBuilder the text read is appended to.
     *
     * @throws IOException If there is a problem reading from the reader.
     */
    private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
        //read chars up to maxBytes, or the end of the reader.
        while ((chunkSizeBytes < maxBytes)
                && (endOfReaderReached == false)) {
            int charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
            if (-1 == charsRead) {
                //this is the last chunk
                endOfReaderReached = true;
                return;
            }
            final char lastChar = tempChunkBuf[charsRead - 1];
            if (Character.isHighSurrogate(lastChar)) {
                if (charsRead > 1) {
                    //the last char might be part of a surrogate pair, so unread
                    //it and let the next read pick up the whole pair.
                    charsRead--;
                    reader.unread(lastChar);
                } else {
                    /* The high surrogate is the only char read. Unreading it
                     * would make no progress, and would loop forever when it is
                     * the last char of the reader, so try to complete the pair
                     * instead. */
                    final int extraRead = reader.read(tempChunkBuf, 1, 1);
                    if (-1 == extraRead) {
                        //this is the last chunk, so include the unpaired surrogate
                        currentSegment.append(lastChar);
                        chunkSizeBytes += String.valueOf(lastChar).getBytes(StandardCharsets.UTF_8).length;
                        endOfReaderReached = true;
                        return;
                    }
                    charsRead += extraRead;
                }
            }
            String chunkSegment = new String(tempChunkBuf, 0, charsRead);
            //get the length in bytes of the read chars
            int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length;
            //if it will not put us past maxBytes
            if (chunkSizeBytes + segmentSize < maxBytes) {
                //add it to the chunk
                currentSegment.append(chunkSegment);
                chunkSizeBytes += segmentSize;
            } else {
                //unread it, and break out of read loop.
                reader.unread(tempChunkBuf, 0, charsRead);
                return;
            }
        }
    }
    /**
     * Read one char (or one surrogate pair) at a time until the chunk size
     * reaches maxBytes, whitespace is read, or the end of the reader is
     * reached. A char or pair that would push the chunk size past maxBytes is
     * unread instead of appended, so maxBytes is not exceeded.
     *
     * @param maxBytes       The cumulative chunk size, in bytes, at which to
     *                       stop reading.
     * @param currentSegment The StringBuilder the text read is appended to.
     *
     * @throws IOException If there is a problem reading from the reader.
     */
    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
        boolean whitespaceFound = false;
        //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
        while ((chunkSizeBytes < maxBytes)
                && (whitespaceFound == false)
                && (endOfReaderReached == false)) {
            if (-1 == reader.read(tempChunkBuf, 0, 1)) {
                //this is the last chunk
                endOfReaderReached = true;
                return;
            }
            final char ch = tempChunkBuf[0];
            int segmentChars = 1;
            //if the char might be part of a surrogate pair, read another char
            if (Character.isHighSurrogate(ch)) {
                if (-1 == reader.read(tempChunkBuf, 1, 1)) {
                    //this is the last chunk, so include the unpaired surrogate
                    currentSegment.append(ch);
                    chunkSizeBytes += String.valueOf(ch).getBytes(StandardCharsets.UTF_8).length;
                    endOfReaderReached = true;
                    return;
                }
                //use the surrogate pair in place of the unpaired surrogate.
                segmentChars = 2;
            }
            final String chunkSegment = new String(tempChunkBuf, 0, segmentChars);
            final int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length;
            if (chunkSizeBytes + segmentSize > maxBytes) {
                /* a multi-byte char can overshoot maxBytes even though the
                 * loop condition held; leave it for the next read instead. */
                reader.unread(tempChunkBuf, 0, segmentChars);
                return;
            }
            //check for whitespace.
            whitespaceFound = Character.isWhitespace(chunkSegment.codePointAt(0));
            //add read chars to the chunk and update the length.
            currentSegment.append(chunkSegment);
            chunkSizeBytes += segmentSize;
        }
    }
    /**
     * A single chunk of text: the StringBuilder holding its text, plus the
     * length, in chars, of the base chunk.
     */
    static class Chunk {
        /** The text of the chunk; held by reference, not copied. */
        private final StringBuilder text;
        /** The length of the base chunk, in chars. */
        private final int baseLength;

        /**
         * @param sb              The text of the chunk.
         * @param baseChunkLength The length of the base chunk, in chars.
         */
        Chunk(StringBuilder sb, int baseChunkLength) {
            text = sb;
            baseLength = baseChunkLength;
        }

        /**
         * @return The text of the chunk as a String.
         */
        @Override
        public String toString() {
            return text.toString();
        }

        /**
         * @return The length of the base chunk, in chars.
         */
        int getBaseChunkLength() {
            return baseLength;
        }
    }
 }
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java
@ -95,7 +95,7 @@ abstract class FileTextExtractor implements TextExtractor< AbstractFile> {
    abstract boolean isSupported(AbstractFile file, String detectedFormat);
    @Override
-    public abstract Reader getReader(AbstractFile source) throws Ingester.IngesterException;
+    public abstract Reader getReader(AbstractFile source) throws TextExtractorException;
    @Override
    public long getID(AbstractFile source) {
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
@ -65,7 +65,7 @@ class HtmlTextExtractor extends FileTextExtractor {
    }
    @Override
-    public Reader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
+    public Reader getReader(AbstractFile sourceFile) throws TextExtractorException {
        ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
        //Parse the stream with Jericho and put the results in a Reader
@ -159,7 +159,7 @@ class HtmlTextExtractor extends FileTextExtractor {
            // All done, now make it a reader
            return new StringReader(stringBuilder.toString());
        } catch (IOException ex) {
-            throw new Ingester.IngesterException("Error extracting HTML from content.", ex);
+            throw new TextExtractorException("Error extracting HTML from content.", ex);
        }
    }
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@ -18,22 +18,17 @@
 */
 package org.sleuthkit.autopsy.keywordsearch;
 import com.google.common.base.Utf8;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.logging.Level;
 import javax.annotation.concurrent.NotThreadSafe;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.common.SolrInputDocument;
 import org.openide.util.NbBundle;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.TextUtil;
 import org.sleuthkit.autopsy.datamodel.ContentUtils;
 import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.DerivedFile;
@ -149,8 +144,8 @@ class Ingester {
        int numChunks = 0; //unknown until chunking is done
        if (extractor.isDisabled()) {
-            /* some Extrctors, notable the strings extractor, have options which
+            /* some Extractors, notably the strings extractor, have options
-             * can be configured such that no extraction should be done */
+             * which can be configured such that no extraction should be done */
            return true;
        }
@ -171,13 +166,12 @@ class Ingester {
                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
                    throw ingEx; //need to rethrow to signal error and move on
                } catch (Exception ex) {
                    throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
                }
            }
-        } catch (IOException ex) {
+            if (chunker.hasException()) {
-            extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
+                extractor.logWarning("Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
                return false;
            }
        } catch (Exception ex) {
            extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
@ -192,7 +186,7 @@ class Ingester {
    }
    /**
-     * Add one chunk as to the Solr index as a seperate sold document.
+     * Add one chunk to the Solr index as a separate Solr document.
     *
     * TODO see if can use a byte or string streaming way to add content to
     * /update handler e.g. with XMLUpdateRequestHandler (deprecated in Solr
@ -232,7 +226,7 @@ class Ingester {
            uncommitedIngests = true;
        } catch (KeywordSearchModuleException ex) {
-            //JMTODO: does this need to ne internationalized?
+            //JMTODO: does this need to be internationalized?
            throw new IngesterException(
                    NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
        }
@ -370,211 +364,3 @@ class Ingester {
        }
    }
 }
 /**
 * Encapsulates the content chunking algorithm in an implementation of the
 * Iterator interface. Also implements Iterable so it can be used directly in a
 * for loop. The base chunk is the part of the chunk before the overlapping
 * window. The window will be included at the end of the current chunk as well
 * as at the beginning of the next chunk.
 */
@NotThreadSafe
 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
    //Chunking algorithm paramaters-------------------------------------//
    /** the maximum size of a chunk, including the window. */
    private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes
    /** the minimum to read before we start the process of looking for
     * whitespace to break at and creating an overlapping window. */
    private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
    /** The maximum size of the chunk, before the overlapping window, even if we
     * couldn't find whitespace to break at. */
    private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
    /** The amount of text we will read through before we give up on finding
     * whitespace to break the chunk/window at. */
    private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
    /** The number of characters to read in one go from the Reader. */
    private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
    ////chunker state--------------------------------------------///
    /** The Reader that this chunk reads from, and divides into chunks. It must
     * be a buffered reader to ensure that mark/reset are supported. */
    private final BufferedReader reader;
    /** The local buffer of characters read from the Reader. */
    private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
    /** number of chars read in the most recent read operation. */
    private int charsRead = 0;
    /** The text of the current chunk (so far). */
    private StringBuilder currentChunk;
    /** the size in bytes of the chunk (so far). */
    private int chunkSizeBytes = 0;
    /** the size in chars of the (base) chunk (so far). */
    private int baseChunkSizeChars;
    /** has the chunker found whitespace to break on? */
    private boolean whitespaceFound = false;
    /** has the chunker reached the end of the Reader? If so, there are no more
     * chunks, and the current chunk does not need a window. */
    private boolean endOfReaderReached = false;
    /**
     * Create a Chunker that will chunk the content of the given Reader.
     *
     * @param reader The content to chunk.
     */
    Chunker(BufferedReader reader) {
        this.reader = reader;
    }
    @Override
    public Iterator<Chunk> iterator() {
        return this;
    }
    @Override
    public boolean hasNext() {
        return endOfReaderReached == false;
    }
    /**
     * Sanitize the given StringBuilder by replacing non-UTF-8 characters with
     * caret '^'
     *
     * @param sb the StringBuilder to sanitize
     *
     * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
     * function?
     */
    private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
        final int length = sb.length();
        for (int i = 0; i < length; i++) {
            if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
                sb.replace(i, i + 1, "^");
            }
        }
        return sb;
    }
    @Override
    public Chunk next() {
        if (endOfReaderReached) {
            throw new NoSuchElementException("There are no more chunks.");
        }
        //reset state for the next chunk
        currentChunk = new StringBuilder();
        chunkSizeBytes = 0;
        baseChunkSizeChars = 0;
        try {
            readBaseChunk();
            baseChunkSizeChars = currentChunk.length();
            reader.mark(2048); //mark the reader so we can rewind the reader here to begin the next chunk
            readWindow();
        } catch (IOException ioEx) {
            throw new RuntimeException("IOException while reading chunk.", ioEx);
        }
        try {
            reader.reset(); //reset the reader the so the next chunk can begin at the position marked above
        } catch (IOException ex) {
            throw new RuntimeException("IOException while resetting chunk reader.", ex);
        }
        if (endOfReaderReached) {
            /* if we have reached the end of the content,we won't make another
             * overlapping chunk, so the base chunk can be extended to the end. */
            baseChunkSizeChars = currentChunk.length();
        }
        //sanitize the text and return a Chunk object, that includes the base chunk length.
        return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
    }
    /**
     * Read the base chunk from the reader, and attempt to break at whitespace.
     *
     * @throws IOException if there is a problem reading from the reader.
     */
    private void readBaseChunk() throws IOException {
        //read the chunk until the minimum base chunk size
        readHelper(MINIMUM_BASE_CHUNK_SIZE, false);
        //keep reading until the maximum base chunk size or white space is reached.
        whitespaceFound = false;
        readHelper(MAXIMUM_BASE_CHUNK_SIZE, true);
    }
    /**
     * Read the window from the reader, and attempt to break at whitespace.
     *
     * @throws IOException if there is a problem reading from the reader.
     */
    private void readWindow() throws IOException {
        //read the window, leaving some room to look for white space to break at.
        int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, chunkSizeBytes + 1024);
        readHelper(windowEnd, false);
        whitespaceFound = false;
        //keep reading until the max chunk size, or until whitespace is reached.
        windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE, chunkSizeBytes + 1024);
        readHelper(windowEnd, true);
    }
    /** Helper method that implements reading in a loop.
     *
     * @param maxBytes           The max cummulative length of the content,in
     *                           bytes, to read from the Reader. That is, when
     *                           chunkSizeBytes >= maxBytes stop reading.
     * @param inWhiteSpaceBuffer Should the current read stop once whitespace is
     *                           found?
     *
     * @throws IOException If there is a problem reading from the Reader.
     */
    private void readHelper(int maxBytes, boolean inWhiteSpaceBuffer) throws IOException {
        //only read one character at a time if we are looking for whitespace.
        final int readSize = inWhiteSpaceBuffer ? 1 : READ_CHARS_BUFFER_SIZE;
        //read chars up to maxBytes, whitespaceFound if also inWhiteSpaceBuffer, or we reach the end of the reader.
        while ((chunkSizeBytes < maxBytes)
                && (false == (inWhiteSpaceBuffer && whitespaceFound))
                && (endOfReaderReached == false)) {
            charsRead = reader.read(tempChunkBuf, 0, readSize);
            if (-1 == charsRead) {
                //this is the last chunk
                endOfReaderReached = true;
            } else {
                if (inWhiteSpaceBuffer) {
                    //chec for whitespace.
                    whitespaceFound = Character.isWhitespace(tempChunkBuf[0]);
                }
                //add read chars to the chunk and update the length.
                String chunkSegment = new String(tempChunkBuf, 0, charsRead);
                chunkSizeBytes += Utf8.encodedLength(chunkSegment);
                currentChunk.append(chunkSegment);
            }
        }
    }
 }
 /**
 * Represents one chunk as the text in it and the length of the base chunk, in
 * chars.
 */
 class Chunk {
    private final StringBuilder sb;
    private final int chunksize;
    Chunk(StringBuilder sb, int baseChunkLength) {
        this.sb = sb;
        this.chunksize = baseChunkLength;
    }
    @Override
    public String toString() {
        return sb.toString();
    }
    int getBaseChunkLength() {
        return chunksize;
    }
 }
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
@ -107,7 +107,7 @@ class StringsTextExtractor extends FileTextExtractor {
    }
    @Override
-    public InputStreamReader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
+    public InputStreamReader getReader(AbstractFile sourceFile) throws TextExtractorException {
        InputStream stringStream = getInputStream(sourceFile);
        return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
    }
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java
@ -30,7 +30,6 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 */
 interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
    /**
     * Is this extractor configured such that no extraction will/should be done?
     *
@ -46,7 +45,6 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
     */
    abstract void logWarning(String msg, Exception ex);
    /**
     * Get a reader over the text extracted from the given source.
     *
@ -57,7 +55,7 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.TextExtractor.TextExtractorException
     */
-    abstract Reader getReader(TextSource source) throws Ingester.IngesterException;
+    abstract Reader getReader(TextSource source) throws TextExtractorException;
    /**
     * Get the 'object' id of the given source.
@ -76,4 +74,15 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
     * @return
     */
    abstract String getName(TextSource source);
    /**
     * Checked exception declared by getReader(TextSource) to signal that text
     * could not be extracted from a source.
     */
    class TextExtractorException extends Exception {

        /** Explicit version id, since Exception is Serializable. */
        private static final long serialVersionUID = 1L;

        /**
         * @param message A description of what went wrong.
         */
        public TextExtractorException(String message) {
            super(message);
        }

        /**
         * @param message A description of what went wrong.
         * @param cause   The underlying exception, kept as the cause.
         */
        public TextExtractorException(String message, Throwable cause) {
            super(message, cause);
        }
    }
 }
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
@ -36,7 +36,6 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.openide.util.NbBundle;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
@ -67,7 +66,7 @@ class TikaTextExtractor extends FileTextExtractor {
    }
    @Override
-    public Reader getReader(AbstractFile sourceFile) throws IngesterException, MissingResourceException {
+    public Reader getReader(AbstractFile sourceFile) throws TextExtractorException, MissingResourceException {
        ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
        Metadata metadata = new Metadata();
@ -81,12 +80,12 @@ class TikaTextExtractor extends FileTextExtractor {
        } catch (TimeoutException te) {
            final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
            logWarning(msg, te);
-            throw new IngesterException(msg);
+            throw new TextExtractorException(msg, te);
        } catch (Exception ex) {
            KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex.getCause()); //NON-NLS
            final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName());
            logWarning(msg, ex);
-            throw new IngesterException(msg, ex);
+            throw new TextExtractorException(msg, ex);
        }
    }