Commit 35bf21eefe
Merge branch 'develop' of https://github.com/sleuthkit/autopsy into 2197-ProfileOptionsPanel
@@ -19,6 +19,8 @@
package org.sleuthkit.autopsy.datamodel;

import java.awt.event.ActionEvent;
import java.beans.PropertyChangeEvent;
import java.beans.PropertyChangeListener;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
@@ -27,6 +29,7 @@ import java.util.List;
import java.util.logging.Level;
import javax.swing.AbstractAction;
import javax.swing.Action;
import org.openide.nodes.Children;
import org.openide.nodes.Sheet;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
@@ -35,11 +38,14 @@ import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.directorytree.ExplorerNodeActionVisitor;
import org.sleuthkit.autopsy.directorytree.FileSearchAction;
import org.sleuthkit.autopsy.directorytree.NewWindowViewAction;
import org.sleuthkit.autopsy.ingest.IngestManager;
import org.sleuthkit.autopsy.ingest.ModuleContentEvent;
import org.sleuthkit.autopsy.ingest.RunIngestModulesDialog;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.Image;
import org.sleuthkit.datamodel.SleuthkitCase.CaseDbQuery;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.VirtualDirectory;

/**
 * This class is used to represent the "Node" for the image. The children of
@@ -71,6 +77,16 @@ public class ImageNode extends AbstractContentNode<Image> {
        String imgName = nameForImage(img);
        this.setDisplayName(imgName);
        this.setIconBaseWithExtension("org/sleuthkit/autopsy/images/hard-drive-icon.jpg"); //NON-NLS

        // Listen for ingest events so that we can detect new added files (e.g. carved)
        IngestManager.getInstance().addIngestModuleEventListener(pcl);
        // Listen for case events so that we can detect when case is closed
        Case.addPropertyChangeListener(pcl);
    }

    private void removeListeners() {
        IngestManager.getInstance().removeIngestModuleEventListener(pcl);
        Case.removePropertyChangeListener(pcl);
    }

    /**
@@ -199,4 +215,46 @@ public class ImageNode extends AbstractContentNode<Image> {
    public String getItemType() {
        return getClass().getName();
    }

    private final PropertyChangeListener pcl = (PropertyChangeEvent evt) -> {
        String eventType = evt.getPropertyName();

        // See if the new file is a child of ours
        if (eventType.equals(IngestManager.IngestModuleEvent.CONTENT_CHANGED.toString())) {
            if ((evt.getOldValue() instanceof ModuleContentEvent) == false) {
                return;
            }
            ModuleContentEvent moduleContentEvent = (ModuleContentEvent) evt.getOldValue();
            if ((moduleContentEvent.getSource() instanceof Content) == false) {
                return;
            }
            Content newContent = (Content) moduleContentEvent.getSource();

            try {
                Content parent = newContent.getParent();
                if (parent != null) {
                    // Is this a new carved file?
                    if (parent.getName().equals(VirtualDirectory.NAME_CARVED)) {
                        // Was this new carved file produced from this image?
                        if (parent.getParent().getId() == getContent().getId()) {
                            Children children = getChildren();
                            if (children != null) {
                                ((ContentChildren) children).refreshChildren();
                                children.getNodesCount();
                            }
                        }
                    }
                }
            } catch (TskCoreException ex) {
                // Do nothing.
            }
        } else if (eventType.equals(Case.Events.CURRENT_CASE.toString())) {
            if (evt.getNewValue() == null) {
                // case was closed. Remove listeners so that we don't get called with a stale case handle
                removeListeners();
            }
        }
    };

}
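For readers unfamiliar with the listener wiring added to ImageNode above, the following is a minimal standalone sketch of the same pattern using only java.beans. The class name ListenerDemo and the string property names are hypothetical stand-ins, not the Autopsy IngestManager/Case API: one PropertyChangeListener is registered, dispatches on the property name, and unregisters itself when it sees the "case closed" style event.

import java.beans.PropertyChangeEvent;
import java.beans.PropertyChangeListener;
import java.beans.PropertyChangeSupport;

// Standalone illustration of the pattern the node uses: one listener, registered with an
// event source, that switches on the property name and removes itself on a "closed" event.
public class ListenerDemo {
    public static void main(String[] args) {
        PropertyChangeSupport events = new PropertyChangeSupport(new Object());

        PropertyChangeListener pcl = new PropertyChangeListener() {
            @Override
            public void propertyChange(PropertyChangeEvent evt) {
                if ("CONTENT_CHANGED".equals(evt.getPropertyName())) {
                    System.out.println("refresh children for: " + evt.getNewValue());
                } else if ("CURRENT_CASE".equals(evt.getPropertyName()) && evt.getNewValue() == null) {
                    // the "case" closed: drop the listener so no stale handle is used later
                    events.removePropertyChangeListener(this);
                    System.out.println("listener removed");
                }
            }
        };

        events.addPropertyChangeListener(pcl);
        events.firePropertyChange("CONTENT_CHANGED", null, "carved-file-1");
        events.firePropertyChange("CURRENT_CASE", "case-1", null);
        events.firePropertyChange("CONTENT_CHANGED", null, "ignored"); // no output: listener is gone
    }
}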
@@ -82,7 +82,13 @@ ConfirmationDialog.Exit=Exit
ConfirmationDialog.DoNotExit=Do Not Exit
ConfirmationDialog.ConfirmExit=All incomplete copy jobs will be cancelled. Are you sure?
ConfirmationDialog.ConfirmExitHeader=Confirm Exit
OpenIDE-Module-Long-Description=\
    This module contains features that are being developed by Basis Technology and are not part of the default Autopsy distribution. \
    You can enable this module to use the new features. \
    The features should be stable, but their exact behavior and API are subject to change. \n\n\
    We make no guarantee that the API of this module will not change, so developers should be careful when relying on it.
OpenIDE-Module-Name=Experimental
OpenIDE-Module-Short-Description=This module contains features that are being developed by Basis Technology and are not part of the default Autopsy distribution.
ReviewModeCasePanel.bnRefresh.text=&Refresh
ReviewModeCasePanel.bnOpen.text=&Open
ReviewModeCasePanel.rbGroupLabel.text=Show Last 10:
@@ -526,6 +526,7 @@
    <!-- file chunk-specific fields (optional for others) -->
    <!-- for a parent file with no content, number of chunks are specified -->
    <field name="num_chunks" type="int" indexed="true" stored="true" required="false" />
    <field name="chunk_size" type="int" indexed="true" stored="true" required="false" />

    <!-- Common metadata fields, named specifically to match up with
         SolrCell metadata when parsing rich documents such as Word, PDF.
@@ -38,7 +38,7 @@ import org.sleuthkit.datamodel.TskCoreException;
 * Extracts text from artifacts by concatenating the values of all of the
 * artifact's attributes.
 */
public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
    static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());

    /**
@@ -82,13 +82,16 @@ public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
    }

    @Override
    boolean isDisabled() {
    public boolean isDisabled() {
        return false;
    }

    @Override
    public void logWarning(final String msg, Exception ex) {
        logger.log(Level.WARNING, msg, ex); //NON-NLS }
    }


    @Override
    InputStream getInputStream(BlackboardArtifact artifact) {
    private InputStream getInputStream(BlackboardArtifact artifact) {
        // Concatenate the string values of all attributes into a single
        // "content" string to be indexed.
        StringBuilder artifactContents = new StringBuilder();
@@ -127,17 +130,17 @@ public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
    }

    @Override
    Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
        return new InputStreamReader(stream, StandardCharsets.UTF_8);
    public Reader getReader(BlackboardArtifact source) throws Ingester.IngesterException {
        return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
    }

    @Override
    long getID(BlackboardArtifact source) {
    public long getID(BlackboardArtifact source) {
        return source.getArtifactID();
    }

    @Override
    String getName(BlackboardArtifact source) {
    public String getName(BlackboardArtifact source) {
        return source.getDisplayName() + "_" + source.getArtifactID();
    }
}
@@ -18,7 +18,6 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.InputStream;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;
@@ -28,7 +27,7 @@ import org.sleuthkit.datamodel.AbstractFile;
 * Common methods for utilities that extract text and content and divide into
 * chunks
 */
abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
abstract class FileTextExtractor implements TextExtractor< AbstractFile> {


    static final List<String> BLOB_MIME_TYPES
@@ -96,17 +95,16 @@ abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
    abstract boolean isSupported(AbstractFile file, String detectedFormat);

    @Override
    abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;
    public abstract Reader getReader(AbstractFile source) throws Ingester.IngesterException;

    @Override
    long getID(AbstractFile source) {
    public long getID(AbstractFile source) {
        return source.getId();
    }


    @Override
    String getName(AbstractFile source) {
    public String getName(AbstractFile source) {
        return source.getName();
    }

}
@@ -19,26 +19,28 @@
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;

/**
 * Extractor of text from HTML supported AbstractFile content. Extracted text
 * will be * divided into chunks and indexed with Solr. If HTML extraction succeeds,
 * chunks are indexed with Solr.
 * will be * divided into chunks and indexed with Solr. If HTML extraction
 * succeeds, chunks are indexed with Solr.
 */
class HtmlTextExtractor extends FileTextExtractor {

    static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
    private static final int MAX_SIZE = 50_000_000; //50MB

    static final List<String> WEB_MIME_TYPES = Arrays.asList(
@@ -63,7 +65,9 @@ class HtmlTextExtractor extends FileTextExtractor {
    }

    @Override
    Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
    public Reader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
        ReadContentInputStream stream = new ReadContentInputStream(sourceFile);

        //Parse the stream with Jericho and put the results in a Reader
        try {
            StringBuilder scripts = new StringBuilder();
@@ -77,7 +81,7 @@ class HtmlTextExtractor extends FileTextExtractor {
            int numComments = 0;
            int numOthers = 0;

            Source source = new Source(in);
            Source source = new Source(stream);
            source.fullSequentialParse();
            Renderer renderer = source.getRenderer();
            renderer.setNewLine("\n");
@@ -160,12 +164,11 @@ class HtmlTextExtractor extends FileTextExtractor {
    }

    @Override
    InputStream getInputStream(AbstractFile sourceFile1) {
        return new ReadContentInputStream(sourceFile1);
    }

    @Override
    boolean isDisabled() {
    public boolean isDisabled() {
        return false;
    }

    public void logWarning(final String msg, Exception ex) {
        logger.log(Level.WARNING, msg, ex); //NON-NLS }
    }
}
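The Jericho calls relied on in getReader above (Source, fullSequentialParse, getRenderer, setNewLine) can be exercised outside Autopsy. Below is a small sketch assuming only the jericho-html library, with an in-memory HTML string standing in for the ReadContentInputStream over an AbstractFile; JerichoDemo is a hypothetical class name.

import java.io.StringReader;
import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;

// Standalone sketch of the Jericho parse-and-render steps used by the extractor.
public class JerichoDemo {
    public static void main(String[] args) throws Exception {
        String html = "<html><body><h1>Title</h1><p>Hello <b>world</b></p></body></html>";
        Source source = new Source(new StringReader(html));
        source.fullSequentialParse();            // parse the whole document up front
        Renderer renderer = source.getRenderer();
        renderer.setNewLine("\n");               // normalize line endings, as in the diff
        System.out.println(renderer.toString()); // plain-text rendering of the HTML
    }
}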
@@ -18,12 +18,15 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

import com.google.common.base.Utf8;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.logging.Level;
import javax.annotation.concurrent.NotThreadSafe;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrInputDocument;
import org.openide.util.NbBundle;
@@ -55,9 +58,7 @@ class Ingester {
    private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
    private static Ingester instance;

    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
    private static final int SINGLE_READ_CHARS = 1024;
    private static final int EXTRA_CHARS = 128;
    private static final int SINGLE_READ_CHARS = 512;

    private Ingester() {
    }
@@ -121,6 +122,136 @@ class Ingester {
        return item.accept(SOLR_FIELDS_VISITOR);
    }

    /**
     * Use the given TextExtractor to extract text from the given source. The
     * text will be chunked and each chunk passed to Solr to add to the index.
     *
     *
     * @param <A>       The type of the Appendix provider that provides
     *                  additional text to append to the final chunk.
     * @param <T>       A subclass of SleuthkitVisitableItem.
     * @param extractor The TextExtractor that will be used to extract text from
     *                  the given source.
     * @param source    The source from which text will be extracted, chunked,
     *                  and indexed.
     * @param context   The ingest job context that can be used to cancel this
     *                  process.
     *
     * @return True if this method executed normally, or False if there was an
     *         unexpected exception. //JMTODO: This policy needs to be reviewed.
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    < T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
        final long sourceID = extractor.getID(source);
        final String sourceName = extractor.getName(source);

        int numChunks = 0; //unknown until chunking is done

        if (extractor.isDisabled()) {
            /* some Extractors, notably the strings extractor, have options which
             * can be configured such that no extraction should be done */
            return true;
        }

        Map<String, String> fields = getContentFields(source);
        //Get a reader for the content of the given source
        try (BufferedReader reader = new BufferedReader(extractor.getReader(source));) {
            Chunker chunker = new Chunker(reader);
            for (Chunk chunk : chunker) {
                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                fields.put(Server.Schema.ID.toString(), chunkId);
                fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
                try {
                    //add the chunk text to Solr index
                    indexChunk(chunk.toString(), sourceName, fields);
                    numChunks++;
                } catch (Ingester.IngesterException ingEx) {
                    extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS

                    throw ingEx; //need to rethrow to signal error and move on
                } catch (Exception ex) {
                    throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
                }
            }
        } catch (IOException ex) {
            extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } catch (Exception ex) {
            extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } finally {
            //after all chunks, index just the meta data, including the numChunks, of the parent file
            fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
            fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
            indexChunk(null, sourceName, fields);
        }

        return true;
    }

    /**
     * Add one chunk to the Solr index as a separate Solr document.
     *
     * TODO see if can use a byte or string streaming way to add content to
     * /update handler e.g. with XMLUpdateRequestHandler (deprecated in Solr
     * 4.0.0), see if possible to stream with UpdateRequestHandler
     *
     * @param chunk The chunk content as a string
     * @param fields
     * @param size
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
        if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
            //JMTODO: actually if we couldn't get the image id it is set to -1,
            // but does this really mean we don't want to index it?

            //skip the file, image id unknown
            //JMTODO: does this need to be internationalized?
            String msg = NbBundle.getMessage(Ingester.class,
                    "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to be internationalized?
            logger.log(Level.SEVERE, msg);
            throw new IngesterException(msg);
        }

        //Make a SolrInputDocument out of the field map
        SolrInputDocument updateDoc = new SolrInputDocument();
        for (String key : fields.keySet()) {
            updateDoc.addField(key, fields.get(key));
        }
        //add the content to the SolrInputDocument
        //JMTODO: can we just add it to the field map before passing that in?
        updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);

        try {
            //TODO: consider timeout thread, or vary socket timeout based on size of indexed content
            solrServer.addDocument(updateDoc);
            uncommitedIngests = true;

        } catch (KeywordSearchModuleException ex) {
            //JMTODO: does this need to be internationalized?
            throw new IngesterException(
                    NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
        }
    }

    /**
     * Tells Solr to commit (necessary before ingested files will appear in
     * searches)
     */
    void commit() {
        try {
            solrServer.commit();
            uncommitedIngests = false;
        } catch (NoOpenCoreException | SolrServerException ex) {
            logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS

        }
    }

    /**
     * Visitor used to create fields to send to SOLR index.
     */
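A minimal sketch of the field-map-to-document step that indexChunk performs, assuming only SolrJ on the classpath. The num_chunks and chunk_size field names come from the schema hunk earlier in this diff; the "id" value, the "content" field name, and the ChunkDocDemo class name are illustrative placeholders, and Autopsy's own Server/solrServer wrapper is not shown.

import java.util.HashMap;
import java.util.Map;
import org.apache.solr.common.SolrInputDocument;

// Copy a metadata field map into a SolrInputDocument and attach the chunk text,
// mirroring the shape of indexChunk above.
public class ChunkDocDemo {
    public static void main(String[] args) {
        Map<String, String> fields = new HashMap<>();
        fields.put("id", "12345_1");      // illustrative; the real value comes from Server.getChunkIdString
        fields.put("num_chunks", "1");
        fields.put("chunk_size", "27");

        SolrInputDocument updateDoc = new SolrInputDocument();
        for (Map.Entry<String, String> entry : fields.entrySet()) {
            updateDoc.addField(entry.getKey(), entry.getValue());
        }
        updateDoc.addField("content", "the extracted chunk text..."); // field name illustrative

        // In the diff this document is handed to Autopsy's Solr wrapper
        // (solrServer.addDocument(updateDoc)) and committed later.
        System.out.println(updateDoc);
    }
}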
@@ -222,192 +353,6 @@ class Ingester {
        }
    }

    /**
     * Use the given TextExtractor to extract text from the given source. The
     * text will be chunked and each chunk passed to Solr to add to the index.
     *
     *
     * @param <A>       The type of the Appendix provider that provides
     *                  additional text to append to the final chunk.
     * @param <T>       A subclass of SleuthkitVisitableItem.
     * @param extractor The TextExtractor that will be used to extract text from
     *                  the given source.
     * @param source    The source from which text will be extracted, chunked,
     *                  and indexed.
     * @param context   The ingest job context that can be used to cancel this
     *                  process.
     *
     * @return True if this method executed normally, or False if there was an
     *         unexpected exception. //JMTODO: This policy needs to be reviewed.
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    < T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
        final long sourceID = extractor.getID(source);
        final String sourceName = extractor.getName(source);

        int numChunks = 0; //unknown until chunking is done

        if (extractor.isDisabled()) {
            /* some Extractors, notably the strings extractor, have options which
             * can be configured such that no extraction should be done */
            return true;
        }

        Map<String, String> fields = getContentFields(source);
        //Get a stream and a reader for that stream
        try (final InputStream stream = extractor.getInputStream(source);
                Reader reader = extractor.getReader(stream, source);) {

            //we read max 1024 chars at time, this seems to max what some Readers would return
            char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];

            boolean eof = false; //have we read until the end of the file yet
            while (!eof) {
                int chunkSizeInChars = 0; // the size in chars of the chunk (so far)
                if (context != null && context.fileIngestIsCancelled()) {
                    return true;
                }
                long charsRead = 0; // number of chars read in the most recent read operation
                //consume bytes to fill entire chunk (but leave EXTRA_CHARS to end the word)
                while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
                        && (charsRead = reader.read(textChunkBuf, chunkSizeInChars, SINGLE_READ_CHARS)) != -1) {
                    chunkSizeInChars += charsRead;
                }

                if (charsRead == -1) {
                    //this is the last chunk
                    eof = true;
                } else {
                    chunkSizeInChars += charsRead;

                    //if we haven't reached the end of the file,
                    //try to read char-by-char until whitespace to not break words
                    while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - 1)
                            && (Character.isWhitespace(textChunkBuf[chunkSizeInChars - 1]) == false)
                            && (charsRead = reader.read(textChunkBuf, chunkSizeInChars, 1)) != -1) {
                        chunkSizeInChars += charsRead;
                    }
                    if (charsRead == -1) {
                        //this is the last chunk
                        eof = true;
                    }
                }

                StringBuilder sb = new StringBuilder(chunkSizeInChars)
                        .append(textChunkBuf, 0, chunkSizeInChars);

                sanitizeToUTF8(sb); //replace non UTF8 chars with '^'

                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                fields.put(Server.Schema.ID.toString(), chunkId);
                try {
                    //pass the chunk to method that adds it to Solr index
                    indexChunk(sb.toString(), sourceName, fields);
                    numChunks++;
                } catch (Ingester.IngesterException ingEx) {
                    extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS

                    throw ingEx; //need to rethrow to signal error and move on
                } catch (Exception ex) {
                    throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
                }
            }
        } catch (IOException ex) {
            extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } catch (Exception ex) {
            extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } finally {
            //after all chunks, index just the meta data, including the numChunks, of the parent file
            fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
            fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
            indexChunk(null, sourceName, fields);
        }
        return true;
    }

    /**
     * Sanitize the given StringBuilder by replacing non-UTF-8 characters with
     * caret '^'
     *
     * @param sb the StringBuilder to sanitize
     *
     * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
     * function?
     */
    private static void sanitizeToUTF8(StringBuilder sb) {
        final int length = sb.length();

        // Sanitize by replacing non-UTF-8 characters with caret '^'
        for (int i = 0; i < length; i++) {
            if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
                sb.replace(i, i + 1, "^");
            }
        }
    }

    /**
     * Add one chunk to the Solr index as a separate Solr document.
     *
     * TODO see if can use a byte or string streaming way to add content to
     * /update handler e.g. with XMLUpdateRequestHandler (deprecated in Solr
     * 4.0.0), see if possible to stream with UpdateRequestHandler
     *
     * @param chunk The chunk content as a string
     * @param fields
     * @param size
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
        if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
            //JMTODO: actually if we couldn't get the image id it is set to -1,
            // but does this really mean we don't want to index it?

            //skip the file, image id unknown
            //JMTODO: does this need to be internationalized?
            String msg = NbBundle.getMessage(Ingester.class,
                    "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to be internationalized?
            logger.log(Level.SEVERE, msg);
            throw new IngesterException(msg);
        }

        //Make a SolrInputDocument out of the field map
        SolrInputDocument updateDoc = new SolrInputDocument();
        for (String key : fields.keySet()) {
            updateDoc.addField(key, fields.get(key));
        }
        //add the content to the SolrInputDocument
        //JMTODO: can we just add it to the field map before passing that in?
        updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);

        try {
            //TODO: consider timeout thread, or vary socket timeout based on size of indexed content
            solrServer.addDocument(updateDoc);
            uncommitedIngests = true;
        } catch (KeywordSearchModuleException ex) {
            //JMTODO: does this need to be internationalized?
            throw new IngesterException(
                    NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
        }
    }

    /**
     * Tells Solr to commit (necessary before ingested files will appear in
     * searches)
     */
    void commit() {
        try {
            solrServer.commit();
            uncommitedIngests = false;
        } catch (NoOpenCoreException | SolrServerException ex) {
            logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
        }
    }

    /**
     * Indicates that there was an error with the specific ingest operation, but
     * it's still okay to continue ingesting files.
@@ -425,3 +370,211 @@ class Ingester {
        }
    }
}

/**
 * Encapsulates the content chunking algorithm in an implementation of the
 * Iterator interface. Also implements Iterable so it can be used directly in a
 * for loop. The base chunk is the part of the chunk before the overlapping
 * window. The window will be included at the end of the current chunk as well
 * as at the beginning of the next chunk.
 */
@NotThreadSafe
class Chunker implements Iterator<Chunk>, Iterable<Chunk> {

    //Chunking algorithm parameters-------------------------------------//
    /** the maximum size of a chunk, including the window. */
    private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes
    /** the minimum to read before we start the process of looking for
     * whitespace to break at and creating an overlapping window. */
    private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
    /** The maximum size of the chunk, before the overlapping window, even if we
     * couldn't find whitespace to break at. */
    private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
    /** The amount of text we will read through before we give up on finding
     * whitespace to break the chunk/window at. */
    private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
    /** The number of characters to read in one go from the Reader. */
    private static final int READ_CHARS_BUFFER_SIZE = 512; //chars

    ////chunker state--------------------------------------------///
    /** The Reader that this chunker reads from, and divides into chunks. It must
     * be a buffered reader to ensure that mark/reset are supported. */
    private final BufferedReader reader;
    /** The local buffer of characters read from the Reader. */
    private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
    /** number of chars read in the most recent read operation. */
    private int charsRead = 0;

    /** The text of the current chunk (so far). */
    private StringBuilder currentChunk;
    /** the size in bytes of the chunk (so far). */
    private int chunkSizeBytes = 0;
    /** the size in chars of the (base) chunk (so far). */
    private int baseChunkSizeChars;

    /** has the chunker found whitespace to break on? */
    private boolean whitespaceFound = false;
    /** has the chunker reached the end of the Reader? If so, there are no more
     * chunks, and the current chunk does not need a window. */
    private boolean endOfReaderReached = false;

    /**
     * Create a Chunker that will chunk the content of the given Reader.
     *
     * @param reader The content to chunk.
     */
    Chunker(BufferedReader reader) {
        this.reader = reader;
    }

    @Override
    public Iterator<Chunk> iterator() {
        return this;
    }

    @Override
    public boolean hasNext() {
        return endOfReaderReached == false;
    }

    /**
     * Sanitize the given StringBuilder by replacing non-UTF-8 characters with
     * caret '^'
     *
     * @param sb the StringBuilder to sanitize
     *
     * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
     * function?
     */
    private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
        final int length = sb.length();
        for (int i = 0; i < length; i++) {
            if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
                sb.replace(i, i + 1, "^");
            }
        }
        return sb;
    }

    @Override
    public Chunk next() {
        if (endOfReaderReached) {
            throw new NoSuchElementException("There are no more chunks.");
        }
        //reset state for the next chunk
        currentChunk = new StringBuilder();
        chunkSizeBytes = 0;
        baseChunkSizeChars = 0;

        try {
            readBaseChunk();
            baseChunkSizeChars = currentChunk.length();
            reader.mark(2048); //mark the reader so we can rewind the reader here to begin the next chunk
            readWindow();
        } catch (IOException ioEx) {
            throw new RuntimeException("IOException while reading chunk.", ioEx);
        }
        try {
            reader.reset(); //reset the reader so the next chunk can begin at the position marked above
        } catch (IOException ex) {
            throw new RuntimeException("IOException while resetting chunk reader.", ex);
        }

        if (endOfReaderReached) {
            /* if we have reached the end of the content, we won't make another
             * overlapping chunk, so the base chunk can be extended to the end. */
            baseChunkSizeChars = currentChunk.length();
        }
        //sanitize the text and return a Chunk object, that includes the base chunk length.
        return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
    }

    /**
     * Read the base chunk from the reader, and attempt to break at whitespace.
     *
     * @throws IOException if there is a problem reading from the reader.
     */
    private void readBaseChunk() throws IOException {
        //read the chunk until the minimum base chunk size
        readHelper(MINIMUM_BASE_CHUNK_SIZE, false);
        //keep reading until the maximum base chunk size or white space is reached.
        whitespaceFound = false;
        readHelper(MAXIMUM_BASE_CHUNK_SIZE, true);

    }

    /**
     * Read the window from the reader, and attempt to break at whitespace.
     *
     * @throws IOException if there is a problem reading from the reader.
     */
    private void readWindow() throws IOException {
        //read the window, leaving some room to look for white space to break at.
        int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, chunkSizeBytes + 1024);
        readHelper(windowEnd, false);
        whitespaceFound = false;
        //keep reading until the max chunk size, or until whitespace is reached.
        windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE, chunkSizeBytes + 1024);
        readHelper(windowEnd, true);
    }

    /** Helper method that implements reading in a loop.
     *
     * @param maxBytes           The max cumulative length of the content, in
     *                           bytes, to read from the Reader. That is, when
     *                           chunkSizeBytes >= maxBytes stop reading.
     * @param inWhiteSpaceBuffer Should the current read stop once whitespace is
     *                           found?
     *
     * @throws IOException If there is a problem reading from the Reader.
     */
    private void readHelper(int maxBytes, boolean inWhiteSpaceBuffer) throws IOException {
        //only read one character at a time if we are looking for whitespace.
        final int readSize = inWhiteSpaceBuffer ? 1 : READ_CHARS_BUFFER_SIZE;

        //read chars up to maxBytes, whitespaceFound if also inWhiteSpaceBuffer, or we reach the end of the reader.
        while ((chunkSizeBytes < maxBytes)
                && (false == (inWhiteSpaceBuffer && whitespaceFound))
                && (endOfReaderReached == false)) {
            charsRead = reader.read(tempChunkBuf, 0, readSize);
            if (-1 == charsRead) {
                //this is the last chunk
                endOfReaderReached = true;
            } else {
                if (inWhiteSpaceBuffer) {
                    //check for whitespace.
                    whitespaceFound = Character.isWhitespace(tempChunkBuf[0]);
                }

                //add read chars to the chunk and update the length.
                String chunkSegment = new String(tempChunkBuf, 0, charsRead);
                chunkSizeBytes += Utf8.encodedLength(chunkSegment);
                currentChunk.append(chunkSegment);
            }
        }
    }
}

/**
 * Represents one chunk as the text in it and the length of the base chunk, in
 * chars.
 */
class Chunk {

    private final StringBuilder sb;
    private final int chunksize;

    Chunk(StringBuilder sb, int baseChunkLength) {
        this.sb = sb;
        this.chunksize = baseChunkLength;
    }

    @Override
    public String toString() {
        return sb.toString();
    }

    int getBaseChunkLength() {
        return chunksize;
    }
}
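To make the base-plus-overlapping-window idea concrete, here is a toy, self-contained sketch of the mark/reset technique Chunker.next() uses. The BASE_SIZE and WINDOW_SIZE constants and the OverlapDemo class are tiny hypothetical stand-ins for the production byte limits, and none of the whitespace or UTF-8 handling is reproduced.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;

// Each chunk is a "base" plus an overlapping "window"; the reader is rewound to the
// end of the base so the next chunk starts inside the window.
public class OverlapDemo {

    private static final int BASE_SIZE = 16;   // toy stand-in for the base chunk size
    private static final int WINDOW_SIZE = 8;  // toy stand-in for the overlapping window

    public static void main(String[] args) throws IOException {
        String text = "the quick brown fox jumps over the lazy dog and keeps on running";
        try (BufferedReader reader = new BufferedReader(new StringReader(text))) {
            char[] buf = new char[BASE_SIZE + WINDOW_SIZE];
            while (true) {
                int baseRead = reader.read(buf, 0, BASE_SIZE);            // read the base chunk
                if (baseRead == -1) {
                    break;                                                // nothing left to chunk
                }
                reader.mark(WINDOW_SIZE + 1);                             // remember where the next chunk should start
                int windowRead = reader.read(buf, baseRead, WINDOW_SIZE); // read the overlapping window
                int total = baseRead + Math.max(windowRead, 0);
                System.out.println("chunk: [" + new String(buf, 0, total) + "]");
                if (windowRead == -1) {
                    break;                                                // end of reader: no window, no next chunk
                }
                reader.reset();                                           // rewind so the window is re-read as the next base
            }
        }
    }
}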
@@ -149,6 +149,12 @@ public class Server {
                return "num_chunks"; //NON-NLS
            }
        },
        CHUNK_SIZE {
            @Override
            public String toString() {
                return "chunk_size"; //NON-NLS
            }
        }
    };

    public static final String HL_ANALYZE_CHARS_UNLIMITED = "500000"; //max 1MB in a chunk. use -1 for unlimited, but -1 option may not be supported (not documented)
@@ -25,6 +25,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
@@ -37,6 +38,8 @@ import org.sleuthkit.datamodel.TskException;
 */
class StringsTextExtractor extends FileTextExtractor {

    static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());

    /**
     * Options for this extractor
     */
@@ -91,7 +94,12 @@ class StringsTextExtractor extends FileTextExtractor {
    }

    @Override
    boolean isDisabled() {
    public void logWarning(final String msg, Exception ex) {
        logger.log(Level.WARNING, msg, ex); //NON-NLS }
    }

    @Override
    public boolean isDisabled() {
        boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
        boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));

@@ -99,11 +107,11 @@ class StringsTextExtractor extends FileTextExtractor {
    }

    @Override
    InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
    public InputStreamReader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
        InputStream stringStream = getInputStream(sourceFile);
        return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
    }

    @Override
    InputStream getInputStream(AbstractFile sourceFile) {
        //check which extract stream to use
        if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
@@ -18,10 +18,7 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.InputStream;
import java.io.Reader;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;

/**
@@ -31,9 +28,8 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 * @param <TextSource> The subtype of SleuthkitVisitableItem an implementation
 *                     is able to process.
 */
abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
interface TextExtractor< TextSource extends SleuthkitVisitableItem> {

    static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());

    /**
     * Is this extractor configured such that no extraction will/should be done?
@@ -48,18 +44,8 @@ abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
     * @param msg
     * @param ex
     */
    void logWarning(String msg, Exception ex) {
        logger.log(Level.WARNING, msg, ex); //NON-NLS }
    }
    abstract void logWarning(String msg, Exception ex);

    /**
     * Get an input stream over the content of the given source.
     *
     * @param source
     *
     * @return
     */
    abstract InputStream getInputStream(TextSource source);

    /**
     * Get a reader over the text extracted from the given source.
@@ -71,7 +57,7 @@ abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;
    abstract Reader getReader(TextSource source) throws Ingester.IngesterException;

    /**
     * Get the 'object' id of the given source.
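The shift above, from an abstract class with getInputStream plus getReader(stream, source) to an interface whose getReader(source) hides stream acquisition, can be illustrated in isolation. The SimpleTextExtractor and StringSourceExtractor names below are hypothetical, not the Autopsy types; this is a sketch of the design shape only.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

// Callers ask an extractor for a Reader over a source and never see the underlying stream.
interface SimpleTextExtractor<S> {
    Reader getReader(S source) throws IOException;
    String getName(S source);
}

class StringSourceExtractor implements SimpleTextExtractor<String> {
    @Override
    public Reader getReader(String source) {
        return new StringReader(source); // stream acquisition is an implementation detail
    }

    @Override
    public String getName(String source) {
        return "in-memory-string";
    }
}

public class ExtractorDemo {
    public static void main(String[] args) throws IOException {
        SimpleTextExtractor<String> extractor = new StringSourceExtractor();
        try (BufferedReader reader = new BufferedReader(extractor.getReader("hello extractor"))) {
            System.out.println(extractor.getName("hello extractor") + ": " + reader.readLine());
        }
    }
}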
@@ -20,7 +20,6 @@ package org.sleuthkit.autopsy.keywordsearch;

import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.List;
import java.util.MissingResourceException;
@@ -36,6 +35,7 @@ import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
@@ -51,22 +51,25 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 */
class TikaTextExtractor extends FileTextExtractor {

    static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
    private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();

    private static final List<String> TIKA_SUPPORTED_TYPES
            = new Tika().getParser().getSupportedTypes(new ParseContext())
                    .parallelStream()
                    .stream()
                    .map(mt -> mt.getType() + "/" + mt.getSubtype())
                    .collect(Collectors.toList());

    @Override
    void logWarning(final String msg, Exception ex) {
    public void logWarning(final String msg, Exception ex) {
        KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
        super.logWarning(msg, ex);
        logger.log(Level.WARNING, msg, ex); //NON-NLS }
    }

    @Override
    Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
    public Reader getReader(AbstractFile sourceFile) throws IngesterException, MissingResourceException {
        ReadContentInputStream stream = new ReadContentInputStream(sourceFile);

        Metadata metadata = new Metadata();
        //Parse the file in a task, a convenient way to have a timeout...
        final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
@@ -125,13 +128,9 @@ class TikaTextExtractor extends FileTextExtractor {
        return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
    }

    @Override
    InputStream getInputStream(AbstractFile sourceFile1) {
        return new ReadContentInputStream(sourceFile1);
    }

    @Override
    boolean isDisabled() {
    public boolean isDisabled() {
        return false;
    }

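The comment in getReader above notes that the parse runs in a task as "a convenient way to have a timeout". Below is a standalone sketch of that executor/Future pattern, with a sleeping Callable standing in for the Tika parse; TimeoutDemo and the timing values are hypothetical.

import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

// Submit the work to an executor and bound the wait on the Future; cancel if it runs long.
public class TimeoutDemo {
    public static void main(String[] args) throws Exception {
        ExecutorService executor = Executors.newSingleThreadExecutor();
        Callable<String> slowParse = () -> {
            Thread.sleep(2_000);            // stand-in for an expensive parse
            return "parsed text";
        };
        Future<String> future = executor.submit(slowParse);
        try {
            String result = future.get(500, TimeUnit.MILLISECONDS); // bound the wait
            System.out.println(result);
        } catch (TimeoutException ex) {
            future.cancel(true);            // interrupt the runaway task
            System.out.println("parse timed out; task cancelled");
        } finally {
            executor.shutdownNow();
        }
    }
}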