From d5e7c520c969d294fddc6a48ecfe05ae1c0c7787 Mon Sep 17 00:00:00 2001
From: millmanorama
Date: Wed, 30 Nov 2016 10:48:30 +0100
Subject: [PATCH 01/21] RegressionTest sets datasource time zone to "America/New_York"

---
 .../src/org/sleuthkit/autopsy/testing/RegressionTest.java | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Testing/test/qa-functional/src/org/sleuthkit/autopsy/testing/RegressionTest.java b/Testing/test/qa-functional/src/org/sleuthkit/autopsy/testing/RegressionTest.java
index 12e04aad94..33d73c2044 100755
--- a/Testing/test/qa-functional/src/org/sleuthkit/autopsy/testing/RegressionTest.java
+++ b/Testing/test/qa-functional/src/org/sleuthkit/autopsy/testing/RegressionTest.java
@@ -35,7 +35,6 @@ import java.util.logging.Level;
 import java.util.logging.Logger;
 import javax.imageio.ImageIO;
 import javax.swing.JDialog;
-import javax.swing.JLabel;
 import javax.swing.JTextField;
 import junit.framework.Test;
 import junit.framework.TestCase;
@@ -50,10 +49,10 @@ import org.netbeans.jemmy.operators.JComboBoxOperator;
 import org.netbeans.jemmy.operators.JDialogOperator;
 import org.netbeans.jemmy.operators.JFileChooserOperator;
 import org.netbeans.jemmy.operators.JLabelOperator;
+import org.netbeans.jemmy.operators.JListOperator;
 import org.netbeans.jemmy.operators.JTabbedPaneOperator;
 import org.netbeans.jemmy.operators.JTableOperator;
 import org.netbeans.jemmy.operators.JTextFieldOperator;
-import org.netbeans.jemmy.operators.JListOperator;
 import org.netbeans.junit.NbModuleSuite;
 import org.sleuthkit.autopsy.ingest.IngestManager;
 
@@ -186,6 +185,8 @@ public class RegressionTest extends TestCase {
         String img_path = getEscapedPath(System.getProperty("img_path"));
         String imageDir = img_path;
         ((JTextField) jtfo0.getSource()).setText(imageDir);
+        JComboBoxOperator comboBoxOperator = new JComboBoxOperator(wo, 1);
+        comboBoxOperator.setSelectedItem("(GMT-5:00) America/New_York");
         wo.btNext().clickMouse();
     }
 

From 0f1f8b22116a1e388d3fa69b8cce350a06497169 Mon Sep 17 00:00:00 2001
From: millmanorama
Date: Mon, 12 Dec 2016 15:41:24 +0100
Subject: [PATCH 02/21] refactor common chunking algorithm into TextExtractorBase, remove AbstractFileChunk

---
 .../keywordsearch/AbstractFileChunk.java      |  91 ---
 .../keywordsearch/ByteContentStream.java      |  18 +-
 .../keywordsearch/HtmlTextExtractor.java      | 295 +++++-----
 .../autopsy/keywordsearch/Ingester.java       | 171 ++----
 .../KeywordSearchIngestModule.java            |   4 +-
 .../keywordsearch/StringsTextExtractor.java   | 553 ++++++++++++++----
 .../autopsy/keywordsearch/TextExtractor.java  |  63 +-
 .../keywordsearch/TextExtractorBase.java      | 149 +++++
 .../keywordsearch/TikaTextExtractor.java      | 296 ++--------
 9 files changed, 850 insertions(+), 790 deletions(-)
 delete mode 100644 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java
 create mode 100644 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractorBase.java

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java
deleted file mode 100644
index 5253e5e240..0000000000
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2011-2016 Basis Technology Corp.
- * Contact: carrier sleuthkit org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.sleuthkit.autopsy.keywordsearch;
-
-import java.nio.charset.Charset;
-import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
-
-/**
- * A representation of a chunk of text from a file that can be used, when
- * supplied with an Ingester, to index the chunk for search.
- */
-final class AbstractFileChunk {
-
-    private final int chunkNumber;
-    private final TextExtractor textExtractor;
-
-    /**
-     * Constructs a representation of a chunk of text from a file that can be
-     * used, when supplied with an Ingester, to index the chunk for search.
-     *
-     * @param textExtractor A TextExtractor for the file.
-     * @param chunkNumber A sequence number for the chunk.
-     */
-    AbstractFileChunk(TextExtractor textExtractor, int chunkNumber) {
-        this.textExtractor = textExtractor;
-        this.chunkNumber = chunkNumber;
-    }
-
-    /**
-     * Gets the TextExtractor for the source file of the text chunk.
-     *
-     * @return A reference to the TextExtractor.
-     */
-    TextExtractor getTextExtractor() {
-        return textExtractor;
-    }
-
-    /**
-     * Gets the sequence number of the text chunk.
-     *
-     * @return The chunk number.
-     */
-    int getChunkNumber() {
-        return chunkNumber;
-    }
-
-    /**
-     * Gets the id of the text chunk.
-     *
-     * @return An id of the form [source file object id]_[chunk number]
-     */
-    String getChunkId() {
-        return Server.getChunkIdString(this.textExtractor.getSourceFile().getId(), this.chunkNumber);
-    }
-
-    /**
-     * Indexes the text chunk.
-     *
-     * @param ingester An Ingester to do the indexing.
-     * @param chunkBytes The raw bytes of the text chunk.
-     * @param chunkSize The size of the text chunk in bytes.
-     * @param charSet The char set to use during indexing.
- * - * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException - */ - void index(Ingester ingester, byte[] chunkBytes, long chunkSize, Charset charSet) throws IngesterException { - ByteContentStream bcs = new ByteContentStream(chunkBytes, chunkSize, textExtractor.getSourceFile(), charSet); - try { - ingester.ingest(this, bcs, chunkBytes.length); - } catch (Exception ex) { - throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", getChunkId()), ex); - } - } - -} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteContentStream.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteContentStream.java index d5a19712c0..c39e9b7bb5 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteContentStream.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteContentStream.java @@ -23,11 +23,9 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; -import java.nio.charset.Charset; - +import org.apache.solr.common.util.ContentStream; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; -import org.apache.solr.common.util.ContentStream; import org.sleuthkit.datamodel.AbstractContent; /** @@ -37,19 +35,17 @@ import org.sleuthkit.datamodel.AbstractContent; class ByteContentStream implements ContentStream { //input - private byte[] content; //extracted subcontent + private final byte[] content; //extracted subcontent private long contentSize; - private AbstractContent aContent; //origin - private Charset charset; //output byte stream charset of encoded strings + private final AbstractContent aContent; //origin - private InputStream stream; + private final InputStream stream; - private static Logger logger = Logger.getLogger(ByteContentStream.class.getName()); + private static final Logger logger = Logger.getLogger(ByteContentStream.class.getName()); - public ByteContentStream(byte[] content, long contentSize, AbstractContent aContent, Charset charset) { + public ByteContentStream(byte[] content, long contentSize, AbstractContent aContent) { this.content = content; this.aContent = aContent; - this.charset = charset; stream = new ByteArrayInputStream(content, 0, (int) contentSize); } @@ -63,7 +59,7 @@ class ByteContentStream implements ContentStream { @Override public String getContentType() { - return "text/plain;charset=" + charset.name(); //NON-NLS + return "text/plain;charset=" + Server.DEFAULT_INDEXED_TEXT_CHARSET.name(); //NON-NLS } @Override diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java index 27e9ccd637..6e8a57e258 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2012-2013 Basis Technology Corp. + * Copyright 2011-2016 Basis Technology Corp. 
* Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,15 +21,16 @@ package org.sleuthkit.autopsy.keywordsearch; import java.io.IOException; import java.io.InputStream; import java.io.Reader; -import java.nio.charset.Charset; +import java.io.StringReader; import java.util.Arrays; import java.util.List; -import java.util.Map; import java.util.logging.Level; +import net.htmlparser.jericho.Attributes; +import net.htmlparser.jericho.Renderer; +import net.htmlparser.jericho.Source; +import net.htmlparser.jericho.StartTag; +import net.htmlparser.jericho.StartTagType; import org.sleuthkit.autopsy.coreutils.Logger; -import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; -import org.sleuthkit.autopsy.ingest.IngestJobContext; -import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.ReadContentInputStream; @@ -38,19 +39,12 @@ import org.sleuthkit.datamodel.ReadContentInputStream; * divided into chunks and indexed with Solr. If HTML extraction succeeds, * chunks are indexed with Solr. */ -class HtmlTextExtractor implements TextExtractor { +class HtmlTextExtractor extends TextExtractorBase { private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName()); - private static Ingester ingester; - static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET; + static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; - private static final int SINGLE_READ_CHARS = 1024; - private static final int EXTRA_CHARS = 128; //for whitespace private static final int MAX_SIZE = 50000000; - //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM - private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS]; - private AbstractFile sourceFile; - private int numChunks = 0; static final List WEB_MIME_TYPES = Arrays.asList( "application/javascript", //NON-NLS @@ -59,154 +53,14 @@ class HtmlTextExtractor implements TextExtractor { "text/css", //NON-NLS "text/html", //NON-NLS NON-NLS "text/javascript" //NON-NLS - //"application/xml", - //"application/xml-dtd", ); HtmlTextExtractor() { - ingester = Server.getIngester(); } @Override - public boolean setScripts(List tags + scripts.append(tag.getElement().getContent()).append("\n"); + + } else if (tag.getName().equals("a")) { + //NON-NLS + numLinks++; + links.append(numLinks).append(") "); + links.append(tag.getTagContent()).append("\n"); + + } else if (tag.getName().equals("img")) { + //NON-NLS + numImages++; + images.append(numImages).append(") "); + images.append(tag.getTagContent()).append("\n"); + + } else if (tag.getTagType().equals(StartTagType.COMMENT)) { + numComments++; + comments.append(numComments).append(") "); + comments.append(tag.getTagContent()).append("\n"); + + } else { + // Make sure it has an attribute + Attributes atts = tag.getAttributes(); + if (atts != null && atts.length() > 0) { + numOthers++; + others.append(numOthers).append(") "); + others.append(tag.getName()).append(":"); + others.append(tag.getTagContent()).append("\n"); + + } + } + } + stringBuilder.append(text).append("\n\n"); + stringBuilder.append("----------NONVISIBLE TEXT----------\n\n"); //NON-NLS + if (numScripts > 0) { + stringBuilder.append("---Scripts---\n"); //NON-NLS + stringBuilder.append(scripts).append("\n"); + } + if (numLinks > 0) { + stringBuilder.append("---Links---\n"); //NON-NLS + stringBuilder.append(links).append("\n"); + } + if (numImages > 0) { + 
stringBuilder.append("---Images---\n"); //NON-NLS + stringBuilder.append(images).append("\n"); + } + if (numComments > 0) { + stringBuilder.append("---Comments---\n"); //NON-NLS + stringBuilder.append(comments).append("\n"); + } + if (numOthers > 0) { + stringBuilder.append("---Others---\n"); //NON-NLS + stringBuilder.append(others).append("\n"); + } + // All done, now make it a reader + return new StringReader(stringBuilder.toString()); + } catch (IOException ex) { + throw new Ingester.IngesterException("Error extracting HTML from content.", ex); + } + } + + @Override + Void newAppendixProvider() { + return null; + } + + InputStream getInputStream(AbstractFile sourceFile1) { + return new ReadContentInputStream(sourceFile1); + } + + @Override + boolean noExtractionOptionsAreEnabled() { + return false; } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java index 256d4508f2..3305a28a8d 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java @@ -22,13 +22,13 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.Reader; -import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; import java.util.logging.Level; import org.apache.solr.client.solrj.SolrServerException; -import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.util.ContentStream; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.TextUtil; @@ -42,7 +42,6 @@ import org.sleuthkit.datamodel.Directory; import org.sleuthkit.datamodel.File; import org.sleuthkit.datamodel.LayoutFile; import org.sleuthkit.datamodel.LocalFile; -import org.sleuthkit.datamodel.ReadContentInputStream; import org.sleuthkit.datamodel.SlackFile; import org.sleuthkit.datamodel.TskCoreException; @@ -54,13 +53,12 @@ class Ingester { private static final Logger logger = Logger.getLogger(Ingester.class.getName()); private volatile boolean uncommitedIngests = false; private final Server solrServer = KeywordSearch.getServer(); - private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV(); + private static final GetContentFieldsV getContentFieldsV = new GetContentFieldsV(); private static Ingester instance; //for ingesting chunk as SolrInputDocument (non-content-streaming, by-pass tika) //TODO use a streaming way to add content to /update handler private static final int MAX_DOC_CHUNK_SIZE = 1024 * 1024; - private static final String ENCODING = "UTF-8"; //NON-NLS private Ingester() { } @@ -84,60 +82,22 @@ class Ingester { } /** - * Sends a stream to Solr to have its content extracted and added to the - * index. commit() should be called once you're done ingesting files. + * Indexes the text chunk. * - * @param afscs File AbstractFileStringContentStream to ingest + * @param ingester An Ingester to do the indexing. + * @param chunkBytes The raw bytes of the text chunk. + * @param chunkSize The size of the text chunk in bytes. * - * @throws IngesterException if there was an error processing a specific - * file, but the Solr server is probably fine. 
+ * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException */ - void ingest(AbstractFileStringContentStream afscs) throws IngesterException { - Map params = getContentFields(afscs.getSourceContent()); - ingest(afscs, params, afscs.getSourceContent().getSize()); - } - - /** - * Sends a TextExtractor to Solr to have its content extracted and added to - * the index. commit() should be called once you're done ingesting files. - * FileExtract represents a parent of extracted file with actual content. - * The parent itself has no content, only meta data and is used to associate - * the extracted AbstractFileChunk - * - * @param fe TextExtractor to ingest - * - * @throws IngesterException if there was an error processing a specific - * file, but the Solr server is probably fine. - */ - void ingest(TextExtractor fe) throws IngesterException { - Map params = getContentFields(fe.getSourceFile()); - - params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(fe.getNumChunks())); - - ingest(new NullContentStream(fe.getSourceFile()), params, 0); - } - - /** - * Sends a AbstractFileChunk to Solr and its extracted content stream to be - * added to the index. commit() should be called once you're done ingesting - * files. AbstractFileChunk represents a file chunk and its chunk content. - * - * @param fec AbstractFileChunk to ingest - * @param size approx. size of the stream in bytes, used for timeout - * estimation - * - * @throws IngesterException if there was an error processing a specific - * file, but the Solr server is probably fine. - */ - void ingest(AbstractFileChunk fec, ByteContentStream bcs, int size) throws IngesterException { - AbstractContent sourceContent = bcs.getSourceContent(); - Map params = getContentFields(sourceContent); - - //overwrite id with the chunk id - params.put(Server.Schema.ID.toString(), - Server.getChunkIdString(sourceContent.getId(), fec.getChunkNumber())); - - ingest(bcs, params, size); + void indexChunk(AbstractFile chunkSource, byte[] chunkBytes, long chunkSize, String chunkID) throws IngesterException { + ByteContentStream bcs = new ByteContentStream(chunkBytes, chunkSize, chunkSource); + Map fields = getContentFields(chunkSource); + try { + ingest(bcs, fields, chunkBytes.length); + } catch (Exception ex) { + throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkID), ex); + } } /** @@ -153,12 +113,25 @@ class Ingester { * @throws IngesterException if there was an error processing a specific * file, but the Solr server is probably fine. */ - void ingest(AbstractFile file, boolean ingestContent) throws IngesterException { - if (ingestContent == false || file.isDir()) { - ingest(new NullContentStream(file), getContentFields(file), 0); - } else { - ingest(new FscContentStream(file), getContentFields(file), file.getSize()); - } + void indexMetaDataOnly(AbstractFile file) throws IngesterException { + ingest(new NullContentStream(file), getContentFields(file), 0); + } + /** + * Sends a TextExtractor to Solr to have its content extracted and added to + * the index. commit() should be called once you're done ingesting files. + * FileExtract represents a parent of extracted file with actual content. + * The parent itself has no content, only meta data and is used to associate + * the extracted AbstractFileChunk + * + * @param fe TextExtractor to ingest + * + * @throws IngesterException if there was an error processing a specific + * file, but the Solr server is probably fine. 
+ */ + void recordNumberOfChunks(AbstractFile file, int numChunks) throws IngesterException { + Map params = getContentFields(file); + params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks)); + ingest(new NullContentStream(file), params, 0); } /** @@ -168,14 +141,14 @@ class Ingester { * * @return the map */ - private Map getContentFields(AbstractContent fsc) { + Map getContentFields(AbstractContent fsc) { return fsc.accept(getContentFieldsV); } /** * Visitor used to create param list to send to SOLR index. */ - private class GetContentFieldsV extends ContentVisitor.Default> { + static private class GetContentFieldsV extends ContentVisitor.Default> { @Override protected Map defaultVisit(Content cntnt) { @@ -304,25 +277,19 @@ class Ingester { if (read != 0) { String s = ""; - try { - s = new String(docChunkContentBuf, 0, read, ENCODING); - // Sanitize by replacing non-UTF-8 characters with caret '^' before adding to index - char[] chars = null; - for (int i = 0; i < s.length(); i++) { - if (!TextUtil.isValidSolrUTF8(s.charAt(i))) { - // only convert string to char[] if there is a non-UTF8 character - if (chars == null) { - chars = s.toCharArray(); - } - chars[i] = '^'; + s = new String(docChunkContentBuf, 0, read, StandardCharsets.UTF_8); + char[] chars = null; + for (int i = 0; i < s.length(); i++) { + if (!TextUtil.isValidSolrUTF8(s.charAt(i))) { + // only convert string to char[] if there is a non-UTF8 character + if (chars == null) { + chars = s.toCharArray(); } + chars[i] = '^'; } - // check if the string was modified (i.e. there was a non-UTF8 character found) - if (chars != null) { - s = new String(chars); - } - } catch (UnsupportedEncodingException ex) { - logger.log(Level.SEVERE, "Unsupported encoding", ex); //NON-NLS + } + if (chars != null) { + s = new String(chars); } updateDoc.addField(Server.Schema.CONTENT.toString(), s); } else { @@ -380,48 +347,6 @@ class Ingester { } } - /** - * ContentStream to read() the data from a FsContent object - */ - private static class FscContentStream implements ContentStream { - - private AbstractFile f; - - FscContentStream(AbstractFile f) { - this.f = f; - } - - @Override - public String getName() { - return f.getName(); - } - - @Override - public String getSourceInfo() { - return NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getSrcInfo", f.getId()); - } - - @Override - public String getContentType() { - return null; - } - - @Override - public Long getSize() { - return f.getSize(); - } - - @Override - public InputStream getStream() throws IOException { - return new ReadContentInputStream(f); - } - - @Override - public Reader getReader() throws IOException { - throw new UnsupportedOperationException( - NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getReader")); - } - } /** * ContentStream associated with FsContent, but forced with no content diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java index c0ced07107..c3b997fa58 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java @@ -512,7 +512,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule { if (context.fileIngestIsCancelled()) { return; } - ingester.ingest(aFile, false); //meta-data only + ingester.indexMetaDataOnly(aFile); putIngestStatus(jobId, 
aFile.getId(), IngestStatus.METADATA_INGESTED); } catch (IngesterException ex) { putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING); @@ -539,7 +539,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule { if (context.fileIngestIsCancelled()) { return; } - ingester.ingest(aFile, false); //meta-data only + ingester.indexMetaDataOnly(aFile); putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED); } catch (IngesterException ex) { putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java index 3bbc97dcfc..93c6c786fa 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2014 Basis Technology Corp. + * Copyright 2011-2016 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,155 +20,119 @@ package org.sleuthkit.autopsy.keywordsearch; import java.io.IOException; import java.io.InputStream; -import java.nio.charset.Charset; +import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.logging.Level; import org.sleuthkit.autopsy.coreutils.Logger; +import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; -import org.sleuthkit.autopsy.ingest.IngestJobContext; -import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; import org.sleuthkit.datamodel.AbstractFile; +import org.sleuthkit.datamodel.TskCoreException; +import org.sleuthkit.datamodel.TskException; /** - * Takes an AbstractFile, extract strings, converts into chunks (associated with - * the original source file) up to 1MB then and indexes chunks as text with Solr + * Takes an AbstractFile, extracts strings, converts into chunks (associated + * with the original source file) up to 1MB then and indexes chunks as text with + * Solr. */ -class StringsTextExtractor implements TextExtractor { +class StringsTextExtractor extends TextExtractorBase { - private static Ingester ingester; private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName()); private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L; - //private static final int BOM_LEN = 3; - private static final int BOM_LEN = 0; //disabled prepending of BOM - private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET; - private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2; - private AbstractFile sourceFile; - private int numChunks = 0; private final List