mirror of https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-16 17:57:43 +00:00

Merge in develop branch with text extraction refactoring

This commit is contained in: be7bdced90
@@ -1,91 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.nio.charset.Charset;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;

/**
 * A representation of a chunk of text from a file that can be used, when
 * supplied with an Ingester, to index the chunk for search.
 */
final class AbstractFileChunk {

    private final int chunkNumber;
    private final TextExtractor textExtractor;

    /**
     * Constructs a representation of a chunk of text from a file that can be
     * used, when supplied with an Ingester, to index the chunk for search.
     *
     * @param textExtractor A TextExtractor for the file.
     * @param chunkNumber   A sequence number for the chunk.
     */
    AbstractFileChunk(TextExtractor textExtractor, int chunkNumber) {
        this.textExtractor = textExtractor;
        this.chunkNumber = chunkNumber;
    }

    /**
     * Gets the TextExtractor for the source file of the text chunk.
     *
     * @return A reference to the TextExtractor.
     */
    TextExtractor getTextExtractor() {
        return textExtractor;
    }

    /**
     * Gets the sequence number of the text chunk.
     *
     * @return The chunk number.
     */
    int getChunkNumber() {
        return chunkNumber;
    }

    /**
     * Gets the id of the text chunk.
     *
     * @return An id of the form [source file object id]_[chunk number]
     */
    String getChunkId() {
        return Server.getChunkIdString(this.textExtractor.getSourceFile().getId(), this.chunkNumber);
    }

    /**
     * Indexes the text chunk.
     *
     * @param ingester   An Ingester to do the indexing.
     * @param chunkBytes The raw bytes of the text chunk.
     * @param chunkSize  The size of the text chunk in bytes.
     * @param charSet    The char set to use during indexing.
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    void index(Ingester ingester, byte[] chunkBytes, long chunkSize, Charset charSet) throws IngesterException {
        ByteContentStream bcs = new ByteContentStream(chunkBytes, chunkSize, textExtractor.getSourceFile(), charSet);
        try {
            ingester.ingest(this, bcs, chunkBytes.length);
        } catch (Exception ex) {
            throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", getChunkId()), ex);
        }
    }
}
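Note: for orientation, a minimal sketch of how this chunk class was driven by the extractors in this package (hypothetical caller code; `extractor`, `ingester`, `extractedText`, and `chunkNumber` are assumed to already exist and are not part of this diff):

    // Hypothetical sketch: index one chunk of already-extracted text.
    byte[] chunkBytes = extractedText.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
    AbstractFileChunk chunk = new AbstractFileChunk(extractor, chunkNumber + 1);
    chunk.index(ingester, chunkBytes, chunkBytes.length, Server.DEFAULT_INDEXED_TEXT_CHARSET);
    // The chunk is indexed under chunk.getChunkId(), i.e.
    // [source file object id]_[chunk number].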
@@ -1,92 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;

import org.openide.util.NbBundle;
import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.datamodel.AbstractContent;
import org.sleuthkit.datamodel.AbstractFile;

/**
 * Wrapper over InputStream that implements ContentStream to feed to Solr.
 */
class AbstractFileStringContentStream implements ContentStream {

    //input
    private final AbstractFile content;
    private final Charset charset;
    //converted
    private final InputStream stream;

    public AbstractFileStringContentStream(AbstractFile content, Charset charset, InputStream inputStream) {
        this.content = content;
        this.charset = charset;
        this.stream = inputStream;
    }

    public AbstractContent getSourceContent() {
        return content;
    }

    @Override
    public String getContentType() {
        return "text/plain;charset=" + charset.name(); //NON-NLS
    }

    @Override
    public String getName() {
        return content.getName();
    }

    @Override
    public Reader getReader() throws IOException {
        return new InputStreamReader(stream);
    }

    @Override
    public Long getSize() {
        //return convertedLength;
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "AbstractFileStringContentStream.getSize.exception.msg"));
    }

    @Override
    public String getSourceInfo() {
        return NbBundle.getMessage(this.getClass(), "AbstractFileStringContentStream.getSrcInfo.text", content.getId());
    }

    @Override
    public InputStream getStream() throws IOException {
        return stream;
    }

    @Override
    protected void finalize() throws Throwable {
        super.finalize();

        stream.close();
    }
}
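Note: one plausible composition of this wrapper, sketched with assumed names (pairing it with the string-extraction streams below is an assumption for illustration, not something this diff shows directly):

    // Hypothetical sketch: expose extracted strings to Solr as a ContentStream.
    InputStream stringStream = new AbstractFileStringIntStream(
            file, enabledScripts, true, true, StandardCharsets.UTF_8); // file and script list assumed
    ContentStream contentStream = new AbstractFileStringContentStream(
            file, StandardCharsets.UTF_8, stringStream);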
@@ -1,213 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.List;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractResult;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskCoreException;

/**
 * Wrapper over StringExtract that provides a streaming API. Given an
 * AbstractFile object, extracts international strings from the file and reads
 * the output as a stream of UTF-8 strings encoded as bytes.
 */
class AbstractFileStringIntStream extends InputStream {

    private static final Logger logger = Logger.getLogger(AbstractFileStringIntStream.class.getName());
    private static final int FILE_BUF_SIZE = 1024 * 1024;
    private AbstractFile content;
    private final byte[] oneCharBuf = new byte[1];
    private final StringExtract stringExtractor;
    private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
    private long fileReadOffset = 0L;
    private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user
    private int convertBuffOffset = 0; //offset to start returning data to user on next read()
    private int bytesInConvertBuff = 0; //amount of data currently in the buffer
    private boolean fileEOF = false; //true if the file has no more bytes to read
    private boolean extractUTF8;
    private boolean extractUTF16;
    private Charset outCharset;

    private StringExtractResult lastExtractResult;

    /**
     * Constructs a new stream object that converts a file to extracted
     * strings, then to a byte stream, for the specified scripts, with
     * auto-detected encoding (UTF8, UTF16LE, UTF16BE), and the specified
     * output byte stream encoding.
     *
     * @param content      input content to process and turn into a stream to
     *                     convert into strings
     * @param scripts      a list of scripts to consider
     * @param extractUTF8  whether to extract utf8 encoding
     * @param extractUTF16 whether to extract utf16 encoding
     * @param outCharset   encoding to use in the output byte stream
     */
    public AbstractFileStringIntStream(AbstractFile content, List<SCRIPT> scripts, boolean extractUTF8,
            boolean extractUTF16, Charset outCharset) {
        this.content = content;
        this.stringExtractor = new StringExtract();
        this.stringExtractor.setEnabledScripts(scripts);
        this.extractUTF8 = extractUTF8;
        this.extractUTF16 = extractUTF16;
        this.outCharset = outCharset;
        this.stringExtractor.setEnableUTF8(extractUTF8);
        this.stringExtractor.setEnableUTF16(extractUTF16);
    }

    @Override
    public int read() throws IOException {
        if (extractUTF8 == false && extractUTF16 == false) {
            return -1;
        }
        final int read = read(oneCharBuf, 0, 1);
        if (read == 1) {
            return oneCharBuf[0];
        } else {
            return -1;
        }
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (b == null) {
            throw new NullPointerException();
        } else if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        } else if (len == 0) {
            return 0;
        }

        if (extractUTF8 == false && extractUTF16 == false) {
            return -1;
        }

        long fileSize = content.getSize();
        if (fileSize == 0) {
            return -1;
        }

        //read and convert until user buffer full
        //we have data if file can be read or when byteBuff has converted strings to return
        int bytesToUser = 0; //returned to user so far
        int offsetUser = off;
        while (bytesToUser < len && offsetUser < len) {
            //check if we have enough converted strings
            int convertBuffRemain = bytesInConvertBuff - convertBuffOffset;

            if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
                try {
                    //convert more strings, store in buffer
                    long toRead = 0;
                    //int shiftSize = 0;

                    //if (lastExtractResult != null && lastExtractResult.getTextLength() != 0
                    //        && (shiftSize = FILE_BUF_SIZE - lastExtractResult.getFirstUnprocessedOff()) > 0) {
                    //    //a string previously extracted
                    //    //shift the fileReadBuff past last bytes extracted
                    //    //read only what's needed to fill the buffer
                    //    //to avoid losing chars and breaking or corrupting potential strings - preserve byte stream continuity
                    //    byte[] temp = new byte[shiftSize];
                    //    System.arraycopy(fileReadBuff, lastExtractResult.getFirstUnprocessedOff(),
                    //            temp, 0, shiftSize);
                    //    System.arraycopy(temp, 0, fileReadBuff, 0, shiftSize);
                    //    toRead = Math.min(lastExtractResult.getFirstUnprocessedOff(), fileSize - fileReadOffset);
                    //    lastExtractResult = null;
                    //} else {
                    //fill up entire fileReadBuff fresh
                    toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
                    //}
                    int read = content.read(fileReadBuff, fileReadOffset, toRead);
                    if (read == -1 || read == 0) {
                        fileEOF = true;
                    } else {
                        fileReadOffset += read;
                        if (fileReadOffset >= fileSize) {
                            fileEOF = true;
                        }

                        //put converted string in convertBuff
                        convert(read);
                        convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
                    }
                } catch (TskCoreException ex) {
                    //Exceptions.printStackTrace(ex);
                    fileEOF = true;
                }
            }

            //nothing more to read, and no more bytes in convertBuff
            if (convertBuff == null || convertBuffRemain == 0) {
                if (fileEOF) {
                    return bytesToUser > 0 ? bytesToUser : -1;
                } else {
                    //no strings extracted, try another read
                    continue;
                }
            }

            //return part or all of convert buff to user
            final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
            System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);

            //DEBUG
            /*
             * if (toCopy > 0) { FileOutputStream debug = new
             * FileOutputStream("c:\\temp\\" + content.getName(), true);
             * debug.write(b, offsetUser, toCopy); debug.close(); }
             */
            convertBuffOffset += toCopy;
            offsetUser += toCopy;

            bytesToUser += toCopy;
        }

        //if more string data in convertBuff, will be consumed on next read()
        return bytesToUser;
    }

    /**
     * Converts bytes in the file buffer to a string, and encodes the string
     * into convertBuff.
     *
     * @param numBytes num bytes in the fileReadBuff
     */
    private void convert(int numBytes) {
        lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
        convertBuff = lastExtractResult.getText().getBytes(outCharset);

        //reset tracking vars
        if (lastExtractResult.getNumBytes() == 0) {
            bytesInConvertBuff = 0;
        } else {
            bytesInConvertBuff = convertBuff.length;
        }
        convertBuffOffset = 0;
    }
}
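Note: a minimal, hedged sketch of draining this stream (the file and script selection are assumed; the buffer size is arbitrary):

    // Hypothetical sketch: read extracted, UTF-8-encoded strings from a file.
    List<SCRIPT> scripts = Arrays.asList(SCRIPT.LATIN_1); // assumed script selection
    InputStream strings = new AbstractFileStringIntStream(file, scripts, true, true, StandardCharsets.UTF_8);
    byte[] buf = new byte[8192];
    int n;
    while ((n = strings.read(buf, 0, buf.length)) != -1) {
        // hand buf[0..n) to the indexing code, e.g. wrapped in a ByteContentStream
    }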
@@ -1,296 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskException;

/**
 * AbstractFile input string stream reader/converter - given an AbstractFile,
 * extracts strings from it and returns encoded bytes via read().
 *
 * Note: the utility supports extraction of only the LATIN script and UTF8,
 * UTF16LE, UTF16BE encodings and uses brute force encoding detection - it's
 * fast but could apply multiple encodings to the same string.
 *
 * For other script/language support and better encoding detection use the
 * AbstractFileStringIntStream streaming class, which wraps around the
 * StringExtract extractor.
 */
class AbstractFileStringStream extends InputStream {

    //args
    private AbstractFile content;
    private Charset outputCharset;
    //internal data
    private static final Logger logger = Logger.getLogger(AbstractFileStringStream.class.getName());
    private static final String NLS = Character.toString((char) 10); //new line
    private static final int READ_BUF_SIZE = 256;
    private long contentOffset = 0; //offset in fscontent read into curReadBuf
    private final byte[] curReadBuf = new byte[READ_BUF_SIZE];
    private int bytesInReadBuf = 0;
    private int readBufOffset = 0; //offset in read buf processed
    private StringBuilder curString = new StringBuilder();
    private int curStringLen = 0;
    private StringBuilder tempString = new StringBuilder();
    private int tempStringLen = 0;
    private boolean isEOF = false;
    private boolean stringAtTempBoundary = false; //if temp has part of string that didn't make it in previous read()
    private boolean stringAtBufBoundary = false; //if read buffer has string being processed, continue as string from prev read() in next read()
    private boolean inString = false; //if current temp has min chars required
    private final byte[] oneCharBuf = new byte[1];
    private final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string

    /**
     * Construct new string stream from FsContent
     *
     * @param content                to extract strings from
     * @param outputCharset          target encoding to index as
     * @param preserveOnBuffBoundary whether to preserve or split string on a
     *                               buffer boundary. If false, will pack into
     *                               read buffer up to max. possible,
     *                               potentially splitting a string. If true,
     *                               the string will be preserved for next read.
     */
    public AbstractFileStringStream(AbstractFile content, Charset outputCharset, boolean preserveOnBuffBoundary) {
        this.content = content;
        this.outputCharset = outputCharset;
        //this.preserveOnBuffBoundary = preserveOnBuffBoundary;
        //logger.log(Level.INFO, "FILE: " + content.getParentPath() + "/" + content.getName());
    }

    /**
     * Construct new string stream from FsContent. Do not attempt to fill the
     * entire read buffer if that would break a string.
     *
     * @param content    to extract strings from
     * @param outCharset target charset to encode into bytes and index as, e.g.
     *                   UTF-8
     */
    public AbstractFileStringStream(AbstractFile content, Charset outCharset) {
        this(content, outCharset, false);
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (b == null) {
            throw new NullPointerException();
        } else if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        } else if (len == 0) {
            return 0;
        }

        long fileSize = content.getSize();
        if (fileSize == 0) {
            return -1;
        }

        if (isEOF) {
            return -1;
        }

        if (stringAtTempBoundary) {
            //append entire temp string residual from previous read()
            //because qualified string was broken down into 2 parts
            appendResetTemp();

            stringAtTempBoundary = false;
            //there could be more to this string in fscontent/buffer
        }

        boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
        int newCurLen = curStringLen + tempStringLen;

        while (newCurLen < len) {
            //need to extract more strings
            if (readBufOffset > bytesInReadBuf - 1) {
                //no more bytes to process into strings, read them
                try {
                    bytesInReadBuf = 0;
                    bytesInReadBuf = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);
                } catch (TskException ex) {
                    if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
                        appendResetTemp();
                        //have some extracted string, return that, and fail next time
                        isEOF = true;
                        int copied = copyToReturn(b, off, len);
                        return copied;
                    } else {
                        return -1; //EOF
                    }
                }
                if (bytesInReadBuf < 1) {
                    if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
                        appendResetTemp();
                        //have some extracted string, return that, and fail next time
                        isEOF = true;
                        int copied = copyToReturn(b, off, len);
                        return copied;
                    } else {
                        return -1; //EOF
                    }
                }
                //increment content offset for next read
                contentOffset += bytesInReadBuf;
                //reset read buf position
                readBufOffset = 0;
            }
            //get char from cur read buf
            char c = (char) curReadBuf[readBufOffset++];
            if (c == 0 && singleConsecZero == false) {
                //preserve the current sequence if max consec. 1 zero char
                singleConsecZero = true;
            } else {
                singleConsecZero = false;
            }
            if (StringExtract.isPrintableAscii(c)) {
                tempString.append(c);
                ++tempStringLen;
                if (tempStringLen >= MIN_PRINTABLE_CHARS) {
                    inString = true;
                }

                //boundary case when temp has still chars - handled after the loop
            } else if (!singleConsecZero) {
                //break the string, clear temp
                if (tempStringLen >= MIN_PRINTABLE_CHARS
                        || stringAtBufBoundary) {
                    //append entire temp string with new line
                    tempString.append(NLS);
                    ++tempStringLen;

                    curString.append(tempString);
                    curStringLen += tempStringLen;

                    stringAtBufBoundary = false;
                }
                //reset temp
                tempString = new StringBuilder();
                tempStringLen = 0;
            }

            newCurLen = curStringLen + tempStringLen;
        }

        //check if still in string state, so that next chars in read buf bypass min chars check
        //and qualify as string even if less < min chars required
        if (inString) {
            inString = false; //reset
            stringAtBufBoundary = true; //will bypass the check
        }

        //check if temp still has chars to qualify as a string
        //we might need to break up temp into 2 parts for next read() call
        //consume as many as possible to fill entire user buffer
        if (tempStringLen >= MIN_PRINTABLE_CHARS) {
            if (newCurLen > len) {
                int appendChars = len - curStringLen;
                //save part for next user read(), need to break up temp string
                //do not append new line
                String toAppend = tempString.substring(0, appendChars);
                String newTemp = tempString.substring(appendChars);

                curString.append(toAppend);
                curStringLen += appendChars;

                tempString = new StringBuilder(newTemp);
                tempStringLen = newTemp.length();

                stringAtTempBoundary = true;

            } else {
                //append entire temp
                curString.append(tempString);
                curStringLen += tempStringLen;

                //reset temp
                tempString = new StringBuilder();
                tempStringLen = 0;

            }
        } else {
            //if temp has a few chars, not qualified as string for now,
            //will be processed during next read() call
        }

        //copy current strings to user
        final int copied = copyToReturn(b, off, len);
        //there may be still chars in read buffer or tempString, for next read()

        return copied;
    }

    //append temp buffer to cur string buffer and reset temp, if enough chars
    //does not append new line
    private void appendResetTemp() {
        if (tempStringLen >= MIN_PRINTABLE_CHARS) {
            curString.append(tempString);
            curStringLen += tempStringLen;
            tempString = new StringBuilder();
            tempStringLen = 0;
        }
    }

    //copy currently extracted string to user buffer
    //and reset for next read() call
    private int copyToReturn(byte[] b, int off, long len) {

        final String curStringS = curString.toString();
        //logger.log(Level.INFO, curStringS);
        byte[] stringBytes = curStringS.getBytes(outputCharset);
        System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
        //logger.log(Level.INFO, curStringS);
        //copied all string, reset
        curString = new StringBuilder();
        int ret = curStringLen;
        curStringLen = 0;
        return ret;

    }

    @Override
    public int read() throws IOException {
        final int read = read(oneCharBuf, 0, 1);
        if (read == 1) {
            return oneCharBuf[0];
        } else {
            return -1;
        }
    }

    @Override
    public int available() throws IOException {
        //we don't know how many bytes in curReadBuf may end up as strings
        return 0;
    }

    @Override
    public long skip(long n) throws IOException {
        //use default implementation that reads into skip buffer
        //but it could be more efficient
        return super.skip(n);
    }
}
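Note: to make the division of labor between the two string streams concrete, a hypothetical selection sketch (the flag and variable names are illustrative only):

    // Hypothetical sketch: choose a string-extraction stream for a file.
    // AbstractFileStringStream: fast, ASCII/Latin-only, brute-force extraction.
    // AbstractFileStringIntStream: script-aware extraction built on StringExtract.
    InputStream stringStream;
    if (latinOnly) { // assumed configuration flag
        stringStream = new AbstractFileStringStream(file, StandardCharsets.UTF_8);
    } else {
        stringStream = new AbstractFileStringIntStream(
                file, enabledScripts, true, true, StandardCharsets.UTF_8); // assumed script list
    }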
@@ -0,0 +1,143 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.logging.Level;
import org.apache.commons.io.IOUtils;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;

/**
 * Extracts text from artifacts by concatenating the values of all of the
 * artifact's attributes.
 */
public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {

    static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());

    /**
     * Get the Content that is the data source for the given artifact.
     * //JMTODO: is there a preexisting method to do this?
     *
     * @param artifact
     *
     * @return The data source for the given artifact as a Content object, or
     *         null if it could not be found.
     *
     * @throws TskCoreException if there is a problem accessing the case db.
     */
    static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {

        Case currentCase;
        try {
            currentCase = Case.getCurrentCase();
        } catch (IllegalStateException ignore) {
            // thrown by Case.getCurrentCase() if currentCase is null
            return null;
        }

        SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
        if (sleuthkitCase == null) {
            return null;
        }
        Content dataSource;
        AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
        if (abstractFile != null) {
            dataSource = abstractFile.getDataSource();
        } else {
            dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
        }

        if (dataSource == null) {
            return null;
        }
        return dataSource;
    }

    @Override
    boolean isDisabled() {
        return false;
    }

    @Override
    InputStream getInputStream(BlackboardArtifact artifact) {
        // Concatenate the string values of all attributes into a single
        // "content" string to be indexed.
        StringBuilder artifactContents = new StringBuilder();

        try {
            Content dataSource = getDataSource(artifact);
            if (dataSource == null) {
                return null;
            }

            for (BlackboardAttribute attribute : artifact.getAttributes()) {
                artifactContents.append(attribute.getAttributeType().getDisplayName());
                artifactContents.append(" : ");
                // We have also discussed modifying BlackboardAttribute.getDisplayString()
                // to magically format datetime attributes but that is complicated by
                // the fact that BlackboardAttribute exists in the Sleuthkit data model
                // while the utility to determine the timezone to use is in ContentUtils
                // in the Autopsy datamodel.
                switch (attribute.getValueType()) {
                    case DATETIME:
                        artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
                        break;
                    default:
                        artifactContents.append(attribute.getDisplayString());
                }
                artifactContents.append(System.lineSeparator());
            }
        } catch (TskCoreException ex) {
            logger.log(Level.SEVERE, "There was a problem getting the attributes for artifact " + artifact.getArtifactID(), ex);
            return null;
        }
        if (artifactContents.length() == 0) {
            return null;
        }
        return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
    }

    @Override
    Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
        return new InputStreamReader(stream, StandardCharsets.UTF_8);
    }

    @Override
    long getID(BlackboardArtifact source) {
        return source.getArtifactID();
    }

    @Override
    String getName(BlackboardArtifact source) {
        return source.getDisplayName() + "_" + source.getArtifactID();
    }
}
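Note: a minimal sketch of exercising the new artifact extractor (the artifact instance is assumed to exist; logging details are omitted):

    // Hypothetical sketch: extract and read the text for one blackboard artifact.
    ArtifactTextExtractor extractor = new ArtifactTextExtractor();
    try {
        InputStream stream = extractor.getInputStream(artifact); // null if there is nothing to index
        if (stream != null) {
            Reader reader = extractor.getReader(stream, artifact);
            // feed reader to the chunking/indexing code, keyed by
            // extractor.getID(artifact) and extractor.getName(artifact)
        }
    } catch (Ingester.IngesterException ex) {
        // log and skip this artifact
    }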
@@ -1,102 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;

import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.datamodel.AbstractContent;

/**
 * Stream of bytes representing a string with specified encoding, to feed into
 * Solr as a ContentStream.
 */
class ByteContentStream implements ContentStream {

    //input
    private byte[] content; //extracted subcontent
    private long contentSize;
    private AbstractContent aContent; //origin
    private Charset charset; //output byte stream charset of encoded strings

    private InputStream stream;

    private static Logger logger = Logger.getLogger(ByteContentStream.class.getName());

    public ByteContentStream(byte[] content, long contentSize, AbstractContent aContent, Charset charset) {
        this.content = content;
        this.contentSize = contentSize; //record the size so getSize() reports it
        this.aContent = aContent;
        this.charset = charset;
        stream = new ByteArrayInputStream(content, 0, (int) contentSize);
    }

    public byte[] getByteContent() {
        return content;
    }

    public AbstractContent getSourceContent() {
        return aContent;
    }

    @Override
    public String getContentType() {
        return "text/plain;charset=" + charset.name(); //NON-NLS
    }

    @Override
    public String getName() {
        return aContent.getName();
    }

    @Override
    public Reader getReader() throws IOException {
        return new InputStreamReader(stream);
    }

    @Override
    public Long getSize() {
        return contentSize;
    }

    @Override
    public String getSourceInfo() {
        return NbBundle.getMessage(this.getClass(), "ByteContentStream.getSrcInfo.text", aContent.getId());
    }

    @Override
    public InputStream getStream() throws IOException {
        return stream;
    }

    @Override
    protected void finalize() throws Throwable {
        super.finalize();

        stream.close();
    }

}
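Note: a quick, hedged illustration of the intended use (the chunk text and source file are assumed; the charset is just an example):

    // Hypothetical sketch: wrap one chunk of encoded text for Solr.
    byte[] encoded = chunkText.getBytes(StandardCharsets.UTF_8); // chunkText assumed
    ByteContentStream bcs = new ByteContentStream(encoded, encoded.length, sourceFile, StandardCharsets.UTF_8);
    // bcs.getContentType() reports "text/plain;charset=UTF-8";
    // bcs.getStream() returns the bytes and bcs.getSize() their length.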
@@ -0,0 +1,112 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.InputStream;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;
import org.sleuthkit.datamodel.AbstractFile;

/**
 * Common methods for utilities that extract text and content and divide it
 * into chunks.
 */
abstract class FileTextExtractor extends TextExtractor<AbstractFile> {

    static final List<String> BLOB_MIME_TYPES
            = Arrays.asList(
                    //ignore binary blob data, for which string extraction will be used
                    "application/octet-stream", //NON-NLS
                    "application/x-msdownload"); //NON-NLS

    /** Generally, text extractors should ignore archives and let unpacking
     * modules take care of them. */
    static final List<String> ARCHIVE_MIME_TYPES
            = Arrays.asList(
                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                    "application/x-7z-compressed", //NON-NLS
                    "application/x-ace-compressed", //NON-NLS
                    "application/x-alz-compressed", //NON-NLS
                    "application/x-arj", //NON-NLS
                    "application/vnd.ms-cab-compressed", //NON-NLS
                    "application/x-cfs-compressed", //NON-NLS
                    "application/x-dgc-compressed", //NON-NLS
                    "application/x-apple-diskimage", //NON-NLS
                    "application/x-gca-compressed", //NON-NLS
                    "application/x-dar", //NON-NLS
                    "application/x-lzx", //NON-NLS
                    "application/x-lzh", //NON-NLS
                    "application/x-rar-compressed", //NON-NLS
                    "application/x-stuffit", //NON-NLS
                    "application/x-stuffitx", //NON-NLS
                    "application/x-gtar", //NON-NLS
                    "application/x-archive", //NON-NLS
                    "application/x-executable", //NON-NLS
                    "application/x-gzip", //NON-NLS
                    "application/zip", //NON-NLS
                    "application/x-zoo", //NON-NLS
                    "application/x-cpio", //NON-NLS
                    "application/x-shar", //NON-NLS
                    "application/x-tar", //NON-NLS
                    "application/x-bzip", //NON-NLS
                    "application/x-bzip2", //NON-NLS
                    "application/x-lzip", //NON-NLS
                    "application/x-lzma", //NON-NLS
                    "application/x-lzop", //NON-NLS
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS

    /**
     * Determines whether the extractor works only for the types specified by
     * supportedTypes(), or whether it is a generic content extractor (such as
     * the string extractor).
     *
     * @return
     */
    abstract boolean isContentTypeSpecific();

    /**
     * Determines if the file content is supported by the extractor if
     * isContentTypeSpecific() returns true.
     *
     * @param file           to test if its content should be supported
     * @param detectedFormat mime-type with detected format (such as
     *                       text/plain) or null if not detected
     *
     * @return true if the file content is supported, false otherwise
     */
    abstract boolean isSupported(AbstractFile file, String detectedFormat);

    @Override
    abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;

    @Override
    long getID(AbstractFile source) {
        return source.getId();
    }

    @Override
    String getName(AbstractFile source) {
        return source.getName();
    }

}
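Note: to show what a concrete subclass must supply, here is a stripped-down, hypothetical extractor (not part of this commit; it assumes java.io.InputStreamReader, java.nio.charset.StandardCharsets, and org.sleuthkit.datamodel.ReadContentInputStream are imported, and the parent TextExtractor may require further overrides not shown here):

    // Hypothetical sketch of a FileTextExtractor subclass for plain text files.
    class PlainTextExtractorSketch extends FileTextExtractor {

        @Override
        boolean isContentTypeSpecific() {
            return true; // only handles the MIME type checked below
        }

        @Override
        boolean isSupported(AbstractFile file, String detectedFormat) {
            return "text/plain".equals(detectedFormat); //NON-NLS
        }

        @Override
        Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException {
            return new InputStreamReader(stream, StandardCharsets.UTF_8);
        }

        @Override
        InputStream getInputStream(AbstractFile file) {
            return new ReadContentInputStream(file);
        }

        @Override
        boolean isDisabled() {
            return false;
        }
    }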
@@ -1,7 +1,7 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012-2013 Basis Technology Corp.
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,36 +21,23 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;

/**
 * Extractor of text from HTML supported AbstractFile content. Extracted text is
 * divided into chunks and indexed with Solr. If HTML extraction succeeds,
 * chunks are indexed with Solr.
 * Extracts text from AbstractFile HTML content.
 */
class HtmlTextExtractor implements TextExtractor {
class HtmlTextExtractor extends FileTextExtractor {

    private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
    private static Ingester ingester;
    static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    static final int MAX_EXTR_TEXT_CHARS = 31 * 1024;
    private static final int SINGLE_READ_CHARS = 1024;
    private static final int EXTRA_CHARS = 128; //for whitespace
    private static final int MAX_SIZE = 50000000;
    //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
    private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
    private AbstractFile sourceFile;
    private int numChunks = 0;
    private static final int MAX_SIZE = 50_000_000; //50MB

    static final List<String> WEB_MIME_TYPES = Arrays.asList(
            "application/javascript", //NON-NLS
@@ -59,170 +46,124 @@ class HtmlTextExtractor implements TextExtractor {
            "text/css", //NON-NLS
            "text/html", //NON-NLS
            "text/javascript" //NON-NLS
            //"application/xml",
            //"application/xml-dtd",
    );

    HtmlTextExtractor() {
        ingester = Ingester.getDefault();
    }

    @Override
    public boolean setScripts(List<SCRIPT> extractScripts) {
        return false;
    }

    @Override
    public List<SCRIPT> getScripts() {
        return null;
    }

    @Override
    public Map<String, String> getOptions() {
        return null;
    }

    @Override
    public void setOptions(Map<String, String> options) {
    }

    @Override
    public int getNumChunks() {
        return numChunks;
    }

    @Override
    public AbstractFile getSourceFile() {
        return sourceFile;
    }

    @Override
    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
        this.sourceFile = sourceFile;
        numChunks = 0; //unknown until indexing is done

        boolean success = false;
        Reader reader = null;

        final InputStream stream = new ReadContentInputStream(sourceFile);

        try {
            // Parse the stream with Jericho
            JerichoParserWrapper jpw = new JerichoParserWrapper(stream);
            jpw.parse();
            reader = jpw.getReader();

            // In case there is an exception or parse() isn't called
            if (reader == null) {
                logger.log(Level.WARNING, "No reader available from HTML parser"); //NON-NLS
                return false;
            }

            success = true;
            long readSize;
            long totalRead = 0;
            boolean eof = false;
            //we read max 1024 chars at time, this seems to max what this Reader would return
            while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
                if (context.fileIngestIsCancelled()) {
                    ingester.ingest(this);
                    return true;
                }
                totalRead += readSize;

                //consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
                while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
                        && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
                    totalRead += readSize;
                }
                if (readSize == -1) {
                    //this is the last chunk
                    eof = true;
                } else {
                    //try to read until whitespace to not break words
                    while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
                            && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
                            && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
                        totalRead += readSize;
                    }
                    if (readSize == -1) {
                        //this is the last chunk
                        eof = true;
                    }
                }

                //logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());
                //encode to bytes to index as byte stream
                String extracted;

                //add BOM and trim the 0 bytes
                //set initial size to chars read + bom - try to prevent from resizing
                StringBuilder sb = new StringBuilder((int) totalRead + 1000);
                //inject BOM here (saves byte buffer realloc later), will be converted to specific encoding BOM
                //sb.append(UTF16BOM); disabled BOM, not needing as bypassing Tika
                if (totalRead < MAX_EXTR_TEXT_CHARS) {
                    sb.append(textChunkBuf, 0, (int) totalRead);
                } else {
                    sb.append(textChunkBuf);
                }

                //reset for next chunk
                totalRead = 0;
                extracted = sb.toString();

                //converts BOM automatically to charSet encoding
                byte[] encodedBytes = extracted.getBytes(outCharset);
                AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
                try {
                    chunk.index(ingester, encodedBytes, encodedBytes.length, outCharset);
                    ++this.numChunks;
                } catch (Ingester.IngesterException ingEx) {
                    success = false;
                    logger.log(Level.WARNING, "Ingester had a problem with extracted HTML from file '" //NON-NLS
                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
                    throw ingEx; //need to rethrow/return to signal error and move on
                }
            }
        } catch (IOException ex) {
            logger.log(Level.WARNING, "Unable to read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
            success = false;
        } catch (Exception ex) {
            logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
            success = false;
        } finally {
            try {
                stream.close();
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content stream from " + sourceFile.getId(), ex); //NON-NLS
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
            }
        }

        //after all chunks, ingest the parent file without content itself, and store numChunks
        ingester.ingest(this);
        return success;
    }

    @Override
    public boolean isContentTypeSpecific() {
    boolean isContentTypeSpecific() {
        return true;
    }

    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        if (detectedFormat == null) {
            return false;
        } else if (WEB_MIME_TYPES.contains(detectedFormat) && file.getSize() <= MAX_SIZE) {
            return true;
        } else {
            return false;
        }
    boolean isSupported(AbstractFile file, String detectedFormat) {
        return detectedFormat != null
                && WEB_MIME_TYPES.contains(detectedFormat)
                && file.getSize() <= MAX_SIZE;
    }

    @Override
    Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
        //Parse the stream with Jericho and put the results in a Reader
        try {
            StringBuilder scripts = new StringBuilder();
            StringBuilder links = new StringBuilder();
            StringBuilder images = new StringBuilder();
            StringBuilder comments = new StringBuilder();
            StringBuilder others = new StringBuilder();
            int numScripts = 0;
            int numLinks = 0;
            int numImages = 0;
            int numComments = 0;
            int numOthers = 0;

            Source source = new Source(in);
            source.fullSequentialParse();
            Renderer renderer = source.getRenderer();
            renderer.setNewLine("\n");
            renderer.setIncludeHyperlinkURLs(false);
            renderer.setDecorateFontStyles(false);
            renderer.setIncludeAlternateText(false);

            String text = renderer.toString();
            // Get all the tags in the source
            List<StartTag> tags = source.getAllStartTags();

            StringBuilder stringBuilder = new StringBuilder();
            for (StartTag tag : tags) {
                if (tag.getName().equals("script")) { //NON-NLS
                    // If the <script> tag has attributes
                    numScripts++;
                    scripts.append(numScripts).append(") ");
                    if (tag.getTagContent().length() > 0) {
                        scripts.append(tag.getTagContent()).append(" ");
                    }
                    // Get what's between the <script> .. </script> tags
                    scripts.append(tag.getElement().getContent()).append("\n");

                } else if (tag.getName().equals("a")) { //NON-NLS
                    numLinks++;
                    links.append(numLinks).append(") ");
                    links.append(tag.getTagContent()).append("\n");

                } else if (tag.getName().equals("img")) { //NON-NLS
                    numImages++;
                    images.append(numImages).append(") ");
                    images.append(tag.getTagContent()).append("\n");

                } else if (tag.getTagType().equals(StartTagType.COMMENT)) {
                    numComments++;
                    comments.append(numComments).append(") ");
                    comments.append(tag.getTagContent()).append("\n");

                } else {
                    // Make sure it has an attribute
                    Attributes atts = tag.getAttributes();
                    if (atts != null && atts.length() > 0) {
                        numOthers++;
                        others.append(numOthers).append(") ");
                        others.append(tag.getName()).append(":");
                        others.append(tag.getTagContent()).append("\n");

                    }
                }
            }
            stringBuilder.append(text).append("\n\n");
            stringBuilder.append("----------NONVISIBLE TEXT----------\n\n"); //NON-NLS
            if (numScripts > 0) {
                stringBuilder.append("---Scripts---\n"); //NON-NLS
                stringBuilder.append(scripts).append("\n");
            }
            if (numLinks > 0) {
                stringBuilder.append("---Links---\n"); //NON-NLS
                stringBuilder.append(links).append("\n");
            }
            if (numImages > 0) {
                stringBuilder.append("---Images---\n"); //NON-NLS
                stringBuilder.append(images).append("\n");
            }
            if (numComments > 0) {
                stringBuilder.append("---Comments---\n"); //NON-NLS
                stringBuilder.append(comments).append("\n");
            }
            if (numOthers > 0) {
                stringBuilder.append("---Others---\n"); //NON-NLS
                stringBuilder.append(others).append("\n");
            }
            // All done, now make it a reader
            return new StringReader(stringBuilder.toString());
        } catch (IOException ex) {
            throw new Ingester.IngesterException("Error extracting HTML from content.", ex);
        }
    }

    @Override
    InputStream getInputStream(AbstractFile sourceFile1) {
        return new ReadContentInputStream(sourceFile1);
    }

    @Override
    boolean isDisabled() {
        return false;
    }
}
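Note: for reference, a hedged sketch of driving the refactored HTML extractor (a no-argument constructor and an already-detected MIME type are assumed):

    // Hypothetical sketch: get a Reader of extracted HTML text for a supported file.
    HtmlTextExtractor htmlExtractor = new HtmlTextExtractor();
    if (htmlExtractor.isSupported(file, mimeType)) { // mimeType assumed detected elsewhere
        try (Reader reader = htmlExtractor.getReader(htmlExtractor.getInputStream(file), file)) {
            // read chunks from reader and hand them to the Ingester
        } catch (Ingester.IngesterException | IOException ex) {
            // log and skip this file
        }
    }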
@@ -18,49 +18,45 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.SolrInputDocument;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.TextUtil;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractContent;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ContentVisitor;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.DerivedFile;
import org.sleuthkit.datamodel.Directory;
import org.sleuthkit.datamodel.File;
import org.sleuthkit.datamodel.LayoutFile;
import org.sleuthkit.datamodel.LocalFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.SlackFile;
import org.sleuthkit.datamodel.SleuthkitItemVisitor;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
import org.sleuthkit.datamodel.TskCoreException;

/**
 * Handles indexing files on a Solr core.
 */
//JMTODO: Should this class really be a singleton?
class Ingester {

    private static final Logger logger = Logger.getLogger(Ingester.class.getName());
    private volatile boolean uncommitedIngests = false;
    private final Server solrServer = KeywordSearch.getServer();
    private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV();
    private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
    private static Ingester instance;

    //for ingesting chunk as SolrInputDocument (non-content-streaming, by-pass tika)
    //TODO use a streaming way to add content to /update handler
    private static final int MAX_DOC_CHUNK_SIZE = 32 * 1024;
    private static final String ENCODING = "UTF-8"; //NON-NLS
    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
    private static final int SINGLE_READ_CHARS = 1024;
    private static final int EXTRA_CHARS = 128;

    private Ingester() {
    }
@@ -72,6 +68,7 @@ class Ingester {
        return instance;
    }

    //JMTODO: this is probably useless
    @Override
    @SuppressWarnings("FinalizeDeclaration")
    protected void finalize() throws Throwable {
@ -84,123 +81,68 @@ class Ingester {
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a stream to Solr to have its content extracted and added to the
|
||||
* index. commit() should be called once you're done ingesting files.
|
||||
* Sends the metadata (name, MAC times, image id, etc) for the given file to
|
||||
* Solr to be added to the index. commit() should be called once you're done
|
||||
* indexing.
|
||||
*
|
||||
* @param afscs File AbstractFileStringContentStream to ingest
|
||||
* @param file File to index.
|
||||
*
|
||||
* @throws IngesterException if there was an error processing a specific
|
||||
* file, but the Solr server is probably fine.
|
||||
*/
|
||||
void ingest(AbstractFileStringContentStream afscs) throws IngesterException {
|
||||
Map<String, String> params = getContentFields(afscs.getSourceContent());
|
||||
ingest(afscs, params, afscs.getSourceContent().getSize());
|
||||
void indexMetaDataOnly(AbstractFile file) throws IngesterException {
|
||||
indexChunk("", file.getName(), getContentFields(file));
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a TextExtractor to Solr to have its content extracted and added to
|
||||
* the index. commit() should be called once you're done ingesting files.
|
||||
* FileExtract represents a parent of extracted file with actual content.
|
||||
* The parent itself has no content, only meta data and is used to associate
|
||||
* the extracted AbstractFileChunk
|
||||
* Sends the metadata (artifact id, image id, etc) for the given artifact to
|
||||
* Solr to be added to the index. commit() should be called once you're done
|
||||
* indexing.
|
||||
*
|
||||
* @param fe TextExtractor to ingest
|
||||
* @param artifact The artifact to index.
|
||||
*
|
||||
* @throws IngesterException if there was an error processing a specific
|
||||
* file, but the Solr server is probably fine.
|
||||
* artifact, but the Solr server is probably fine.
|
||||
*/
|
||||
void ingest(TextExtractor fe) throws IngesterException {
|
||||
Map<String, String> params = getContentFields(fe.getSourceFile());
|
||||
|
||||
params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(fe.getNumChunks()));
|
||||
|
||||
ingest(new NullContentStream(fe.getSourceFile()), params, 0);
|
||||
void indexMetaDataOnly(BlackboardArtifact artifact) throws IngesterException {
|
||||
indexChunk("", new ArtifactTextExtractor().getName(artifact), getContentFields(artifact));
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a AbstractFileChunk to Solr and its extracted content stream to be
|
||||
* added to the index. commit() should be called once you're done ingesting
|
||||
* files. AbstractFileChunk represents a file chunk and its chunk content.
|
||||
* Creates a field map from a SleuthkitVisitableItem, that is later sent to
|
||||
* Solr.
|
||||
*
|
||||
* @param fec AbstractFileChunk to ingest
|
||||
* @param size approx. size of the stream in bytes, used for timeout
|
||||
* estimation
|
||||
* @param item SleuthkitVisitableItem to get fields from
|
||||
*
|
||||
* @throws IngesterException if there was an error processing a specific
|
||||
* file, but the Solr server is probably fine.
|
||||
* @return the map from field name to value (as a string)
|
||||
*/
|
||||
void ingest(AbstractFileChunk fec, ByteContentStream bcs, int size) throws IngesterException {
|
||||
AbstractContent sourceContent = bcs.getSourceContent();
|
||||
Map<String, String> params = getContentFields(sourceContent);
|
||||
|
||||
//overwrite id with the chunk id
|
||||
params.put(Server.Schema.ID.toString(),
|
||||
Server.getChunkIdString(sourceContent.getId(), fec.getChunkNumber()));
|
||||
|
||||
ingest(bcs, params, size);
|
||||
private Map<String, String> getContentFields(SleuthkitVisitableItem item) {
|
||||
return item.accept(SOLR_FIELDS_VISITOR);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a file to Solr to have its content extracted and added to the
|
||||
* index. commit() should be called once you're done ingesting files. If the
|
||||
* file is a directory or ingestContent is set to false, the file name is
|
||||
* indexed only.
|
||||
*
|
||||
* @param file File to ingest
|
||||
* @param ingestContent if true, index the file and the content, otherwise
* index metadata only
|
||||
*
|
||||
* @throws IngesterException if there was an error processing a specific
|
||||
* file, but the Solr server is probably fine.
|
||||
* Visitor used to create fields to send to SOLR index.
|
||||
*/
|
||||
void ingest(AbstractFile file, boolean ingestContent) throws IngesterException {
|
||||
if (ingestContent == false || file.isDir()) {
|
||||
ingest(new NullContentStream(file), getContentFields(file), 0);
|
||||
} else {
|
||||
ingest(new FscContentStream(file), getContentFields(file), file.getSize());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a field map from FsContent, that is later sent to Solr
|
||||
*
|
||||
* @param fsc FsContent to get fields from
|
||||
*
|
||||
* @return the map
|
||||
*/
|
||||
private Map<String, String> getContentFields(AbstractContent fsc) {
|
||||
return fsc.accept(getContentFieldsV);
|
||||
}
|
||||
|
||||
/**
|
||||
* Visitor used to create param list to send to SOLR index.
|
||||
*/
|
||||
private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
|
||||
static private class SolrFieldsVisitor extends SleuthkitItemVisitor.Default<Map<String, String>> {
|
||||
|
||||
@Override
|
||||
protected Map<String, String> defaultVisit(Content cntnt) {
|
||||
protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
|
||||
return new HashMap<>();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> visit(File f) {
|
||||
Map<String, String> params = getCommonFields(f);
|
||||
getCommonFileContentFields(params, f);
|
||||
return params;
|
||||
return getCommonAndMACTimeFields(f);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> visit(DerivedFile df) {
|
||||
Map<String, String> params = getCommonFields(df);
|
||||
getCommonFileContentFields(params, df);
|
||||
return params;
|
||||
return getCommonAndMACTimeFields(df);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> visit(Directory d) {
|
||||
Map<String, String> params = getCommonFields(d);
|
||||
getCommonFileContentFields(params, d);
|
||||
return params;
|
||||
return getCommonAndMACTimeFields(d);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -211,19 +153,25 @@ class Ingester {
|
||||
|
||||
@Override
|
||||
public Map<String, String> visit(LocalFile lf) {
|
||||
Map<String, String> params = getCommonFields(lf);
|
||||
getCommonFileContentFields(params, lf);
|
||||
return params;
|
||||
return getCommonAndMACTimeFields(lf);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> visit(SlackFile f) {
|
||||
Map<String, String> params = getCommonFields(f);
|
||||
getCommonFileContentFields(params, f);
|
||||
return params;
|
||||
return getCommonAndMACTimeFields(f);
|
||||
}
|
||||
|
||||
private Map<String, String> getCommonFileContentFields(Map<String, String> params, AbstractFile file) {
|
||||
/**
|
||||
* Get the field map for AbstractFiles that includes MAC times and the
|
||||
* fields that are common to all file classes.
|
||||
*
|
||||
* @param file The file to get fields for
|
||||
*
|
||||
* @return The field map, including MAC times and common fields, for the
|
||||
* given file.
|
||||
*/
|
||||
private Map<String, String> getCommonAndMACTimeFields(AbstractFile file) {
|
||||
Map<String, String> params = getCommonFields(file);
|
||||
params.put(Server.Schema.CTIME.toString(), ContentUtils.getStringTimeISO8601(file.getCtime(), file));
|
||||
params.put(Server.Schema.ATIME.toString(), ContentUtils.getStringTimeISO8601(file.getAtime(), file));
|
||||
params.put(Server.Schema.MTIME.toString(), ContentUtils.getStringTimeISO8601(file.getMtime(), file));
|
||||
@ -231,140 +179,219 @@ class Ingester {
|
||||
return params;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the field map for AbstractFiles that is common to all file
|
||||
* classes
|
||||
*
|
||||
* @param file The file to get fields for
|
||||
*
|
||||
* @return The field map of fields that are common to all file classes.
|
||||
*/
|
||||
private Map<String, String> getCommonFields(AbstractFile af) {
|
||||
Map<String, String> params = new HashMap<>();
|
||||
params.put(Server.Schema.ID.toString(), Long.toString(af.getId()));
|
||||
try {
|
||||
long dataSourceId = af.getDataSource().getId();
|
||||
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSourceId));
|
||||
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(af.getDataSource().getId()));
|
||||
} catch (TskCoreException ex) {
|
||||
logger.log(Level.SEVERE, "Could not get data source id to properly index the file {0}", af.getId()); //NON-NLS
|
||||
logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + af.getId(), ex); //NON-NLS
|
||||
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
|
||||
}
|
||||
|
||||
params.put(Server.Schema.FILE_NAME.toString(), af.getName());
|
||||
return params;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the field map for artifacts.
|
||||
*
|
||||
* @param artifact The artifact to get fields for.
|
||||
*
|
||||
* @return The field map for the given artifact.
|
||||
*/
|
||||
@Override
|
||||
public Map<String, String> visit(BlackboardArtifact artifact) {
|
||||
Map<String, String> params = new HashMap<>();
|
||||
params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
|
||||
try {
|
||||
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(ArtifactTextExtractor.getDataSource(artifact).getId()));
|
||||
} catch (TskCoreException ex) {
|
||||
logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
|
||||
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
|
||||
}
|
||||
return params;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Indexing method that bypasses Tika and assumes pure text. It reads and
|
||||
* converts the entire content stream to string, assuming UTF8 since we
|
||||
* can't use streaming approach for Solr /update handler. This should be
|
||||
* safe, since all content is now in max 1MB chunks.
|
||||
* Use the given TextExtractor to extract text from the given source. The
|
||||
* text will be chunked and each chunk passed to Solr to add to the index.
|
||||
*
|
||||
*
|
||||
* @param <A> The type of the Appendix provider that provides
|
||||
* additional text to append to the final chunk.
|
||||
* @param <T> A subclass of SleuthkitVisitableItem.
|
||||
* @param extractor The TextExtractor that will be used to extract text from
|
||||
* the given source.
|
||||
* @param source The source from which text will be extracted, chunked,
|
||||
* and indexed.
|
||||
* @param context The ingest job context that can be used to cancel this
|
||||
* process.
|
||||
*
|
||||
* @return True if this method executed normally, or false if there was an
|
||||
* unexpected exception. //JMTODO: This policy needs to be reviewed.
|
||||
*
|
||||
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
|
||||
*/
|
||||
< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
|
||||
final long sourceID = extractor.getID(source);
|
||||
final String sourceName = extractor.getName(source);
|
||||
|
||||
int numChunks = 0; //unknown until chunking is done
|
||||
|
||||
if (extractor.isDisabled()) {
|
||||
/* some extractors, notably the strings extractor, have options which
|
||||
* can be configured such that no extraction should be done */
|
||||
return true;
|
||||
}
|
||||
|
||||
Map<String, String> fields = getContentFields(source);
|
||||
//Get a stream and a reader for that stream
|
||||
try (final InputStream stream = extractor.getInputStream(source);
|
||||
Reader reader = extractor.getReader(stream, source);) {
|
||||
|
||||
//we read at most 1024 chars at a time, which seems to be the most some Readers will return
|
||||
char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
|
||||
|
||||
boolean eof = false; //have we read until the end of the file yet
|
||||
while (!eof) {
|
||||
int chunkSizeInChars = 0; // the size in chars of the chunk (so far)
|
||||
if (context != null && context.fileIngestIsCancelled()) {
|
||||
return true;
|
||||
}
|
||||
long charsRead = 0; // number of chars read in the most recent read operation
|
||||
//consume bytes to fill entire chunk (but leave EXTRA_CHARS to end the word)
|
||||
while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
|
||||
&& (charsRead = reader.read(textChunkBuf, chunkSizeInChars, SINGLE_READ_CHARS)) != -1) {
|
||||
chunkSizeInChars += charsRead;
|
||||
}
|
||||
|
||||
if (charsRead == -1) {
|
||||
//this is the last chunk
|
||||
eof = true;
|
||||
} else {
|
||||
chunkSizeInChars += charsRead;
|
||||
|
||||
//if we haven't reached the end of the file,
|
||||
//try to read char-by-char until whitespace to not break words
|
||||
while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - 1)
|
||||
&& (Character.isWhitespace(textChunkBuf[chunkSizeInChars - 1]) == false)
|
||||
&& (charsRead = reader.read(textChunkBuf, chunkSizeInChars, 1)) != -1) {
|
||||
chunkSizeInChars += charsRead;
|
||||
}
|
||||
if (charsRead == -1) {
|
||||
//this is the last chunk
|
||||
eof = true;
|
||||
}
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(chunkSizeInChars)
|
||||
.append(textChunkBuf, 0, chunkSizeInChars);
|
||||
|
||||
sanitizeToUTF8(sb); //replace non UTF8 chars with '^'
|
||||
|
||||
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
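//e.g. for a source with object id 42, the third chunk gets the Solr document id "42_3"
//(assuming Server.getChunkIdString() joins object id and chunk number with an
//underscore, as the artifact indexing code later in this diff does by hand)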
|
||||
fields.put(Server.Schema.ID.toString(), chunkId);
|
||||
try {
|
||||
//pass the chunk to method that adds it to Solr index
|
||||
indexChunk(sb.toString(), sourceName, fields);
|
||||
numChunks++;
|
||||
} catch (Ingester.IngesterException ingEx) {
|
||||
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
|
||||
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
|
||||
|
||||
throw ingEx; //need to rethrow to signal error and move on
|
||||
} catch (Exception ex) {
|
||||
throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
|
||||
}
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
||||
return false;
|
||||
} catch (Exception ex) {
|
||||
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
||||
return false;
|
||||
} finally {
|
||||
//after all chunks, index just the meta data, including the numChunks, of the parent file
|
||||
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
|
||||
fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
|
||||
indexChunk(null, sourceName, fields);
|
||||
}
|
||||
return true;
|
||||
}
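// Usage sketch: a caller typically picks an extractor that supports the file and hands
// it to this method together with the ingest job context, e.g. as
// KeywordSearchIngestModule.extractTextAndIndex() does later in this diff:
//   Ingester.getDefault().indexText(extractor, aFile, context);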
|
||||
|
||||
/**
* Sanitize the given StringBuilder by replacing non-UTF-8 characters with
* caret '^'
*
* @param sb the StringBuilder to sanitize
*
* //JMTODO: use CharSequence.chars() or codePoints() and then a mapping
* function?
*/
private static void sanitizeToUTF8(StringBuilder sb) {
final int length = sb.length();

// Sanitize by replacing non-UTF-8 characters with caret '^'
for (int i = 0; i < length; i++) {
if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
sb.replace(i, i + 1, "^");
}
}
}
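// Sketch of the JMTODO alternative above (illustrative only, not part of the change;
// it relies on the same TextUtil.isValidSolrUTF8(char) check used in the loop):
//   String sanitized = sb.chars()
//           .map(c -> TextUtil.isValidSolrUTF8((char) c) ? c : '^')
//           .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
//           .toString();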
|
||||
|
||||
/**
|
||||
* Add one chunk to the Solr index as a separate Solr document.
|
||||
*
|
||||
* TODO see if we can use a byte or string streaming way to add content to the
* /update handler, e.g. with XMLUpdateRequestHandler (deprecated in Solr
* 4.0.0); see if it is possible to stream with UpdateRequestHandler
|
||||
*
|
||||
* @param cs
|
||||
* @param chunk The chunk content as a string
|
||||
* @param fields
|
||||
* @param size
|
||||
*
|
||||
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
|
||||
*/
|
||||
void ingest(ContentStream cs, Map<String, String> fields, final long size) throws IngesterException {
|
||||
private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
|
||||
if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
|
||||
//JMTODO: actually, if we couldn't get the image id it is set to -1,
|
||||
// but does this really mean we don't want to index it?
|
||||
|
||||
//skip the file, image id unknown
|
||||
String msg = NbBundle.getMessage(this.getClass(),
|
||||
"Ingester.ingest.exception.unknownImgId.msg", cs.getName());
|
||||
//JMTODO: does this need to be internationalized?
|
||||
String msg = NbBundle.getMessage(Ingester.class,
|
||||
"Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
|
||||
logger.log(Level.SEVERE, msg);
|
||||
throw new IngesterException(msg);
|
||||
}
|
||||
|
||||
final byte[] docChunkContentBuf = new byte[MAX_DOC_CHUNK_SIZE];
|
||||
//Make a SolrInputDocument out of the field map
|
||||
SolrInputDocument updateDoc = new SolrInputDocument();
|
||||
|
||||
for (String key : fields.keySet()) {
|
||||
updateDoc.addField(key, fields.get(key));
|
||||
}
|
||||
|
||||
//using size here, but we are no longer ingesting entire files
|
||||
//size is normally a chunk size, up to 1MB
|
||||
if (size > 0) {
|
||||
// TODO (RC): Use try with resources, adjust exception messages
|
||||
InputStream is = null;
|
||||
int read = 0;
|
||||
try {
|
||||
is = cs.getStream();
|
||||
read = is.read(docChunkContentBuf);
|
||||
} catch (IOException ex) {
|
||||
throw new IngesterException(
|
||||
NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.cantReadStream.msg",
|
||||
cs.getName()));
|
||||
} finally {
|
||||
if (null != is) {
|
||||
try {
|
||||
is.close();
|
||||
} catch (IOException ex) {
|
||||
logger.log(Level.WARNING, "Could not close input stream after reading content, " + cs.getName(), ex); //NON-NLS
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (read != 0) {
|
||||
String s = "";
|
||||
try {
|
||||
s = new String(docChunkContentBuf, 0, read, ENCODING);
|
||||
// Sanitize by replacing non-UTF-8 characters with caret '^' before adding to index
|
||||
char[] chars = null;
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
if (!TextUtil.isValidSolrUTF8(s.charAt(i))) {
|
||||
// only convert string to char[] if there is a non-UTF8 character
|
||||
if (chars == null) {
|
||||
chars = s.toCharArray();
|
||||
}
|
||||
chars[i] = '^';
|
||||
}
|
||||
}
|
||||
// check if the string was modified (i.e. there was a non-UTF8 character found)
|
||||
if (chars != null) {
|
||||
s = new String(chars);
|
||||
}
|
||||
} catch (UnsupportedEncodingException ex) {
|
||||
logger.log(Level.SEVERE, "Unsupported encoding", ex); //NON-NLS
|
||||
}
|
||||
updateDoc.addField(Server.Schema.CONTENT.toString(), s);
|
||||
} else {
|
||||
updateDoc.addField(Server.Schema.CONTENT.toString(), "");
|
||||
}
|
||||
} else {
|
||||
//no content, such as case when 0th chunk indexed
|
||||
updateDoc.addField(Server.Schema.CONTENT.toString(), "");
|
||||
}
|
||||
//add the content to the SolrInputDocument
|
||||
//JMTODO: can we just add it to the field map before passing that in?
|
||||
updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
|
||||
|
||||
try {
|
||||
//TODO consider timeout thread, or vary socket timeout based on size of indexed content
|
||||
//TODO: consider timeout thread, or vary socket timeout based on size of indexed content
|
||||
solrServer.addDocument(updateDoc);
|
||||
uncommitedIngests = true;
|
||||
} catch (KeywordSearchModuleException ex) {
|
||||
//JMTODO: does this need to be internationalized?
|
||||
throw new IngesterException(
|
||||
NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.err.msg", cs.getName()), ex);
|
||||
NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
* Return the timeout that should be used to index the content.
*
* @param size size of the content
*
* @return time in seconds to use as a timeout
*/
static int getTimeout(long size) {
if (size < 1024 * 1024L) //1MB
{
return 60;
} else if (size < 10 * 1024 * 1024L) //10MB
{
return 1200;
} else if (size < 100 * 1024 * 1024L) //100MB
{
return 3600;
} else {
return 3 * 3600;
}

}
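// Worked example: a 512 KB chunk falls in the first bucket and gets a 60 second
// timeout, a 5 MB stream gets 1200 seconds, a 50 MB stream gets 3600 seconds, and
// anything 100 MB or larger gets 3 * 3600 = 10800 seconds.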
|
||||
|
||||
/**
|
||||
@ -380,92 +407,6 @@ class Ingester {
|
||||
}
|
||||
}
|
||||
|
||||
/**
* ContentStream to read() the data from a FsContent object
*/
private static class FscContentStream implements ContentStream {

private AbstractFile f;

FscContentStream(AbstractFile f) {
this.f = f;
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return f.getName();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSourceInfo() {
|
||||
return NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getSrcInfo", f.getId());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getContentType() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long getSize() {
|
||||
return f.getSize();
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputStream getStream() throws IOException {
|
||||
return new ReadContentInputStream(f);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader getReader() throws IOException {
|
||||
throw new UnsupportedOperationException(
|
||||
NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getReader"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
* ContentStream associated with FsContent, but forced with no content
*/
private static class NullContentStream implements ContentStream {

AbstractContent aContent;

NullContentStream(AbstractContent aContent) {
this.aContent = aContent;
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return aContent.getName();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSourceInfo() {
|
||||
return NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getSrcInfo.text", aContent.getId());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getContentType() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long getSize() {
|
||||
return 0L;
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputStream getStream() throws IOException {
|
||||
return new ByteArrayInputStream(new byte[0]);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader getReader() throws IOException {
|
||||
throw new UnsupportedOperationException(
|
||||
NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getReader"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates that there was an error with the specific ingest operation, but
|
||||
* it's still okay to continue ingesting files.
|
||||
|
@ -103,12 +103,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
|
||||
|
||||
private void reloadScriptsCheckBoxes() {
|
||||
boolean utf16
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
|
||||
enableUTF16Checkbox.setSelected(utf16);
|
||||
|
||||
boolean utf8
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
enableUTF8Checkbox.setSelected(utf8);
|
||||
|
||||
final List<SCRIPT> serviceScripts = KeywordSearchSettings.getStringExtractScripts();
|
||||
@ -127,12 +127,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
|
||||
reloadScriptsCheckBoxes();
|
||||
|
||||
boolean utf16
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
|
||||
enableUTF16Checkbox.setSelected(utf16);
|
||||
|
||||
boolean utf8
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
enableUTF8Checkbox.setSelected(utf8);
|
||||
final boolean extractEnabled = utf16 || utf8;
|
||||
|
||||
@ -257,9 +257,9 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
|
||||
|
||||
@Override
|
||||
public void store() {
|
||||
KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
|
||||
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
|
||||
Boolean.toString(enableUTF8Checkbox.isSelected()));
|
||||
KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
|
||||
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
|
||||
Boolean.toString(enableUTF16Checkbox.isSelected()));
|
||||
|
||||
if (toUpdate != null) {
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2011-2015 Basis Technology Corp.
|
||||
* Copyright 2011-2016 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -89,7 +89,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
//accessed read-only by searcher thread
|
||||
|
||||
private boolean startedSearching = false;
|
||||
private List<TextExtractor> textExtractors;
|
||||
private List<FileTextExtractor> textExtractors;
|
||||
private StringsTextExtractor stringExtractor;
|
||||
private final KeywordSearchJobSettings settings;
|
||||
private boolean initialized = false;
|
||||
@ -415,24 +415,24 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
* @throws IngesterException exception thrown if indexing failed
|
||||
*/
|
||||
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
|
||||
TextExtractor fileExtract = null;
|
||||
FileTextExtractor extractor = null;
|
||||
|
||||
//go over available text extractors in order, and pick the first one (most specific one)
|
||||
for (TextExtractor fe : textExtractors) {
|
||||
for (FileTextExtractor fe : textExtractors) {
|
||||
if (fe.isSupported(aFile, detectedFormat)) {
|
||||
fileExtract = fe;
|
||||
extractor = fe;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (fileExtract == null) {
|
||||
if (extractor == null) {
|
||||
logger.log(Level.INFO, "No text extractor found for file id:{0}, name: {1}, detected format: {2}", new Object[]{aFile.getId(), aFile.getName(), detectedFormat}); //NON-NLS
|
||||
return false;
|
||||
}
|
||||
|
||||
//logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
|
||||
//divide into chunks and index
|
||||
return fileExtract.index(aFile, context);
|
||||
return Ingester.getDefault().indexText(extractor, aFile, context);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -448,7 +448,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
if (context.fileIngestIsCancelled()) {
|
||||
return true;
|
||||
}
|
||||
if (stringExtractor.index(aFile, KeywordSearchIngestModule.this.context)) {
|
||||
if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) {
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
|
||||
return true;
|
||||
} else {
|
||||
@ -463,26 +463,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check with every extractor if it supports the file with the detected
|
||||
* format
|
||||
*
|
||||
* @param aFile file to check for
|
||||
* @param detectedFormat mime-type with detected format (such as
|
||||
* text/plain) or null if not detected
|
||||
*
|
||||
* @return true if text extraction is supported
|
||||
*/
|
||||
private boolean isTextExtractSupported(AbstractFile aFile, String detectedFormat) {
|
||||
for (TextExtractor extractor : textExtractors) {
|
||||
if (extractor.isContentTypeSpecific() == true
|
||||
&& extractor.isSupported(aFile, detectedFormat)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the file to the index. Detects file type, calls extractors, etc.
|
||||
*
|
||||
@ -512,7 +492,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
if (context.fileIngestIsCancelled()) {
|
||||
return;
|
||||
}
|
||||
ingester.ingest(aFile, false); //meta-data only
|
||||
ingester.indexMetaDataOnly(aFile);
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
|
||||
} catch (IngesterException ex) {
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
|
||||
@ -534,12 +514,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
|
||||
// we skip archive formats that are opened by the archive module.
|
||||
// @@@ We could have a check here to see if the archive module was enabled though...
|
||||
if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
|
||||
if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
|
||||
try {
|
||||
if (context.fileIngestIsCancelled()) {
|
||||
return;
|
||||
}
|
||||
ingester.ingest(aFile, false); //meta-data only
|
||||
ingester.indexMetaDataOnly(aFile);
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
|
||||
} catch (IngesterException ex) {
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
|
||||
|
@ -101,8 +101,8 @@ public final class KeywordSearchJobSettingsPanel extends IngestModuleIngestJobSe
|
||||
}
|
||||
|
||||
private void displayEncodings() {
|
||||
String utf8 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
|
||||
String utf16 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
|
||||
String utf8 = KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
|
||||
String utf16 = KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
|
||||
ArrayList<String> encodingsList = new ArrayList<>();
|
||||
if (utf8 == null || Boolean.parseBoolean(utf8)) {
|
||||
encodingsList.add("UTF8");
|
||||
|
@ -23,7 +23,6 @@ import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.logging.Level;
|
||||
|
||||
import org.openide.util.NbBundle;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
import org.sleuthkit.autopsy.coreutils.ModuleSettings;
|
||||
@ -211,14 +210,14 @@ class KeywordSearchSettings {
|
||||
KeywordSearchSettings.setUpdateFrequency(UpdateFrequency.DEFAULT);
|
||||
}
|
||||
//setting default Extract UTF8
|
||||
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
|
||||
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
|
||||
logger.log(Level.INFO, "No configuration for UTF8 found, generating default..."); //NON-NLS
|
||||
KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
|
||||
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
|
||||
}
|
||||
//setting default Extract UTF16
|
||||
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
|
||||
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
|
||||
logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS
|
||||
KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
|
||||
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
|
||||
}
|
||||
//setting default Latin-1 Script
|
||||
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_SCRIPTS, SCRIPT.LATIN_1.name())) {
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2015 Basis Technology Corp.
|
||||
* Copyright 2011-2016 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -19,24 +19,16 @@
|
||||
package org.sleuthkit.autopsy.keywordsearch;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.impl.HttpSolrClient;
|
||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||
import org.sleuthkit.datamodel.BlackboardAttribute;
|
||||
import org.sleuthkit.datamodel.TskCoreException;
|
||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
||||
import org.apache.solr.common.util.ContentStreamBase.StringStream;
|
||||
import org.openide.util.lookup.ServiceProvider;
|
||||
import org.sleuthkit.autopsy.casemodule.Case;
|
||||
import org.sleuthkit.autopsy.datamodel.ContentUtils;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.Content;
|
||||
import org.sleuthkit.datamodel.SleuthkitCase;
|
||||
import org.openide.util.NbBundle;
|
||||
import java.net.InetAddress;
|
||||
import java.util.MissingResourceException;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.impl.HttpSolrClient;
|
||||
import org.openide.util.NbBundle;
|
||||
import org.openide.util.lookup.ServiceProvider;
|
||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||
import org.sleuthkit.datamodel.TskCoreException;
|
||||
|
||||
/**
|
||||
* An implementation of the KeywordSearchService interface that uses Solr for
|
||||
@ -49,6 +41,8 @@ public class SolrSearchService implements KeywordSearchService {
|
||||
private static final String SERVER_REFUSED_CONNECTION = "server refused connection"; //NON-NLS
|
||||
private static final int IS_REACHABLE_TIMEOUT_MS = 1000;
|
||||
|
||||
ArtifactTextExtractor extractor = new ArtifactTextExtractor();
|
||||
|
||||
@Override
|
||||
public void indexArtifact(BlackboardArtifact artifact) throws TskCoreException {
|
||||
if (artifact == null) {
|
||||
@ -57,109 +51,14 @@ public class SolrSearchService implements KeywordSearchService {
|
||||
|
||||
// We only support artifact indexing for Autopsy versions that use
|
||||
// the negative range for artifact ids.
|
||||
long artifactId = artifact.getArtifactID();
|
||||
|
||||
if (artifactId > 0) {
|
||||
if (artifact.getArtifactID() > 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
Case currentCase;
|
||||
try {
|
||||
currentCase = Case.getCurrentCase();
|
||||
} catch (IllegalStateException ignore) {
|
||||
// thrown by Case.getCurrentCase() if currentCase is null
|
||||
return;
|
||||
}
|
||||
|
||||
SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
|
||||
if (sleuthkitCase == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
Content dataSource;
|
||||
AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
|
||||
if (abstractFile != null) {
|
||||
dataSource = abstractFile.getDataSource();
|
||||
} else {
|
||||
dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
|
||||
}
|
||||
|
||||
if (dataSource == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Concatenate the string values of all attributes into a single
|
||||
// "content" string to be indexed.
|
||||
StringBuilder artifactContents = new StringBuilder();
|
||||
|
||||
for (BlackboardAttribute attribute : artifact.getAttributes()) {
|
||||
artifactContents.append(attribute.getAttributeType().getDisplayName());
|
||||
artifactContents.append(" : ");
|
||||
|
||||
// This is ugly since it will need to updated any time a new
|
||||
// TSK_DATETIME_* attribute is added. A slightly less ugly
|
||||
// alternative would be to assume that all date time attributes
|
||||
// will have a name of the form "TSK_DATETIME*" and check
|
||||
// attribute.getAttributeTypeName().startsWith("TSK_DATETIME*".
|
||||
// The major problem with that approach is that it would require
|
||||
// a round trip to the database to get the type name string.
|
||||
// We have also discussed modifying BlackboardAttribute.getDisplayString()
|
||||
// to magically format datetime attributes but that is complicated by
|
||||
// the fact that BlackboardAttribute exists in Sleuthkit data model
|
||||
// while the utility to determine the timezone to use is in ContentUtils
|
||||
// in the Autopsy datamodel.
|
||||
if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_RCVD.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_SENT.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_START.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_END.getTypeID()) {
|
||||
|
||||
artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
|
||||
} else {
|
||||
artifactContents.append(attribute.getDisplayString());
|
||||
}
|
||||
artifactContents.append(System.lineSeparator());
|
||||
}
|
||||
|
||||
if (artifactContents.length() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// To play by the rules of the existing text markup implementations,
|
||||
// we need to (a) index the artifact contents in a "chunk" and
|
||||
// (b) create a separate index entry for the base artifact.
|
||||
// We distinguish artifact content from file content by applying a
|
||||
// mask to the artifact id to make its value > 0x8000000000000000 (i.e. negative).
|
||||
// First, create an index entry for the base artifact.
|
||||
HashMap<String, String> solrFields = new HashMap<>();
|
||||
String documentId = Long.toString(artifactId);
|
||||
|
||||
solrFields.put(Server.Schema.ID.toString(), documentId);
|
||||
|
||||
// Set the IMAGE_ID field.
|
||||
solrFields.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
|
||||
final Ingester ingester = Ingester.getDefault();
|
||||
|
||||
try {
|
||||
Ingester.getDefault().ingest(new StringStream(""), solrFields, 0);
|
||||
} catch (Ingester.IngesterException ex) {
|
||||
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
||||
}
|
||||
|
||||
// Next create the index entry for the document content.
|
||||
// The content gets added to a single chunk. We may need to add chunking
|
||||
// support later.
|
||||
long chunkId = 1;
|
||||
|
||||
documentId += "_" + Long.toString(chunkId);
|
||||
solrFields.replace(Server.Schema.ID.toString(), documentId);
|
||||
|
||||
StringStream contentStream = new StringStream(artifactContents.toString());
|
||||
|
||||
try {
|
||||
Ingester.getDefault().ingest(contentStream, solrFields, contentStream.getSize());
|
||||
ingester.indexMetaDataOnly(artifact);
|
||||
ingester.indexText(extractor, artifact, null);
|
||||
} catch (Ingester.IngesterException ex) {
|
||||
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2011-2014 Basis Technology Corp.
|
||||
* Copyright 2011-2016 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -20,155 +20,106 @@ package org.sleuthkit.autopsy.keywordsearch;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.Charset;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.logging.Level;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
||||
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
||||
import org.sleuthkit.autopsy.ingest.IngestJobContext;
|
||||
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.TskCoreException;
|
||||
import org.sleuthkit.datamodel.TskException;
|
||||
|
||||
/**
|
||||
* Takes an AbstractFile, extract strings, converts into chunks (associated with
|
||||
* the original source file) up to 1MB and then indexes the chunks as text with Solr
|
||||
* Extracts raw strings from AbstractFile content.
|
||||
*/
|
||||
class StringsTextExtractor implements TextExtractor {
|
||||
class StringsTextExtractor extends FileTextExtractor {
|
||||
|
||||
private static Ingester ingester;
|
||||
private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
|
||||
private static final long MAX_STRING_CHUNK_SIZE = 1 * 31 * 1024L;
|
||||
//private static final int BOM_LEN = 3;
|
||||
private static final int BOM_LEN = 0; //disabled prepending of BOM
|
||||
private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
|
||||
private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
|
||||
private AbstractFile sourceFile;
|
||||
private int numChunks = 0;
|
||||
/**
|
||||
* Options for this extractor
|
||||
*/
|
||||
enum ExtractOptions {
|
||||
EXTRACT_UTF16, ///< extract UTF16 text, true/false
|
||||
EXTRACT_UTF8, ///< extract UTF8 text, true/false
|
||||
};
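// Usage note: these option names are persisted as plain strings; the settings code
// elsewhere in this diff reads them back with, e.g.,
//   KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())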
|
||||
|
||||
private final List<SCRIPT> extractScripts = new ArrayList<>();
|
||||
private Map<String, String> extractOptions = new HashMap<>();
|
||||
|
||||
//disabled prepending of BOM
|
||||
//static {
|
||||
//prepend UTF-8 BOM to start of the buffer
|
||||
//stringChunkBuf[0] = (byte) 0xEF;
|
||||
//stringChunkBuf[1] = (byte) 0xBB;
|
||||
//stringChunkBuf[2] = (byte) 0xBF;
|
||||
//}
|
||||
public StringsTextExtractor() {
|
||||
ingester = Ingester.getDefault();
|
||||
extractScripts.add(DEFAULT_SCRIPT);
|
||||
//LATIN_2 is the default script
|
||||
extractScripts.add(SCRIPT.LATIN_2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean setScripts(List<SCRIPT> extractScripts) {
|
||||
/**
|
||||
* Sets the scripts to use for the extraction
|
||||
*
|
||||
* @param extractScripts scripts to use
|
||||
*/
|
||||
public void setScripts(List<SCRIPT> extractScripts) {
|
||||
this.extractScripts.clear();
|
||||
this.extractScripts.addAll(extractScripts);
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
/**
|
||||
* Get the currently used scripts for extraction
|
||||
*
|
||||
* @return scripts currently used or null if not supported
|
||||
*/
|
||||
public List<SCRIPT> getScripts() {
|
||||
return new ArrayList<>(extractScripts);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getNumChunks() {
|
||||
return this.numChunks;
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractFile getSourceFile() {
|
||||
return sourceFile;
|
||||
}
|
||||
|
||||
@Override
|
||||
/**
|
||||
* Get current options
|
||||
*
|
||||
* @return currently used, extractor-specific options, or null if not
|
||||
* supported
|
||||
*/
|
||||
public Map<String, String> getOptions() {
|
||||
return extractOptions;
|
||||
}
|
||||
|
||||
@Override
|
||||
/**
|
||||
* Set extractor specific options
|
||||
*
|
||||
* @param options options to use
|
||||
*/
|
||||
public void setOptions(Map<String, String> options) {
|
||||
this.extractOptions = options;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
|
||||
this.sourceFile = sourceFile;
|
||||
this.numChunks = 0; //unknown until indexing is done
|
||||
boolean success = false;
|
||||
boolean isDisabled() {
|
||||
boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
|
||||
final boolean extractUTF8
|
||||
= Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
return extractUTF8 == false && extractUTF16 == false;
|
||||
}
|
||||
|
||||
final boolean extractUTF16
|
||||
= Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
@Override
|
||||
InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
|
||||
return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
|
||||
}
|
||||
|
||||
if (extractUTF8 == false && extractUTF16 == false) {
|
||||
//nothing to do
|
||||
return true;
|
||||
}
|
||||
|
||||
InputStream stringStream;
|
||||
@Override
|
||||
InputStream getInputStream(AbstractFile sourceFile) {
|
||||
//check which extract stream to use
|
||||
if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
|
||||
//optimal for english, english only
|
||||
stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
|
||||
return new EnglishOnlyStream(sourceFile);//optimal for english, english only
|
||||
} else {
|
||||
stringStream = new AbstractFileStringIntStream(
|
||||
sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
|
||||
boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
|
||||
return new InternationalStream(sourceFile, extractScripts, extractUTF8, extractUTF16);
|
||||
}
|
||||
|
||||
try {
|
||||
success = true;
|
||||
//break input stream into chunks
|
||||
|
||||
final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
|
||||
long readSize;
|
||||
while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
|
||||
if (context.fileIngestIsCancelled()) {
|
||||
ingester.ingest(this);
|
||||
return true;
|
||||
}
|
||||
//FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
|
||||
//debug.write(stringChunkBuf, 0, (int)readSize);
|
||||
|
||||
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
|
||||
|
||||
try {
|
||||
chunk.index(ingester, stringChunkBuf, readSize + BOM_LEN, INDEX_CHARSET);
|
||||
++this.numChunks;
|
||||
} catch (IngesterException ingEx) {
|
||||
success = false;
|
||||
logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
|
||||
throw ingEx; //need to rethrow/return to signal error and move on
|
||||
}
|
||||
|
||||
//debug.close();
|
||||
}
|
||||
|
||||
//after all chunks, ingest the parent file without content itself, and store numChunks
|
||||
ingester.ingest(this);
|
||||
|
||||
} catch (IOException ex) {
|
||||
logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex); //NON-NLS
|
||||
success = false;
|
||||
} finally {
|
||||
try {
|
||||
stringStream.close();
|
||||
} catch (IOException ex) {
|
||||
logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex); //NON-NLS
|
||||
}
|
||||
}
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isContentTypeSpecific() {
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -176,4 +127,379 @@ class StringsTextExtractor implements TextExtractor {
|
||||
// strings can be run on anything.
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* AbstractFile input string stream reader/converter - given AbstractFile,
|
||||
* extract strings from it and return encoded bytes via read()
|
||||
*
|
||||
* Note: the utility supports extraction of only LATIN script and UTF8,
|
||||
* UTF16LE, UTF16BE encodings and uses a brute force encoding detection -
|
||||
* it's fast but could apply multiple encodings on the same string.
|
||||
*
|
||||
* For other script/languages support and better encoding detection use
|
||||
* AbstractFileStringIntStream streaming class, which wraps around
|
||||
* StringExtract extractor.
|
||||
*/
|
||||
private static class EnglishOnlyStream extends InputStream {
|
||||
|
||||
private static final Logger logger = Logger.getLogger(EnglishOnlyStream.class.getName());
|
||||
private static final String NLS = Character.toString((char) 10); //new line
|
||||
private static final int READ_BUF_SIZE = 256;
|
||||
private static final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
|
||||
|
||||
//args
|
||||
private final AbstractFile content;
|
||||
|
||||
//internal working data
|
||||
private long contentOffset = 0; //offset in fscontent read into curReadBuf
|
||||
private final byte[] curReadBuf = new byte[READ_BUF_SIZE];
|
||||
private int bytesInReadBuf = 0;
|
||||
private int readBufOffset = 0; //offset in read buf processed
|
||||
private StringBuilder curString = new StringBuilder();
|
||||
private int curStringLen = 0;
|
||||
private StringBuilder tempString = new StringBuilder();
|
||||
private int tempStringLen = 0;
|
||||
private boolean isEOF = false;
|
||||
private boolean stringAtTempBoundary = false; //if temp has part of string that didn't make it in previous read()
|
||||
private boolean stringAtBufBoundary = false; //if read buffer has string being processed, continue as string from prev read() in next read()
|
||||
private boolean inString = false; //if current temp has min chars required
|
||||
private final byte[] oneCharBuf = new byte[1];
|
||||
|
||||
/**
|
||||
* Construct a new string stream from FsContent. Do not attempt to fill the
* entire read buffer if that would break a string
|
||||
*
|
||||
* @param content to extract strings from
|
||||
* @param outputCharset target charset to encode into bytes and index
|
||||
* as, e.g. UTF-8
|
||||
*
|
||||
*/
|
||||
private EnglishOnlyStream(AbstractFile content) {
|
||||
this.content = content;
|
||||
}
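// Behavior note: read() below scans the file for runs of printable ASCII that are at
// least MIN_PRINTABLE_CHARS long, joins them with newline separators, and returns them
// encoded with Server.DEFAULT_INDEXED_TEXT_CHARSET; everything else is discarded.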
|
||||
|
||||
@Override
|
||||
public int read(byte[] b, int off, int len) throws IOException {
|
||||
if (b == null) {
|
||||
throw new NullPointerException();
|
||||
} else if (off < 0 || len < 0 || len > b.length - off) {
|
||||
throw new IndexOutOfBoundsException();
|
||||
} else if (len == 0) {
|
||||
return 0;
|
||||
}
|
||||
long fileSize = content.getSize();
|
||||
if (fileSize == 0) {
|
||||
return -1;
|
||||
}
|
||||
if (isEOF) {
|
||||
return -1;
|
||||
}
|
||||
if (stringAtTempBoundary) {
|
||||
//append entire temp string residual from previous read()
|
||||
//because qualified string was broken down into 2 parts
|
||||
appendResetTemp();
|
||||
stringAtTempBoundary = false;
|
||||
//there could be more to this string in fscontent/buffer
|
||||
}
|
||||
boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
|
||||
int newCurLen = curStringLen + tempStringLen;
|
||||
while (newCurLen < len) {
|
||||
//need to extract more strings
|
||||
if (readBufOffset > bytesInReadBuf - 1) {
|
||||
//no more bytes to process into strings, read them
|
||||
try {
|
||||
bytesInReadBuf = 0;
|
||||
bytesInReadBuf = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);
|
||||
} catch (TskException ex) {
|
||||
if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
|
||||
appendResetTemp();
|
||||
//have some extracted string, return that, and fail next time
|
||||
isEOF = true;
|
||||
int copied = copyToReturn(b, off, len);
|
||||
return copied;
|
||||
} else {
|
||||
return -1; //EOF
|
||||
}
|
||||
}
|
||||
if (bytesInReadBuf < 1) {
|
||||
if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
|
||||
appendResetTemp();
|
||||
//have some extracted string, return that, and fail next time
|
||||
isEOF = true;
|
||||
int copied = copyToReturn(b, off, len);
|
||||
return copied;
|
||||
} else {
|
||||
return -1; //EOF
|
||||
}
|
||||
}
|
||||
//increment content offset for next read
|
||||
contentOffset += bytesInReadBuf;
|
||||
//reset read buf position
|
||||
readBufOffset = 0;
|
||||
}
|
||||
//get char from cur read buf
|
||||
char c = (char) curReadBuf[readBufOffset++];
|
||||
if (c == 0 && singleConsecZero == false) {
|
||||
//preserve the current sequence if max consec. 1 zero char
|
||||
singleConsecZero = true;
|
||||
} else {
|
||||
singleConsecZero = false;
|
||||
}
|
||||
if (StringExtract.isPrintableAscii(c)) {
|
||||
tempString.append(c);
|
||||
++tempStringLen;
|
||||
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
|
||||
inString = true;
|
||||
}
|
||||
//boundary case when temp has still chars - handled after the loop
|
||||
} else if (!singleConsecZero) {
|
||||
//break the string, clear temp
|
||||
if (tempStringLen >= MIN_PRINTABLE_CHARS || stringAtBufBoundary) {
|
||||
//append entire temp string with new line
|
||||
tempString.append(NLS);
|
||||
++tempStringLen;
|
||||
curString.append(tempString);
|
||||
curStringLen += tempStringLen;
|
||||
stringAtBufBoundary = false;
|
||||
}
|
||||
//reset temp
|
||||
tempString = new StringBuilder();
|
||||
tempStringLen = 0;
|
||||
}
|
||||
newCurLen = curStringLen + tempStringLen;
|
||||
}
|
||||
//check if still in string state, so that next chars in read buf bypass min chars check
|
||||
//and qualify as string even if less < min chars required
|
||||
if (inString) {
|
||||
inString = false; //reset
|
||||
stringAtBufBoundary = true; //will bypass the check
|
||||
}
|
||||
//check if temp still has chars to qualify as a string
|
||||
//we might need to break up temp into 2 parts for next read() call
|
||||
//consume as many as possible to fill entire user buffer
|
||||
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
|
||||
if (newCurLen > len) {
|
||||
int appendChars = len - curStringLen;
|
||||
//save part for next user read(), need to break up temp string
|
||||
//do not append new line
|
||||
String toAppend = tempString.substring(0, appendChars);
|
||||
String newTemp = tempString.substring(appendChars);
|
||||
curString.append(toAppend);
|
||||
curStringLen += appendChars;
|
||||
tempString = new StringBuilder(newTemp);
|
||||
tempStringLen = newTemp.length();
|
||||
stringAtTempBoundary = true;
|
||||
} else {
|
||||
//append entire temp
|
||||
curString.append(tempString);
|
||||
curStringLen += tempStringLen;
|
||||
//reset temp
|
||||
tempString = new StringBuilder();
|
||||
tempStringLen = 0;
|
||||
}
|
||||
} else {
|
||||
//if temp has a few chars, not qualified as string for now,
|
||||
//will be processed during next read() call
|
||||
}
|
||||
//copy current strings to user
|
||||
final int copied = copyToReturn(b, off, len);
|
||||
//there may be still chars in read buffer or tempString, for next read()
|
||||
return copied;
|
||||
}
|
||||
|
||||
//append temp buffer to cur string buffer and reset temp, if enough chars
|
||||
//does not append new line
|
||||
private void appendResetTemp() {
|
||||
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
|
||||
curString.append(tempString);
|
||||
curStringLen += tempStringLen;
|
||||
tempString = new StringBuilder();
|
||||
tempStringLen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
//copy currently extracted string to user buffer
|
||||
//and reset for next read() call
|
||||
private int copyToReturn(byte[] b, int off, long len) {
|
||||
final String curStringS = curString.toString();
|
||||
//logger.log(Level.INFO, curStringS);
|
||||
byte[] stringBytes = curStringS.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
|
||||
System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
|
||||
//logger.log(Level.INFO, curStringS);
|
||||
//copied all string, reset
|
||||
curString = new StringBuilder();
|
||||
int ret = curStringLen;
|
||||
curStringLen = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
final int read = read(oneCharBuf, 0, 1);
|
||||
if (read == 1) {
|
||||
return oneCharBuf[0];
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int available() throws IOException {
|
||||
//we don't know how many bytes in curReadBuf may end up as strings
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long skip(long n) throws IOException {
|
||||
//use default implementation that reads into skip buffer
|
||||
//but it could be more efficient
|
||||
return super.skip(n);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrapper over StringExtract to provide a streaming API. Given an AbstractFile
|
||||
* object, extract international strings from the file and read output as a
|
||||
* stream of UTF-8 strings as encoded bytes.
|
||||
*
|
||||
*/
|
||||
private static class InternationalStream extends InputStream {
|
||||
|
||||
private static final Logger logger = Logger.getLogger(InternationalStream.class.getName());
|
||||
private static final int FILE_BUF_SIZE = 1024 * 1024;
|
||||
private final AbstractFile content;
|
||||
private final byte[] oneCharBuf = new byte[1];
|
||||
private final StringExtract stringExtractor;
|
||||
/** true if there is nothing to do because neither extractUTF8 nor
|
||||
* extractUTF16 was true in constructor */
|
||||
private final boolean nothingToDo;
|
||||
private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
|
||||
private long fileReadOffset = 0L;
|
||||
private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user
|
||||
private int convertBuffOffset = 0; //offset to start returning data to user on next read()
|
||||
private int bytesInConvertBuff = 0; //amount of data currently in the buffer
|
||||
private boolean fileEOF = false; //if file has more bytes to read
|
||||
private StringExtract.StringExtractResult lastExtractResult;
|
||||
|
||||
/**
|
||||
* Constructs new stream object that does conversion from file, to
|
||||
* extracted strings, then to byte stream, for specified script,
|
||||
* auto-detected encoding (UTF8, UTF16LE, UTF16BE), and specified output
|
||||
* byte stream encoding
|
||||
*
|
||||
* @param content input content to process and turn into a stream
|
||||
* to convert into strings
|
||||
* @param scripts a list of scripts to consider
|
||||
* @param extractUTF8 whether to extract utf8 encoding
|
||||
* @param extractUTF16 whether to extract utf16 encoding
|
||||
*/
|
||||
private InternationalStream(AbstractFile content, List<SCRIPT> scripts, boolean extractUTF8, boolean extractUTF16) {
|
||||
this.content = content;
|
||||
this.stringExtractor = new StringExtract();
|
||||
this.stringExtractor.setEnabledScripts(scripts);
|
||||
this.nothingToDo = extractUTF8 == false && extractUTF16 == false;
|
||||
this.stringExtractor.setEnableUTF8(extractUTF8);
|
||||
this.stringExtractor.setEnableUTF16(extractUTF16);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
if (nothingToDo) {
|
||||
return -1;
|
||||
}
|
||||
final int read = read(oneCharBuf, 0, 1);
|
||||
if (read == 1) {
|
||||
return oneCharBuf[0];
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(byte[] b, int off, int len) throws IOException {
|
||||
if (b == null) {
|
||||
throw new NullPointerException();
|
||||
} else if (off < 0 || len < 0 || len > b.length - off) {
|
||||
throw new IndexOutOfBoundsException();
|
||||
} else if (len == 0) {
|
||||
return 0;
|
||||
}
|
||||
if (nothingToDo) {
|
||||
return -1;
|
||||
}
|
||||
long fileSize = content.getSize();
|
||||
if (fileSize == 0) {
|
||||
return -1;
|
||||
}
|
||||
//read and convert until user buffer full
|
||||
//we have data if file can be read or when byteBuff has converted strings to return
|
||||
int bytesToUser = 0; //returned to user so far
|
||||
int offsetUser = off;
|
||||
while (bytesToUser < len && offsetUser < len) {
|
||||
//check if we have enough converted strings
|
||||
int convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
|
||||
if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
|
||||
try {
|
||||
//convert more strings, store in buffer
|
||||
long toRead = 0;
|
||||
|
||||
//fill up entire fileReadBuff fresh
|
||||
toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
|
||||
//}
|
||||
int read = content.read(fileReadBuff, fileReadOffset, toRead);
|
||||
if (read == -1 || read == 0) {
|
||||
fileEOF = true;
|
||||
} else {
|
||||
fileReadOffset += read;
|
||||
if (fileReadOffset >= fileSize) {
|
||||
fileEOF = true;
|
||||
}
|
||||
//put converted string in convertBuff
|
||||
convert(read);
|
||||
convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
|
||||
}
|
||||
} catch (TskCoreException ex) {
|
||||
//Exceptions.printStackTrace(ex);
|
||||
fileEOF = true;
|
||||
}
|
||||
}
|
||||
//nothing more to read, and no more bytes in convertBuff
|
||||
if (convertBuff == null || convertBuffRemain == 0) {
|
||||
if (fileEOF) {
|
||||
return bytesToUser > 0 ? bytesToUser : -1;
|
||||
} else {
|
||||
//no strings extracted, try another read
|
||||
continue;
|
||||
}
|
||||
}
|
||||
//return part or all of convert buff to user
|
||||
final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
|
||||
System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);
|
||||
|
||||
convertBuffOffset += toCopy;
|
||||
offsetUser += toCopy;
|
||||
bytesToUser += toCopy;
|
||||
}
|
||||
//if more string data in convertBuff, will be consumed on next read()
|
||||
return bytesToUser;
|
||||
}
|
||||
|
||||
/**
|
||||
* convert bytes in file buffer to string, and encode string in
|
||||
* convertBuffer
|
||||
*
|
||||
* @param numBytes num bytes in the fileReadBuff
|
||||
*/
|
||||
private void convert(int numBytes) {
|
||||
lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
|
||||
convertBuff = lastExtractResult.getText().getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
|
||||
//reset tracking vars
|
||||
if (lastExtractResult.getNumBytes() == 0) {
|
||||
bytesInConvertBuff = 0;
|
||||
} else {
|
||||
bytesInConvertBuff = convertBuff.length;
|
||||
}
|
||||
convertBuffOffset = 0;
|
||||
}
|
||||
}
|
||||
}
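InternationalStream above exposes extracted strings as an InputStream whose read() returns -1 once no more qualifying text can be produced. A hedged usage sketch (hypothetical helper, standard java.io only; not part of this commit) of how such a stream can be drained into a single UTF-8 string:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

class StreamDrainExample {

    // Drain a text-extraction InputStream into one UTF-8 string.
    static String drainToString(InputStream in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buf = new byte[8192];
        int read;
        // read() contract used by the extraction streams above: -1 signals end of extracted text
        while ((read = in.read(buf, 0, buf.length)) != -1) {
            out.write(buf, 0, read);
        }
        return new String(out.toByteArray(), StandardCharsets.UTF_8);
    }
}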

@@ -1,7 +1,7 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012 Basis Technology Corp.
 * Copyright 2011-16 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,140 +18,76 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.datamodel.AbstractFile;
import java.io.InputStream;
import java.io.Reader;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;

/**
 * Common methods for utilities that extract text and content and divide into
 * chunks
 * Extracts text out of a SleuthkitVisitableItem, and exposes it as a Reader.
 * This Reader is given to the Ingester to chunk and index in Solr.
 *
 * @param <TextSource> The subtype of SleuthkitVisitableItem an implementation
 * is able to process.
 */
interface TextExtractor {
abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {

static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());

/**
 * Common options that can be used by some extractors
 * Is this extractor configured such that no extraction will/should be done?
 *
 * @return True if this extractor will/should not perform any extraction.
 */
enum ExtractOptions {

EXTRACT_UTF16, ///< extract UTF16 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
EXTRACT_UTF8, ///< extract UTF8 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
};

//generally text extractors should ignore archives
//and let unpacking modules take care of them
static final List<String> ARCHIVE_MIME_TYPES
= Arrays.asList(
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
"application/x-7z-compressed", //NON-NLS
"application/x-ace-compressed", //NON-NLS
"application/x-alz-compressed", //NON-NLS
"application/x-arj", //NON-NLS
"application/vnd.ms-cab-compressed", //NON-NLS
"application/x-cfs-compressed", //NON-NLS
"application/x-dgc-compressed", //NON-NLS
"application/x-apple-diskimage", //NON-NLS
"application/x-gca-compressed", //NON-NLS
"application/x-dar", //NON-NLS
"application/x-lzx", //NON-NLS
"application/x-lzh", //NON-NLS
"application/x-rar-compressed", //NON-NLS
"application/x-stuffit", //NON-NLS
"application/x-stuffitx", //NON-NLS
"application/x-gtar", //NON-NLS
"application/x-archive", //NON-NLS
"application/x-executable", //NON-NLS
"application/x-gzip", //NON-NLS
"application/zip", //NON-NLS
"application/x-zoo", //NON-NLS
"application/x-cpio", //NON-NLS
"application/x-shar", //NON-NLS
"application/x-tar", //NON-NLS
"application/x-bzip", //NON-NLS
"application/x-bzip2", //NON-NLS
"application/x-lzip", //NON-NLS
"application/x-lzma", //NON-NLS
"application/x-lzop", //NON-NLS
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
abstract boolean isDisabled();

/**
 * Get number of chunks resulted from extracting this AbstractFile
 * Log the given message and exception as a warning.
 *
 * @return the number of chunks produced
 * @param msg
 * @param ex
 */
int getNumChunks();
void logWarning(String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS
}

/**
 * Get the source file associated with this extraction
 * Get an input stream over the content of the given source.
 *
 * @return the source AbstractFile
 */
AbstractFile getSourceFile();

/**
 * Index the Abstract File
 *
 * @param sourceFile file to index
 *
 * @return true if indexed successfully, false otherwise
 *
 * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
 */
boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException;

/**
 * Sets the scripts to use for the extraction
 *
 * @param extractScripts scripts to use
 *
 * @return true if extractor supports script - specific extraction, false
 * otherwise
 */
boolean setScripts(List<SCRIPT> extractScript);

/**
 * Get the currently used scripts for extraction
 *
 * @return scripts currently used or null if not supported
 */
List<SCRIPT> getScripts();

/**
 * Get current options
 *
 * @return currently used, extractor specific options, or null of not
 * supported
 */
Map<String, String> getOptions();

/**
 * Set extractor specific options
 *
 * @param options options to use
 */
void setOptions(Map<String, String> options);

/**
 * Determines if the extractor works only for specified types is
 * supportedTypes() or whether is a generic content extractor (such as
 * string extractor)
 * @param source
 *
 * @return
 */
boolean isContentTypeSpecific();
abstract InputStream getInputStream(TextSource source);

/**
 * Determines if the file content is supported by the extractor if
 * isContentTypeSpecific() returns true.
 * Get a reader over the text extracted from the given source.
 *
 * @param file to test if its content should be supported
 * @param detectedFormat mime-type with detected format (such as text/plain)
 * or null if not detected
 * @param stream
 * @param source
 *
 * @return true if the file content is supported, false otherwise
 * @return
 *
 * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
 */
boolean isSupported(AbstractFile file, String detectedFormat);
abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;

/**
 * Get the 'object' id of the given source.
 *
 * @param source
 *
 * @return
 */
abstract long getID(TextSource source);

/**
 * Get a human readable name for the given source.
 *
 * @param source
 *
 * @return
 */
abstract String getName(TextSource source);
}
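The refactored TextExtractor turns a source into an InputStream and then into a Reader that the Ingester can chunk and index. The following self-contained sketch (simplified, hypothetical types, not the actual Autopsy classes) shows a minimal concrete implementation of that source-to-stream-to-reader pattern:

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

// Simplified analog of the abstract extractor: source -> InputStream -> Reader.
abstract class SimpleExtractor<S> {
    abstract boolean isDisabled();
    abstract InputStream getInputStream(S source);
    abstract Reader getReader(InputStream stream, S source);
}

// Trivial implementation whose "source" is just a String.
final class StringSourceExtractor extends SimpleExtractor<String> {

    @Override
    boolean isDisabled() {
        return false;
    }

    @Override
    InputStream getInputStream(String source) {
        return new ByteArrayInputStream(source.getBytes(StandardCharsets.UTF_8));
    }

    @Override
    Reader getReader(InputStream stream, String source) {
        return new InputStreamReader(stream, StandardCharsets.UTF_8);
    }
}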

@@ -1,7 +1,7 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012-2013 Basis Technology Corp.
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,244 +18,85 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.MissingResourceException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.sleuthkit.autopsy.coreutils.TextUtil;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;

/**
 * Extractor of text from TIKA supported AbstractFile content. Extracted text is
 * divided into chunks and indexed with Solr. Protects against Tika parser hangs
 * (for unexpected/corrupt content) using a timeout mechanism. If Tika
 * extraction succeeds, chunks are indexed with Solr.
 *
 * This Tika extraction/chunking utility is useful for large files of Tika
 * parsers-supported content type.
 *
 * Extracts text from Tika supported AbstractFile content. Protects against Tika
 * parser hangs (for unexpected/corrupt content) using a timeout mechanism.
 */
class TikaTextExtractor implements TextExtractor {
class TikaTextExtractor extends FileTextExtractor {

private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
private static Ingester ingester;
private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
private static final int MAX_EXTR_TEXT_CHARS = 16 * 1024;
private static final int SINGLE_READ_CHARS = 1024;
private static final int EXTRA_CHARS = 128; //for whitespace
private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
private AbstractFile sourceFile; //currently processed file
private int numChunks = 0;
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();

TikaTextExtractor() {
ingester = Ingester.getDefault();
private static final List<String> TIKA_SUPPORTED_TYPES
= new Tika().getParser().getSupportedTypes(new ParseContext())
.parallelStream()
.map(mt -> mt.getType() + "/" + mt.getSubtype())
.collect(Collectors.toList());

Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
for (MediaType mt : mediaTypes) {
TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype());
}
//logger.log(Level.INFO, "Tika supported media types: {0}", TIKA_SUPPORTED_TYPES); //NON-NLS
@Override
void logWarning(final String msg, Exception ex) {
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
super.logWarning(msg, ex);
}

@Override
public boolean setScripts(List<StringExtract.StringExtractUnicodeTable.SCRIPT> extractScripts) {
return false;
}

@Override
public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
return null;
}

@Override
public Map<String, String> getOptions() {
return null;
}

@Override
public void setOptions(Map<String, String> options) {
}

@Override
public int getNumChunks() {
return numChunks;
}

@Override
public AbstractFile getSourceFile() {
return sourceFile;
}

@Override
public boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
this.sourceFile = sourceFile;
numChunks = 0; //unknown until indexing is done

boolean success = false;
Reader reader = null;
final InputStream stream = new ReadContentInputStream(sourceFile);
Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
Metadata metadata = new Metadata();
//Parse the file in a task, a convenient way to have a timeout...
final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
try {
Metadata meta = new Metadata();

//Parse the file in a task
Tika tika = new Tika(); //new tika instance for every file, to workaround tika memory issues
ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
final Future<?> future = tikaParseExecutor.submit(parseTask);
try {
future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
} catch (TimeoutException te) {
final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
sourceFile.getId(), sourceFile.getName());
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
logger.log(Level.WARNING, msg);
throw new IngesterException(msg);
} catch (Exception ex) {
final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
sourceFile.getId(), sourceFile.getName());
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
logger.log(Level.WARNING, msg);
throw new IngesterException(msg);
}

// get the reader with the results
reader = parseTask.getReader();
if (reader == null) {
//likely due to exception in parse()
logger.log(Level.WARNING, "No reader available from Tika parse"); //NON-NLS
return false;
}

// break the results into chunks and index
success = true;
long readSize;
long totalRead = 0;
boolean eof = false;
//we read max 1024 chars at time, this seems to max what this Reader would return
while (!eof) {
if (context.fileIngestIsCancelled()) {
ingester.ingest(this);
return true;
}
readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
if (readSize == -1) {
eof = true;
} else {
totalRead += readSize;
}
//consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
&& (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
totalRead += readSize;
}
if (readSize == -1) {
//this is the last chunk
eof = true;
} else {
//try to read char-by-char until whitespace to not break words
while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
&& !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
&& (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
totalRead += readSize;
}
if (readSize == -1) {
//this is the last chunk
eof = true;
}
}

// Sanitize by replacing non-UTF-8 characters with caret '^'
for (int i = 0; i < totalRead; ++i) {
if (!TextUtil.isValidSolrUTF8(textChunkBuf[i])) {
textChunkBuf[i] = '^';
}
}

StringBuilder sb = new StringBuilder((int) totalRead + 1000);
sb.append(textChunkBuf, 0, (int) totalRead);

//reset for next chunk
totalRead = 0;

//append meta data if last chunk
if (eof) {
//sort meta data keys
List<String> sortedKeyList = Arrays.asList(meta.names());
Collections.sort(sortedKeyList);
sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
for (String key : sortedKeyList) {
String value = meta.get(key);
sb.append(key).append(": ").append(value).append("\n");
}
}

// Encode from UTF-8 charset to bytes
byte[] encodedBytes = sb.toString().getBytes(OUTPUT_CHARSET);
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
try {
chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
++this.numChunks;
} catch (Ingester.IngesterException ingEx) {
success = false;
logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" //NON-NLS
+ sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
throw ingEx; //need to rethrow/return to signal error and move on
}
}
} catch (IOException ex) {
final String msg = "Exception: Unable to read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
logger.log(Level.WARNING, msg);
success = false;
final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
CharSource metaDataCharSource = getMetaDataCharSource(metadata);
//concatenate parsed content and meta data into a single reader.
return CharSource.concat(new ReaderCharSource(tikaReader), metaDataCharSource).openStream();
} catch (TimeoutException te) {
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
logWarning(msg, te);
throw new IngesterException(msg);
} catch (Exception ex) {
final String msg = "Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
logger.log(Level.WARNING, msg);
success = false;
} finally {
try {
stream.close();
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to close Tika content stream from " + sourceFile.getId(), ex); //NON-NLS
}
try {
if (reader != null) {
reader.close();
}
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
}
KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex.getCause()); //NON-NLS
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName());
logWarning(msg, ex);
throw new IngesterException(msg, ex);
}
}
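Both the old index() path and the new getReader() above bound the Tika parse with a single-thread executor and Future.get(timeout) so a hung parser cannot stall ingest indefinitely. A self-contained sketch of that hang-protection pattern (hypothetical helper, not part of this commit):

import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

class TimeoutParseExample {

    private static final ExecutorService executor = Executors.newSingleThreadExecutor();

    // Run potentially slow work on a worker thread and bound it with a timeout.
    static <T> T runWithTimeout(Callable<T> work, long seconds) throws Exception {
        Future<T> future = executor.submit(work);
        try {
            return future.get(seconds, TimeUnit.SECONDS);
        } catch (TimeoutException te) {
            future.cancel(true); // interrupt the stuck task before giving up
            throw te;
        }
    }
}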

//after all chunks, ingest the parent file without content itself, and store numChunks
ingester.ingest(this);

return success;
/**
 * Gets a CharSource that wraps a formatted representation of the given
 * Metadata.
 *
 * @param metadata The Metadata to wrap as a CharSource
 *
 * @return A CharSource for the given MetaData
 */
static private CharSource getMetaDataCharSource(Metadata metadata) {
return CharSource.wrap(
new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
.append(Stream.of(metadata.names()).sorted()
.map(key -> key + ": " + metadata.get(key))
.collect(Collectors.joining("\n"))
));
}
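getMetaDataCharSource() lets the metadata block be appended lazily after the parsed content through Guava's CharSource.concat, as the getReader() code above does. A hedged sketch of that composition (hypothetical helper names, standalone):

import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.Reader;

class CharSourceConcatExample {

    // Produce a Reader that yields the parsed content followed by a metadata block.
    static Reader contentThenMetadata(Reader parsedContent, String metadataBlock) throws IOException {
        CharSource metadata = CharSource.wrap(metadataBlock);
        // Readers are single-use, so wrap the existing reader in a CharSource
        // that simply hands it back from openStream().
        CharSource content = new CharSource() {
            @Override
            public Reader openStream() throws IOException {
                return parsedContent;
            }
        };
        return CharSource.concat(content, metadata).openStream();
    }
}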

@Override
@@ -265,67 +106,64 @@ class TikaTextExtractor implements TextExtractor {

@Override
public boolean isSupported(AbstractFile file, String detectedFormat) {
if (detectedFormat == null) {
return false;
} else if (detectedFormat.equals("application/octet-stream") //NON-NLS
|| detectedFormat.equals("application/x-msdownload")) { //NON-NLS
//any binary unstructured blobs (string extraction will be used)
return false;
} else if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
return false;
} //skip video other than flv (tika supports flv only)
else if (detectedFormat.contains("video/") //NON-NLS
&& !detectedFormat.equals("video/x-flv")) { //NON-NLS
return false;
} else if (detectedFormat.contains("application/x-font-ttf")) { //NON-NLS
// Tika currently has a bug in the ttf parser in fontbox.
// It will throw an out of memory exception
if (detectedFormat == null
|| FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
|| FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
|| (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|| detectedFormat.equals("application/x-font-ttf")) { // Tika currently has a bug in the ttf parser in fontbox; It will throw an out of memory exception//NON-NLS

return false;
}

//TODO might need to add more mime-types to ignore
//then accept all formats supported by Tika
return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
}

@Override
InputStream getInputStream(AbstractFile sourceFile1) {
return new ReadContentInputStream(sourceFile1);
}

@Override
boolean isDisabled() {
return false;
}

/**
 * Return timeout that should be used to index the content.
 *
 * @param size size of the content
 *
 * @return time in seconds to use a timeout
 */
private static int getTimeout(long size) {
if (size < 1024 * 1024L) //1MB
{
return 60;
} else if (size < 10 * 1024 * 1024L) //10MB
{
return 1200;
} else if (size < 100 * 1024 * 1024L) //100MB
{
return 3600;
} else {
return 3 * 3600;
}

}

/**
 * Runnable task that calls tika to parse the content using the input
 * stream. Provides reader for results.
 * An implementation of CharSource that just wraps an existing reader and
 * returns it in openStream().
 */
private static class ParseRequestTask implements Runnable {
private static class ReaderCharSource extends CharSource {

//in
private Tika tika;
private InputStream stream;
private Metadata meta;
private AbstractFile sourceFile;
//out
private Reader reader;
private final Reader reader;

ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
this.tika = tika;
this.stream = stream;
this.meta = meta;
this.sourceFile = sourceFile;
public ReaderCharSource(Reader reader) {
this.reader = reader;
}

@Override
public void run() {
try {
reader = tika.parse(stream, meta);
} catch (IOException ex) {
KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
tika = null;
reader = null;
} catch (Exception ex) {
KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
tika = null;
reader = null;
}
}

public Reader getReader() {
public Reader openStream() throws IOException {
return reader;
}
}

@@ -35,7 +35,6 @@ import java.util.logging.Level;
import java.util.logging.Logger;
import javax.imageio.ImageIO;
import javax.swing.JDialog;
import javax.swing.JLabel;
import javax.swing.JTextField;
import junit.framework.Test;
import junit.framework.TestCase;
@@ -50,10 +49,10 @@ import org.netbeans.jemmy.operators.JComboBoxOperator;
import org.netbeans.jemmy.operators.JDialogOperator;
import org.netbeans.jemmy.operators.JFileChooserOperator;
import org.netbeans.jemmy.operators.JLabelOperator;
import org.netbeans.jemmy.operators.JListOperator;
import org.netbeans.jemmy.operators.JTabbedPaneOperator;
import org.netbeans.jemmy.operators.JTableOperator;
import org.netbeans.jemmy.operators.JTextFieldOperator;
import org.netbeans.jemmy.operators.JListOperator;
import org.netbeans.junit.NbModuleSuite;
import org.sleuthkit.autopsy.ingest.IngestManager;

@@ -186,6 +185,8 @@ public class RegressionTest extends TestCase {
String img_path = getEscapedPath(System.getProperty("img_path"));
String imageDir = img_path;
((JTextField) jtfo0.getSource()).setText(imageDir);
JComboBoxOperator comboBoxOperator = new JComboBoxOperator(wo, 1);
comboBoxOperator.setSelectedItem("(GMT-5:00) America/New_York");
wo.btNext().clickMouse();
}