Mirror of https://github.com/overcuriousity/autopsy-flatpak.git, synced 2025-07-11 23:46:15 +00:00

Complete overhaul of how extractors are currently implemented, hopefully converging to a more sensible solution

This commit is contained in:
parent ce548fb978
commit ece50a3a00
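The diff below retires the extractor-object contract (isDisabled(), logWarning(), getID(), getName()) in favor of handing callers a java.io.Reader directly. For orientation, here is a minimal sketch of the new call pattern; the class and method names come from this commit, while the wrapper class and its helper method are illustrative:

```java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import org.sleuthkit.autopsy.textextractors.ExtractionContext;
import org.sleuthkit.autopsy.textextractors.TextReader;
import org.sleuthkit.datamodel.Content;

class ReaderApiSketch {

    // Old shape: obtain a TextExtractor, then ask it for a Reader.
    // New shape: ask TextReader for a Reader directly.
    static String extractAllText(Content content)
            throws IOException, TextReader.NoReaderFoundException {
        Reader reader = TextReader.getContentSpecificReader(content, new ExtractionContext());
        StringBuilder text = new StringBuilder();
        try (BufferedReader buffered = new BufferedReader(reader)) {
            int c;
            while ((c = buffered.read()) != -1) {
                text.append((char) c);
            }
        }
        return text.toString();
    }
}
```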
@@ -22,7 +22,6 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.StandardCharsets;
-import java.util.logging.Level;
 import org.apache.commons.io.IOUtils;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.datamodel.ContentUtils;
@@ -35,39 +34,27 @@ import org.sleuthkit.datamodel.TskCoreException;
  * Extracts text from artifacts by concatenating the values of all of the
  * artifact's attributes.
  */
-class ArtifactTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+class ArtifactTextExtractor<T extends BlackboardArtifact> implements TextExtractor<T> {
 
     static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
-
-    @Override
-    public boolean isDisabled() {
-        return false;
-    }
-
-    @Override
-    public void logWarning(final String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex); //NON-NLS }
-    }
 
-    private InputStream getInputStream(Content artifact) throws TextExtractorException {
-        BlackboardArtifact art = (BlackboardArtifact)artifact;
-
+    private InputStream getInputStream(BlackboardArtifact artifact) throws InitReaderException {
         // Concatenate the string values of all attributes into a single
         // "content" string to be indexed.
         StringBuilder artifactContents = new StringBuilder();
 
         Content dataSource = null;
         try {
-            dataSource = art.getDataSource();
+            dataSource = artifact.getDataSource();
         } catch (TskCoreException tskCoreException) {
-            throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
+            throw new InitReaderException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
         }
         if (dataSource == null) {
-            throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString());
+            throw new InitReaderException("Datasource was null for artifact: " + artifact.toString());
        }
 
        try {
-            for (BlackboardAttribute attribute : art.getAttributes()) {
+            for (BlackboardAttribute attribute : artifact.getAttributes()) {
                artifactContents.append(attribute.getAttributeType().getDisplayName());
                artifactContents.append(" : ");
                // We have also discussed modifying BlackboardAttribute.getDisplayString()
@@ -85,40 +72,31 @@ class ArtifactTextExtractor<T extends Content> extends ContentTextExtractor<T> {
                artifactContents.append(System.lineSeparator());
            }
        } catch (TskCoreException tskCoreException) {
-            throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
+            throw new InitReaderException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
        }
 
        return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
    }
 
    @Override
-    public Reader getReader(Content source) throws TextExtractorException {
+    public Reader getReader(BlackboardArtifact source) throws InitReaderException {
        return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
    }
 
-    @Override
-    public long getID(Content source) {
-        BlackboardArtifact art = (BlackboardArtifact)source;
-        return art.getArtifactID();
-    }
-
-    @Override
-    public String getName(Content source) {
-        BlackboardArtifact art = (BlackboardArtifact)source;
-        return art.getDisplayName() + "_" + art.getArtifactID();
-    }
-
+    /**
+     * Configures this extractors to the settings stored in relevant config instances.
+     *
+     * This operation is a no-op since currently there are no configurable settings
+     * of the extraction process.
+     *
+     * @param context Instance containing file config settings
+     */
    @Override
    public void setExtractionSettings(ExtractionContext context) {
    }
 
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
    @Override
-    public boolean isSupported(Content file, String detectedFormat) {
+    public boolean isSupported(BlackboardArtifact file, String detectedFormat) {
        return true;
    }
 }
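The concatenation that getInputStream() performs above, restated as a self-contained sketch: a plain Map stands in for the artifact's BlackboardAttribute list (the map and its sample entries are hypothetical; only the append pattern mirrors the extractor):

```java
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.commons.io.IOUtils;

class AttributeConcatenationSketch {
    public static void main(String[] args) throws Exception {
        // Stand-in for BlackboardAttribute display-name/value pairs.
        Map<String, String> attributes = new LinkedHashMap<>();
        attributes.put("URL", "https://example.com");
        attributes.put("Date Accessed", "2018-10-01 12:00:00");

        // Same shape as ArtifactTextExtractor.getInputStream(): one
        // "name : value" line per attribute, joined into a single string.
        StringBuilder artifactContents = new StringBuilder();
        for (Map.Entry<String, String> attribute : attributes.entrySet()) {
            artifactContents.append(attribute.getKey());
            artifactContents.append(" : ");
            artifactContents.append(attribute.getValue());
            artifactContents.append(System.lineSeparator());
        }

        InputStream stream = IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
        try (Reader reader = new InputStreamReader(stream, StandardCharsets.UTF_8)) {
            System.out.println(IOUtils.toString(reader));
        }
    }
}
```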
@@ -1,130 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2011-2018 Basis Technology Corp.
- * Contact: carrier <at> sleuthkit <dot> org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.sleuthkit.autopsy.textextractors;
-
-import com.google.common.collect.ImmutableList;
-import java.io.Reader;
-import java.util.List;
-import org.sleuthkit.datamodel.Content;
-
-/**
- * Common methods for utilities that extract text and content and divide into
- * chunks
- * @param <T>
- */
-public abstract class ContentTextExtractor<T extends Content> implements TextExtractor<T> {
-
-    //Mimetype groups to aassist extractor implementations in ignoring binary and
-    //archive files.
-    public static final List<String> BINARY_MIME_TYPES
-            = ImmutableList.of(
-                    //ignore binary blob data, for which string extraction will be used
-                    "application/octet-stream", //NON-NLS
-                    "application/x-msdownload"); //NON-NLS
-
-    /** generally text extractors should ignore archives and let unpacking
-     * modules take care of them */
-    public static final List<String> ARCHIVE_MIME_TYPES
-            = ImmutableList.of(
-                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
-                    "application/x-7z-compressed", //NON-NLS
-                    "application/x-ace-compressed", //NON-NLS
-                    "application/x-alz-compressed", //NON-NLS
-                    "application/x-arj", //NON-NLS
-                    "application/vnd.ms-cab-compressed", //NON-NLS
-                    "application/x-cfs-compressed", //NON-NLS
-                    "application/x-dgc-compressed", //NON-NLS
-                    "application/x-apple-diskimage", //NON-NLS
-                    "application/x-gca-compressed", //NON-NLS
-                    "application/x-dar", //NON-NLS
-                    "application/x-lzx", //NON-NLS
-                    "application/x-lzh", //NON-NLS
-                    "application/x-rar-compressed", //NON-NLS
-                    "application/x-stuffit", //NON-NLS
-                    "application/x-stuffitx", //NON-NLS
-                    "application/x-gtar", //NON-NLS
-                    "application/x-archive", //NON-NLS
-                    "application/x-executable", //NON-NLS
-                    "application/x-gzip", //NON-NLS
-                    "application/zip", //NON-NLS
-                    "application/x-zoo", //NON-NLS
-                    "application/x-cpio", //NON-NLS
-                    "application/x-shar", //NON-NLS
-                    "application/x-tar", //NON-NLS
-                    "application/x-bzip", //NON-NLS
-                    "application/x-bzip2", //NON-NLS
-                    "application/x-lzip", //NON-NLS
-                    "application/x-lzma", //NON-NLS
-                    "application/x-lzop", //NON-NLS
-                    "application/x-z", //NON-NLS
-                    "application/x-compress"); //NON-NLS
-
-    /**
-     * Determines if the extractor works only for specified types is
-     * supportedTypes() or whether is a generic content extractor (such as
-     * string extractor)
-     *
-     * @return
-     */
-    public abstract boolean isContentTypeSpecific();
-
-    /**
-     * Determines if the file content is supported by the extractor if
-     * isContentTypeSpecific() returns true.
-     *
-     * @param file to test if its content should be supported
-     * @param detectedFormat mime-type with detected format (such as text/plain)
-     * or null if not detected
-     *
-     * @return true if the file content is supported, false otherwise
-     */
-    public abstract boolean isSupported(T file, String detectedFormat);
-
-    /**
-     * Returns a reader that will iterate over the text of the source content.
-     *
-     * @param source Content source to read
-     * @return A reader that contains all source text
-     * @throws TextExtractorException Error encountered during extraction
-     */
-    @Override
-    public abstract Reader getReader(T source) throws TextExtractorException;
-
-    /**
-     * Get the object id of the content source.
-     *
-     * @param source source content
-     * @return object id associated with this source content
-     */
-    @Override
-    public long getID(T source) {
-        return source.getId();
-    }
-
-    /**
-     * Returns the human-readable name of the given content source.
-     *
-     * @param source source content
-     * @return name of source content
-     */
-    @Override
-    public String getName(T source) {
-        return source.getName();
-    }
-}
@@ -23,7 +23,6 @@ import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 import java.util.List;
-import java.util.logging.Level;
 import net.htmlparser.jericho.Attributes;
 import net.htmlparser.jericho.Config;
 import net.htmlparser.jericho.LoggerProvider;
@@ -33,13 +32,12 @@ import net.htmlparser.jericho.StartTag;
 import net.htmlparser.jericho.StartTagType;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
  * Extracts text from HTML content.
  */
-final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+final class HtmlTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
 
     static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
     private final int MAX_SIZE;
@@ -67,19 +65,6 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
         MAX_SIZE = 50_000_000;
     }
 
-    /**
-     * Determines if this extractor is responsible for extracting only a
-     * specific type of media.
-     *
-     * In this case, only HTML documents can be read successfully.
-     *
-     * @return true
-     */
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
     /**
      * Determines if this content type is supported by this extractor.
     *
@@ -89,7 +74,7 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
      * @return flag indicating support
      */
     @Override
-    public boolean isSupported(Content content, String detectedFormat) {
+    public boolean isSupported(AbstractFile content, String detectedFormat) {
         return detectedFormat != null
                 && WEB_MIME_TYPES.contains(detectedFormat)
                 && content.getSize() <= MAX_SIZE;
@@ -105,7 +90,7 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
      * @throws TextExtractorException
      */
     @Override
-    public Reader getReader(Content content) throws TextExtractorException {
+    public Reader getReader(AbstractFile content) throws InitReaderException {
         //TODO JIRA-4467, there is only harm in excluding HTML documents greater
         //than 50MB due to our troubled approach of extraction.
         ReadContentInputStream stream = new ReadContentInputStream(content);
@@ -201,25 +186,10 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
             // All done, now make it a reader
             return new StringReader(stringBuilder.toString());
         } catch (IOException ex) {
-            throw new TextExtractorException("Error extracting HTML from content.", ex);
+            throw new InitReaderException("Error extracting HTML from content.", ex);
         }
     }
 
-    /**
-     * Indicates if this extractor can run.
-     *
-     * @return Flag indicating if this extractor can run.
-     */
-    @Override
-    public boolean isDisabled() {
-        return false;
-    }
-
-    @Override
-    public void logWarning(final String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex); //NON-NLS }
-    }
-
     /**
      * Determines how the extraction process will proceed given the settings
      * stored in this context instance.
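HtmlTextExtractor is built on the Jericho parser imported above. As rough orientation to the library, and hedged as my reading of Jericho's API rather than this extractor's own StartTag traversal, its simplest text-extraction entry point looks like:

```java
import net.htmlparser.jericho.Source;

class JerichoSketch {
    public static void main(String[] args) {
        String html = "<html><body><p>Hello <b>world</b></p></body></html>";
        Source source = new Source(html);
        // Jericho's built-in text extractor; HtmlTextExtractor instead walks
        // StartTags itself so it can surface scripts, comments, and selected
        // attributes alongside the visible text.
        System.out.println(source.getTextExtractor().toString()); // Hello world
    }
}
```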
@@ -28,7 +28,6 @@ import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
 import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
 
 /**
  * Extracts text from SQLite database files.
@@ -39,48 +38,10 @@ import org.sleuthkit.datamodel.Content;
  * 2) Tables that contain spaces in their name are not extracted
  * 3) Table names are not included in its output text
  */
-final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+final class SqliteTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
 
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
     private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
-    private static boolean isDisabled;
-
-    static {
-        try {
-            Class.forName("org.sqlite.JDBC");
-            isDisabled = false;
-        } catch (ClassNotFoundException ex) {
-            logger.log(Level.SEVERE, "Sqlite JDBC class could not be found, "
-                    + "SqliteTextExtractor is automatically disabling.", ex); //NON-NLS
-            isDisabled = true;
-        }
-    }
-
-    /**
-     * This extractor only works for sqlite files, so it is indeed content type
-     * specific.
-     *
-     * @return true
-     */
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
-    /**
-     * Determines if this extractor is fit to run.
-     *
-     * @return Flag indicating if it should or shouldn't be run.
-     */
-    @Override
-    public boolean isDisabled() {
-        return isDisabled;
-    }
-
-    @Override
-    public void logWarning(String msg, Exception exception) {
-        logger.log(Level.WARNING, msg, exception); //NON-NLS
-    }
-
     /**
      * Supports only the sqlite mimetypes
@@ -91,7 +52,7 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
      * @return true if x-sqlite3
      */
     @Override
-    public boolean isSupported(Content file, String detectedFormat) {
+    public boolean isSupported(AbstractFile file, String detectedFormat) {
         return SQLITE_MIMETYPE.equals(detectedFormat);
     }
 
@@ -105,12 +66,8 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
      * @throws TextExtractorException
      */
     @Override
-    public Reader getReader(Content source) throws TextExtractorException {
-        if(source instanceof AbstractFile) {
-            return new SQLiteStreamReader((AbstractFile)source);
-        }
-        throw new TextExtractorException(String.format("Source content with name [%s] and id=[%d] was not of type"
-                + " AbstractFile.", source.getName(), source.getId()));
+    public Reader getReader(AbstractFile source) throws InitReaderException {
+        return new SQLiteStreamReader(source);
    }
 
    /**
@@ -25,7 +25,6 @@ import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
-import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
@@ -37,41 +36,12 @@ import org.sleuthkit.datamodel.TskException;
 /**
  * Extracts raw strings from content.
  */
-final class StringsTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+final class StringsTextExtractor {
 
     static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
     private boolean extractUTF8;
     private boolean extractUTF16;
     private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
-
-    /**
-     * Determines if this extractor may only read particular types of content.
-     *
-     * Since Strings may be run on any content type, it is not content specific.
-     *
-     * @return false
-     */
-    @Override
-    public boolean isContentTypeSpecific() {
-        return false;
-    }
-
-    /**
-     * Determines if this extractor can read the content type.
-     *
-     * Note: Strings can be run on any type of content, so all types will return
-     * true.
-     *
-     * @param file Content source to read
-     * @param detectedFormat Mimetype of source file.
-     *
-     * @return true
-     */
-    @Override
-    public boolean isSupported(Content file, String detectedFormat) {
-        return true;
-    }
 
     private final List<SCRIPT> extractScripts = new ArrayList<>();
 
     /**
@@ -99,33 +69,6 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
         this.extractScripts.addAll(extractScripts);
     }
 
-    /**
-     * Get the currently used scripts for extraction
-     *
-     * @return scripts currently used or null if not supported
-     */
-    public List<SCRIPT> getScripts() {
-        return new ArrayList<>(extractScripts);
-    }
-
-    @Override
-    public void logWarning(final String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex); //NON-NLS }
-    }
-
-    /**
-     * Determines if this extractor should be run or not.
-     *
-     * Atleast one of the extraction encodings in DefaultExtractionConfig must
-     * be set for this extractor to run.
-     *
-     * @return Flag indicating if this extractor should be run.
-     */
-    @Override
-    public boolean isDisabled() {
-        return extractUTF8 == false && extractUTF16 == false;
-    }
-
     /**
      * Returns a reader that will iterate over the text of the content source.
     *
@@ -136,8 +79,7 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
      * @throws
      * org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
      */
-    @Override
-    public InputStreamReader getReader(Content content) throws TextExtractorException {
+    public InputStreamReader getReader(Content content) {
         InputStream stringStream = getInputStream(content);
         return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
     }
@@ -160,7 +102,6 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
      *
      * @param context Instance containing config classes
      */
-    @Override
     public void setExtractionSettings(ExtractionContext context) {
         if (context != null && context.contains(DefaultExtractionConfig.class)) {
             DefaultExtractionConfig configInstance = context.get(DefaultExtractionConfig.class);
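StringsTextExtractor is now a standalone class rather than a TextExtractor implementation. A sketch of configuring and using it, assuming code living in the same org.sleuthkit.autopsy.textextractors package (the class is package-private) and a Content instance named someContent; DefaultExtractionConfig comes from the extractionconfigs subpackage used later in this commit:

```java
// In org.sleuthkit.autopsy.textextractors; someContent is any Content.
static Reader stringsReaderFor(Content someContent) {
    ExtractionContext context = new ExtractionContext();
    DefaultExtractionConfig config = new DefaultExtractionConfig();
    config.setExtractUTF8(true);   // at least one encoding must be enabled,
    config.setExtractUTF16(true);  // otherwise there is nothing to extract
    context.set(DefaultExtractionConfig.class, config);

    StringsTextExtractor stringsExtractor = new StringsTextExtractor();
    stringsExtractor.setExtractionSettings(context);
    return stringsExtractor.getReader(someContent);
}
```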
@@ -19,7 +19,6 @@
 package org.sleuthkit.autopsy.textextractors;
 
 import java.io.Reader;
-import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 
 /**
  * Extracts text out of a SleuthkitVisitableItem, and exposes it is a Reader.
@@ -28,23 +27,19 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
  * @param <T> The subtype of SleuthkitVisitableItem an implementation is able to
  * process.
  */
-public interface TextExtractor<T extends SleuthkitVisitableItem> {
+interface TextExtractor<T> {
 
-    /**
-     * Is this extractor configured such that no extraction will/should be done?
+    /**
+     * Determines if the file content is supported by the extractor if
+     * isContentTypeSpecific() returns true.
     *
-     * @return True if this extractor will/should not perform any extraction.
-     */
-    boolean isDisabled();
-
-    /**
-     * Log the given message and exception as a warning.
+     * @param file to test if its content should be supported
+     * @param detectedFormat mime-type with detected format (such as text/plain)
+     * or null if not detected
     *
-     * @param msg Log message
-     * @param ex Exception associated with the incoming message
+     * @return true if the file content is supported, false otherwise
     */
-    void logWarning(String msg, Exception ex);
-
+    public abstract boolean isSupported(T file, String detectedFormat);
 
    /**
     * Get a reader that will iterate over the text extracted from the given
     * source.
@@ -53,28 +48,8 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
     *
     * @return Reader instance that contains the text of the source
     *
-     * @throws TextExtractorException
     */
-    Reader getReader(T source) throws TextExtractorException;
-
-    /**
-     * Get the 'object' id of the given source.
-     *
-     * @param source Source content of type T
-     *
-     * @return Object id of the source content
-     */
-    long getID(T source);
-
-    /**
-     * Get a human readable name for the given source.
-     *
-     * @param source Source content of type T
-     *
-     * @return Name of the content source
-     */
-    String getName(T source);
-
+    Reader getReader(T source) throws InitReaderException;
 
    /**
     * Determines how the extraction process will proceed given the settings
@@ -86,17 +61,17 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
     */
    void setExtractionSettings(ExtractionContext context);
 
    /**
     * System exception for dealing with errors encountered during extraction.
     */
-    class TextExtractorException extends Exception {
-
-        public TextExtractorException(String message) {
-            super(message);
+    public class InitReaderException extends Exception {
+        public InitReaderException(String msg, Throwable ex) {
+            super(msg, ex);
        }
 
-        public TextExtractorException(String message, Throwable cause) {
-            super(message, cause);
+        public InitReaderException(Throwable ex) {
+            super(ex);
        }
+
+        public InitReaderException(String msg) {
+            super(msg);
+        }
    }
 }
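The slimmed-down interface now carries only isSupported(), getReader(), and setExtractionSettings(). A minimal hypothetical implementor for illustration, placed in the same package since the interface is now package-private:

```java
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;

// Hypothetical implementor; the class name and the UTF-8 assumption are illustrative.
class PlainTextExtractor<T extends AbstractFile> implements TextExtractor<T> {

    @Override
    public boolean isSupported(T file, String detectedFormat) {
        return "text/plain".equals(detectedFormat);
    }

    @Override
    public Reader getReader(T source) throws InitReaderException {
        // Wrap the file's content stream; a real extractor would detect the charset.
        return new InputStreamReader(new ReadContentInputStream(source), StandardCharsets.UTF_8);
    }

    @Override
    public void setExtractionSettings(ExtractionContext context) {
        // no configurable settings in this sketch
    }
}
```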
@@ -18,6 +18,7 @@
  */
 package org.sleuthkit.autopsy.textextractors;
 
+import java.io.Reader;
 import java.util.Arrays;
 import java.util.List;
 import org.sleuthkit.datamodel.AbstractFile;
@@ -31,8 +32,13 @@ import org.sleuthkit.datamodel.Report;
  * See ContentTextExtractor interface for the generic structure of such
  * extractors.
  */
-public class TextExtractorFactory {
+public class TextReader {
 
+    private final static List<TextExtractor<AbstractFile>> fileExtractors = Arrays.asList(
+            new HtmlTextExtractor<>(),
+            new SqliteTextExtractor<>(),
+            new TikaTextExtractor<>()
+    );
     /**
      * Auto detects the correct text extractor given the file.
     *
@@ -41,40 +47,42 @@ public class TextExtractorFactory {
      * will keep the extractors at default settings. Refer to the
      * extractionconfigs package for available file configurations.
     *
-     * @param <T> Type of source content
      * @param file Content source that will be read from
      * @param context Contains extraction configurations for certain file types
     *
     * @return A ContentTextExtractor instance that is properly configured and
     * can be read from the getReader() method.
     *
-     * @throws NoContentSpecificExtractorException In the event that the
+     * @throws NoReaderFoundException In the event that the
     *                                   inputted file and mimetype
     *                                   have no corresponding
     *                                   extractor
     */
-    public static <T extends Content> ContentTextExtractor<T> getContentSpecificExtractor(T file,
-            ExtractionContext context) throws NoContentSpecificExtractorException {
-        if (file instanceof AbstractFile) {
-            List<ContentTextExtractor<T>> fileExtractors = getAbstractFileExtractors();
-            String mimeType = ((AbstractFile) file).getMIMEType();
-            for (ContentTextExtractor<T> candidate : fileExtractors) {
-                candidate.setExtractionSettings(context);
-                if (candidate.isSupported(file, mimeType)) {
-                    return candidate;
+    public static Reader getContentSpecificReader(Content file,
+            ExtractionContext context) throws NoReaderFoundException {
+        try {
+            if (file instanceof AbstractFile) {
+                String mimeType = ((AbstractFile) file).getMIMEType();
+                for (TextExtractor<AbstractFile> candidate : fileExtractors) {
+                    candidate.setExtractionSettings(context);
+                    if (candidate.isSupported((AbstractFile)file, mimeType)) {
+                        return candidate.getReader((AbstractFile)file);
                }
            }
+            } else if (file instanceof BlackboardArtifact) {
+                TextExtractor<BlackboardArtifact> artifactExtractor = new ArtifactTextExtractor<>();
+                artifactExtractor.setExtractionSettings(context);
+                return artifactExtractor.getReader((BlackboardArtifact)file);
+            } else if (file instanceof Report) {
+                TextExtractor<Report> reportExtractor = new TikaTextExtractor<>();
+                reportExtractor.setExtractionSettings(context);
+                reportExtractor.getReader((Report)file);
+            }
-        } else if (file instanceof BlackboardArtifact) {
-            ContentTextExtractor<T> artifactExtractor = new ArtifactTextExtractor<>();
-            artifactExtractor.setExtractionSettings(context);
-            return artifactExtractor;
-        } else if (file instanceof Report) {
-            ContentTextExtractor<T> reportExtractor = new TikaTextExtractor<>();
-            reportExtractor.setExtractionSettings(context);
-            return reportExtractor;
+        } catch (TextExtractor.InitReaderException ex) {
+            throw new NoReaderFoundException(ex);
        }
 
-        throw new NoContentSpecificExtractorException(
+        throw new NoReaderFoundException(
                String.format("Could not find a suitable extractor for "
                        + "file with name [%s] and id=[%d]. Try using the default, "
                        + "non content specific extractor as an alternative.",
@@ -82,43 +90,34 @@ public class TextExtractorFactory {
                );
    }
 
-    /**
-     * Instantiates and returns a list of all of the known abstract file
-     * extractors.
-     *
-     * @return A list of specialized ContentTextExtractors
-     */
-    private static <T extends Content> List<ContentTextExtractor<T>> getAbstractFileExtractors() {
-        return Arrays.asList(
-                new HtmlTextExtractor<>(),
-                new SqliteTextExtractor<>(),
-                new TikaTextExtractor<>()
-        );
-    }
-
    /**
     * Returns the default extractor that can be run on any content type. This
     * extractor should be used as a backup in the event that no specialized
     * extractor can be found.
     *
+     * @param source
     * @param context Contains extraction configurations for certain file types
     *
     * @return A DefaultExtractor instance
     */
-    public static ContentTextExtractor<Content> getDefaultExtractor(ExtractionContext context) {
-        ContentTextExtractor<Content> stringsInstance = new StringsTextExtractor<>();
+    public static Reader getDefaultReader(Content source, ExtractionContext context) {
+        StringsTextExtractor stringsInstance = new StringsTextExtractor();
        stringsInstance.setExtractionSettings(context);
-        return stringsInstance;
+        return stringsInstance.getReader(source);
    }
 
    /**
     * System level exception for handling content types that have no specific
     * strategy defined for extracting their text.
     */
-    public static class NoContentSpecificExtractorException extends Exception {
+    public static class NoReaderFoundException extends Exception {
 
-        public NoContentSpecificExtractorException(String msg) {
+        public NoReaderFoundException(String msg) {
            super(msg);
        }
+
+        public NoReaderFoundException(Throwable ex) {
+            super(ex);
+        }
    }
 }
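The exception message above suggests the intended usage: try the content-specific reader first and fall back to getDefaultReader(). A sketch of that pattern (the wrapper method and variable names are illustrative):

```java
static Reader readerFor(Content content, ExtractionContext context) {
    try {
        // Content-specific first: HTML, SQLite, Tika, artifact, or report handling.
        return TextReader.getContentSpecificReader(content, context);
    } catch (TextReader.NoReaderFoundException ex) {
        // Fall back to raw string extraction, which accepts any Content.
        return TextReader.getDefaultReader(content, context);
    }
}
```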
@@ -18,6 +18,7 @@
  */
 package org.sleuthkit.autopsy.textextractors;
 
+import com.google.common.collect.ImmutableList;
 import com.google.common.io.CharSource;
 import java.io.File;
 import java.io.IOException;
@@ -54,7 +55,53 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * Extracts text from Tika supported content. Protects against Tika
  * parser hangs (for unexpected/corrupt content) using a timeout mechanism.
  */
-final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+final class TikaTextExtractor<T extends Content> implements TextExtractor<T> {
+
+    //Mimetype groups to aassist extractor implementations in ignoring binary and
+    //archive files.
+    private static final List<String> BINARY_MIME_TYPES
+            = ImmutableList.of(
+                    //ignore binary blob data, for which string extraction will be used
+                    "application/octet-stream", //NON-NLS
+                    "application/x-msdownload"); //NON-NLS
+
+    /** generally text extractors should ignore archives and let unpacking
+     * modules take care of them */
+    private static final List<String> ARCHIVE_MIME_TYPES
+            = ImmutableList.of(
+                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
+                    "application/x-7z-compressed", //NON-NLS
+                    "application/x-ace-compressed", //NON-NLS
+                    "application/x-alz-compressed", //NON-NLS
+                    "application/x-arj", //NON-NLS
+                    "application/vnd.ms-cab-compressed", //NON-NLS
+                    "application/x-cfs-compressed", //NON-NLS
+                    "application/x-dgc-compressed", //NON-NLS
+                    "application/x-apple-diskimage", //NON-NLS
+                    "application/x-gca-compressed", //NON-NLS
+                    "application/x-dar", //NON-NLS
+                    "application/x-lzx", //NON-NLS
+                    "application/x-lzh", //NON-NLS
+                    "application/x-rar-compressed", //NON-NLS
+                    "application/x-stuffit", //NON-NLS
+                    "application/x-stuffitx", //NON-NLS
+                    "application/x-gtar", //NON-NLS
+                    "application/x-archive", //NON-NLS
+                    "application/x-executable", //NON-NLS
+                    "application/x-gzip", //NON-NLS
+                    "application/zip", //NON-NLS
+                    "application/x-zoo", //NON-NLS
+                    "application/x-cpio", //NON-NLS
+                    "application/x-shar", //NON-NLS
+                    "application/x-tar", //NON-NLS
+                    "application/x-bzip", //NON-NLS
+                    "application/x-bzip2", //NON-NLS
+                    "application/x-lzip", //NON-NLS
+                    "application/x-lzma", //NON-NLS
+                    "application/x-lzop", //NON-NLS
+                    "application/x-z", //NON-NLS
+                    "application/x-compress"); //NON-NLS
+
 
     private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
 
@@ -74,11 +121,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
             .map(mt -> mt.getType() + "/" + mt.getSubtype())
             .collect(Collectors.toList());
 
-    @Override
-    public void logWarning(final String msg, Exception ex) {
-        tikaLogger.log(Level.WARNING, msg, ex);
-    }
-
     /**
      * Returns a reader that will iterate over the text extracted from Apache
      * Tika.
@@ -89,7 +131,7 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
      * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
      */
     @Override
-    public Reader getReader(Content content) throws TextExtractorException {
+    public Reader getReader(Content content) throws InitReaderException {
         ReadContentInputStream stream = new ReadContentInputStream(content);
 
         Metadata metadata = new Metadata();
@@ -136,7 +178,7 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
         PushbackReader pushbackReader = new PushbackReader(tikaReader);
         int read = pushbackReader.read();
         if (read == -1) {
-            throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + content);
+            throw new InitReaderException("Unable to extract text: Tika returned empty reader for " + content);
         }
         pushbackReader.unread(read);
 
@@ -145,15 +187,13 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
             return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
         } catch (TimeoutException te) {
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
-            logWarning(msg, te);
-            throw new TextExtractorException(msg, te);
-        } catch (TextExtractorException ex) {
+            throw new InitReaderException(msg, te);
+        } catch (InitReaderException ex) {
             throw ex;
         } catch (Exception ex) {
             tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
-            logWarning(msg, ex);
-            throw new TextExtractorException(msg, ex);
+            throw new InitReaderException(msg, ex);
         } finally {
             future.cancel(true);
         }
@@ -199,19 +239,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
                 ));
     }
 
-    /**
-     * Determines if this extractor only understands a specifc type of content.
-     *
-     * Although Apache Tika is defined for many input types, it is still a content
-     * specific approach to extraction.
-     *
-     * @return true
-     */
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
     /**
      * Determines if Tika is supported for this content type and mimetype.
     *
@@ -222,8 +249,8 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
     @Override
     public boolean isSupported(Content content, String detectedFormat) {
         if (detectedFormat == null
-                || ContentTextExtractor.BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
-                || ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
+                || BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
+                || ARCHIVE_MIME_TYPES.contains(detectedFormat)
                 || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
                 || detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
                 ) {
@@ -232,19 +259,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
         return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
     }
 
-    /**
-     * Determines if this extractor can be run.
-     *
-     * So long as Tika's dependencies are present, this extractor can run
-     * no matter the circumstance.
-     *
-     * @return true
-     */
-    @Override
-    public boolean isDisabled() {
-        return false;
-    }
-
     /**
      * Return timeout that should be used to index the content.
     *
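The PushbackReader probe in getReader() above is worth seeing in isolation: read a single character to detect an empty Tika reader, then push it back so no text is lost. A self-contained sketch (class and message are illustrative):

```java
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.StringReader;

class EmptyReaderProbe {

    // Returns the reader with its first character intact, or throws if empty.
    static Reader requireNonEmpty(Reader reader) throws IOException {
        PushbackReader pushbackReader = new PushbackReader(reader);
        int read = pushbackReader.read();
        if (read == -1) {
            throw new IOException("reader produced no text");
        }
        pushbackReader.unread(read); // put the probed character back
        return pushbackReader;
    }

    public static void main(String[] args) throws IOException {
        Reader ok = requireNonEmpty(new StringReader("hello"));
        System.out.println((char) ok.read()); // prints 'h'
    }
}
```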
@@ -19,6 +19,7 @@
 package org.sleuthkit.autopsy.keywordsearch;
 
 import java.io.BufferedReader;
+import java.io.Reader;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.logging.Level;
@@ -32,7 +33,6 @@ import org.sleuthkit.autopsy.healthmonitor.HealthMonitor;
 import org.sleuthkit.autopsy.healthmonitor.TimingMetric;
 import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
-import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
@@ -106,8 +106,8 @@ class Ingester {
      * @throws IngesterException if there was an error processing a specific
      *                           artifact, but the Solr server is probably fine.
      */
-    void indexMetaDataOnly(BlackboardArtifact artifact, TextExtractor<Content> extractor) throws IngesterException {
-        indexChunk("", extractor.getName(artifact), getContentFields(artifact));
+    void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
+        indexChunk("", sourceName, getContentFields(artifact));
     }
 
     /**
@@ -142,23 +142,12 @@ class Ingester {
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
     // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
-    <T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
-        final long sourceID = extractor.getID(source);
-        final String sourceName = extractor.getName(source);
-
+    <T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
         int numChunks = 0; //unknown until chunking is done
 
-        if (extractor.isDisabled()) {
-            /*
-             * some Extractors, notable the strings extractor, have options
-             * which can be configured such that no extraction should be done
-             */
-            return true;
-        }
-
         Map<String, String> fields = getContentFields(source);
         //Get a reader for the content of the given source
-        try (BufferedReader reader = new BufferedReader(extractor.getReader(source));) {
+        try (BufferedReader reader = new BufferedReader(sourceReader)) {
             Chunker chunker = new Chunker(reader);
             for (Chunk chunk : chunker) {
                 if (context != null && context.fileIngestIsCancelled()) {
@@ -173,18 +162,18 @@ class Ingester {
                     indexChunk(chunk.toString(), sourceName, fields);
                     numChunks++;
                 } catch (Ingester.IngesterException ingEx) {
-                    extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
+                    logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
                             + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
 
                    throw ingEx; //need to rethrow to signal error and move on
                }
            }
            if (chunker.hasException()) {
-                extractor.logWarning("Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
+                logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
                return false;
            }
        } catch (Exception ex) {
-            extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
+            logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } finally {
            if (context != null && context.fileIngestIsCancelled()) {
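indexText() no longer receives an extractor; the caller now supplies the Reader plus the source's id and name explicitly. A fragment assembled from the call sites later in this commit (variable names follow those call sites; aFile, extractionContext, and ingestContext are the surrounding method's locals):

```java
// Caller owns reader creation and passes source identity explicitly.
Reader specializedReader = TextReader.getContentSpecificReader(aFile, extractionContext);
boolean indexed = Ingester.getDefault().indexText(
        specializedReader, aFile.getId(), aFile.getName(), aFile, ingestContext);
```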
@@ -18,12 +18,14 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
-import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
+import com.google.common.collect.ImmutableList;
+import java.io.Reader;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.logging.Level;
 import org.openide.util.Exceptions;
 import org.openide.util.NbBundle;
 import org.openide.util.NbBundle.Messages;
 import org.sleuthkit.autopsy.casemodule.Case;
@@ -37,16 +39,15 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
 import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
 import org.sleuthkit.autopsy.ingest.IngestServices;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
+import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
 import org.sleuthkit.autopsy.textextractors.ExtractionContext;
-import org.sleuthkit.autopsy.textextractors.TextExtractor;
-import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
+import org.sleuthkit.autopsy.textextractors.TextReader;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskData;
 import org.sleuthkit.datamodel.TskData.FileKnown;
 
@@ -68,6 +69,43 @@ import org.sleuthkit.datamodel.TskData.FileKnown;
 })
 public final class KeywordSearchIngestModule implements FileIngestModule {
 
+    /** generally text extractors should ignore archives and let unpacking
+     * modules take care of them */
+    public static final List<String> ARCHIVE_MIME_TYPES
+            = ImmutableList.of(
+                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
+                    "application/x-7z-compressed", //NON-NLS
+                    "application/x-ace-compressed", //NON-NLS
+                    "application/x-alz-compressed", //NON-NLS
+                    "application/x-arj", //NON-NLS
+                    "application/vnd.ms-cab-compressed", //NON-NLS
+                    "application/x-cfs-compressed", //NON-NLS
+                    "application/x-dgc-compressed", //NON-NLS
+                    "application/x-apple-diskimage", //NON-NLS
+                    "application/x-gca-compressed", //NON-NLS
+                    "application/x-dar", //NON-NLS
+                    "application/x-lzx", //NON-NLS
+                    "application/x-lzh", //NON-NLS
+                    "application/x-rar-compressed", //NON-NLS
+                    "application/x-stuffit", //NON-NLS
+                    "application/x-stuffitx", //NON-NLS
+                    "application/x-gtar", //NON-NLS
+                    "application/x-archive", //NON-NLS
+                    "application/x-executable", //NON-NLS
+                    "application/x-gzip", //NON-NLS
+                    "application/zip", //NON-NLS
+                    "application/x-zoo", //NON-NLS
+                    "application/x-cpio", //NON-NLS
+                    "application/x-shar", //NON-NLS
+                    "application/x-tar", //NON-NLS
+                    "application/x-bzip", //NON-NLS
+                    "application/x-bzip2", //NON-NLS
+                    "application/x-lzip", //NON-NLS
+                    "application/x-lzma", //NON-NLS
+                    "application/x-lzop", //NON-NLS
+                    "application/x-z", //NON-NLS
+                    "application/x-compress"); //NON-NLS
+
     /**
      * Options for this extractor
      */
@@ -104,7 +142,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     //accessed read-only by searcher thread
 
     private boolean startedSearching = false;
-    private TextExtractor<Content> stringExtractor;
+    private ExtractionContext stringsExtractionContext;
     private final KeywordSearchJobSettings settings;
     private boolean initialized = false;
     private long jobId;
@@ -250,7 +288,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             }
         }
 
-        ExtractionContext extractionContext = new ExtractionContext();
+        stringsExtractionContext = new ExtractionContext();
 
         DefaultExtractionConfig stringsConfig = new DefaultExtractionConfig();
         Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
@@ -258,9 +296,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
         stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts());
 
-        extractionContext.set(DefaultExtractionConfig.class, stringsConfig);
+        stringsExtractionContext.set(DefaultExtractionConfig.class, stringsConfig);
 
-        stringExtractor = TextExtractorFactory.getDefaultExtractor(extractionContext);
         indexer = new Indexer();
         initialized = true;
     }
@@ -352,7 +389,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * Common cleanup code when module stops or final searcher completes
      */
     private void cleanup() {
-        stringExtractor = null;
+        stringsExtractionContext = null;
         initialized = false;
     }
 
@@ -440,7 +477,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * @throws IngesterException exception thrown if indexing failed
     */
     private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-        TextExtractor<Content> extractor = null;
        ExtractionContext extractionContext = new ExtractionContext();
 
        ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig();
@@ -448,10 +484,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
        extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
 
        try {
-            extractor = TextExtractorFactory.getContentSpecificExtractor(aFile,extractionContext);
+            Reader specializedReader = TextReader.getContentSpecificReader(aFile,extractionContext);
            //divide into chunks and index
-            return Ingester.getDefault().indexText(extractor, aFile, context);
-        } catch (TextExtractorFactory.NoContentSpecificExtractorException ex) {
+            return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
+        } catch (TextReader.NoReaderFoundException ex) {
            //No text extractor found... run the default instead
            return false;
        }
@@ -470,7 +506,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                if (context.fileIngestIsCancelled()) {
                    return true;
                }
-                if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) {
+                Reader stringsReader = TextReader.getDefaultReader(aFile, stringsExtractionContext);
+                if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
                    return true;
                } else {
@@ -530,7 +567,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
 
        // we skip archive formats that are opened by the archive module.
        // @@@ We could have a check here to see if the archive module was enabled though...
-        if (ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
+        if (ARCHIVE_MIME_TYPES.contains(fileType)) {
            try {
                if (context.fileIngestIsCancelled()) {
                    return;
@@ -579,11 +616,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
            //should be ignored by the TextFileExtractor because they may contain more than one text encoding
            try {
                TextFileExtractor textFileExtractor = new TextFileExtractor();
-                if (Ingester.getDefault().indexText(textFileExtractor, aFile, context)) {
+                Reader textReader = textFileExtractor.getReader(aFile);
+                if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                    wasTextAdded = true;
                }
-            } catch (IngesterException ex) {
+            } catch (IngesterException | TextFileExtractorException ex) {
                logger.log(Level.WARNING, "Unable to index as unicode", ex);
            }
        }
@@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.keywordsearch;
 
 import java.io.File;
 import java.io.IOException;
+import java.io.Reader;
 import java.lang.reflect.InvocationTargetException;
 import java.net.InetAddress;
 import java.util.ArrayList;
@@ -33,7 +34,6 @@ import org.apache.commons.lang.math.NumberUtils;
 import org.apache.commons.io.FileUtils;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.HttpSolrServer;
-import org.openide.util.Exceptions;
 import org.openide.util.NbBundle;
 import org.openide.util.lookup.ServiceProvider;
 import org.openide.util.lookup.ServiceProviders;
@@ -46,8 +46,7 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
 import org.sleuthkit.autopsy.progress.ProgressIndicator;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
-import org.sleuthkit.autopsy.textextractors.TextExtractor;
-import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
+import org.sleuthkit.autopsy.textextractors.TextReader;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskCoreException;
@@ -115,22 +114,23 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
                return;
            }
            try {
-                TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory
-                        .getContentSpecificExtractor(content, null);
-                ingester.indexMetaDataOnly(artifact, contentSpecificExtractor);
-                ingester.indexText(contentSpecificExtractor, artifact, null);
-            } catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) {
+                Reader blackboardReader = TextReader
+                        .getContentSpecificReader(content, null);
+                String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
+                ingester.indexMetaDataOnly(artifact, sourceName);
+                ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
+            } catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
                throw new TskCoreException(ex.getCause().getMessage(), ex);
            }
        } else {
            try {
-                TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory
-                        .getContentSpecificExtractor(content, null);
-                ingester.indexText(contentSpecificExtractor, content, null);
-            } catch (TextExtractorFactory.NoContentSpecificExtractorException | Ingester.IngesterException ex) {
+                Reader contentReader = TextReader
+                        .getContentSpecificReader(content, null);
+                ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
+            } catch (TextReader.NoReaderFoundException | Ingester.IngesterException ex) {
                try {
                    // Try the StringsTextExtractor if Tika extractions fails.
-                    ingester.indexText(TextExtractorFactory.getDefaultExtractor(null), content, null);
+                    ingester.indexText(TextReader.getDefaultReader(content, null),content.getId(),content.getName(), content, null);
                } catch (Ingester.IngesterException ex1) {
                    throw new TskCoreException(ex.getCause().getMessage(), ex1);
                }
@@ -444,11 +444,12 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
        final Ingester ingester = Ingester.getDefault();
 
        try {
-            TextExtractor<Content> contentSpecificExtractor =
-                    TextExtractorFactory.getContentSpecificExtractor((Content) artifact, null);
-            ingester.indexMetaDataOnly(artifact, contentSpecificExtractor);
-            ingester.indexText(contentSpecificExtractor, artifact, null);
-        } catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) {
+            String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
+            Reader contentSpecificReader =
+                    TextReader.getContentSpecificReader((Content) artifact, null);
+            ingester.indexMetaDataOnly(artifact, sourceName);
+            ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
+        } catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
            throw new TskCoreException(ex.getCause().getMessage(), ex);
        }
    }
@@ -21,19 +21,15 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.BufferedInputStream;
 import java.io.Reader;
-import java.util.logging.Level;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
-import org.sleuthkit.autopsy.coreutils.Logger;
-import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
-import org.sleuthkit.autopsy.textextractors.ExtractionContext;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
  * Extract text from .txt files
  */
-final class TextFileExtractor extends ContentTextExtractor<AbstractFile> {
+final class TextFileExtractor {
 
     //Set a Minimum confidence value to reject matches that may not have a valid text encoding
     //Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
@@ -41,47 +37,30 @@ final class TextFileExtractor extends ContentTextExtractor<AbstractFile> {
     //This limited information was used to select the current value as one that would filter out clearly non-text
     //files while hopefully working on all files with a valid text encoding
     static final private int MIN_MATCH_CONFIDENCE = 20;
-    static final private Logger logger = Logger.getLogger(TextFileExtractor.class.getName());
-
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
-    @Override
-    public boolean isSupported(AbstractFile file, String detectedFormat) {
-        return true;
-    }
 
-    @Override
-    public Reader getReader(AbstractFile source) throws TextExtractorException {
+    public Reader getReader(AbstractFile source) throws TextFileExtractorException {
        CharsetDetector detector = new CharsetDetector();
        //wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
        InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
        try {
            detector.setText(stream);
        } catch (IOException ex) {
-            throw new TextExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
+            throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
        }
        CharsetMatch match = detector.detect();
        if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
-            throw new TextExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
+            throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
        }
 
        return match.getReader();
    }
 
-    @Override
-    public boolean isDisabled() {
-        return false;
-    }
-
-    @Override
-    public void logWarning(String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex);
-    }
-
-    @Override
-    public void setExtractionSettings(ExtractionContext context) {
+    public class TextFileExtractorException extends Exception {
+        public TextFileExtractorException(String msg, Throwable ex) {
+            super(msg, ex);
        }
+        public TextFileExtractorException(String msg) {
+            super(msg);
+        }
    }
 }
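The charset-detection flow in getReader() above, as a self-contained sketch against Tika's CharsetDetector. The sample bytes and the IllegalStateException are illustrative; the threshold of 20 mirrors MIN_MATCH_CONFIDENCE:

```java
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;

class CharsetDetectionSketch {
    public static void main(String[] args) throws Exception {
        byte[] bytes = "plain text sample".getBytes(StandardCharsets.UTF_8);
        // BufferedInputStream supplies the mark/reset support CharsetDetector needs.
        InputStream stream = new BufferedInputStream(new ByteArrayInputStream(bytes));
        CharsetDetector detector = new CharsetDetector();
        detector.setText(stream);
        CharsetMatch match = detector.detect();
        if (match.getConfidence() < 20) { // same threshold as MIN_MATCH_CONFIDENCE
            throw new IllegalStateException("no confident charset match");
        }
        try (Reader reader = match.getReader()) {
            System.out.println(match.getName() + ", confidence " + match.getConfidence());
        }
    }
}
```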