Mirror of https://github.com/overcuriousity/autopsy-flatpak.git (synced 2025-07-17 18:17:43 +00:00)

Commit 85af7c57b6 (parent 1a70a4e8b2)

build out ArtifactExtractor
@@ -6,10 +6,51 @@
package org.sleuthkit.autopsy.keywordsearch;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.util.ContentStream;
import org.openide.util.Exceptions;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;

-public class ArtifactExtractor extends TextProvider<Void, BlackboardArtifact> {
+public class ArtifactExtractor extends TextExtractor<Void, BlackboardArtifact> {

    static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
        Content dataSource;
        Case currentCase;
        try {
            currentCase = Case.getCurrentCase();
        } catch (IllegalStateException ignore) {
            // thrown by Case.getCurrentCase() if currentCase is null
            return null;
        }

        SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
        if (sleuthkitCase == null) {
            return null;
        }

        AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
        if (abstractFile != null) {
            dataSource = abstractFile.getDataSource();
        } else {
            dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
        }

        if (dataSource == null) {
            return null;
        }
        return dataSource;
    }

    @Override
    boolean noExtractionOptionsAreEnabled() {
@@ -27,13 +68,99 @@ public class ArtifactExtractor extends TextProvider<Void, BlackboardArtifact> {
    }

    @Override
-   InputStream getInputStream(BlackboardArtifact source) {
-       throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+   InputStream getInputStream(BlackboardArtifact artifact) {

        // Concatenate the string values of all attributes into a single
        // "content" string to be indexed.
        StringBuilder artifactContents = new StringBuilder();
        Content dataSource;
        try {
            dataSource = getDataSource(artifact);
            if (dataSource == null) {
                return null;
            }

            for (BlackboardAttribute attribute : artifact.getAttributes()) {
                artifactContents.append(attribute.getAttributeType().getDisplayName());
                artifactContents.append(" : ");

                // This is ugly since it will need to be updated any time a new
                // TSK_DATETIME_* attribute is added. A slightly less ugly
                // alternative would be to assume that all date time attributes
                // will have a name of the form "TSK_DATETIME*" and check
                // attribute.getAttributeTypeName().startsWith("TSK_DATETIME").
                // The major problem with that approach is that it would require
                // a round trip to the database to get the type name string.
                // We have also discussed modifying BlackboardAttribute.getDisplayString()
                // to magically format datetime attributes but that is complicated by
                // the fact that BlackboardAttribute exists in the Sleuthkit data model
                // while the utility to determine the timezone to use is in ContentUtils
                // in the Autopsy datamodel.
                if (attribute.getValueType() == BlackboardAttribute.TSK_BLACKBOARD_ATTRIBUTE_VALUE_TYPE.DATETIME) {
                    artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
                } else {
                    artifactContents.append(attribute.getDisplayString());
                }
                artifactContents.append(System.lineSeparator());
            }
        } catch (TskCoreException ex) {
            Exceptions.printStackTrace(ex);
            return null;
        }
        if (artifactContents.length() == 0) {
            return null;
        }

        // To play by the rules of the existing text markup implementations,
        // we need to (a) index the artifact contents in a "chunk" and
        // (b) create a separate index entry for the base artifact.
        // We distinguish artifact content from file content by applying a
        // mask to the artifact id to make its value > 0x8000000000000000 (i.e. negative).
        // First, create an index entry for the base artifact.
        HashMap<String, String> solrFields = new HashMap<>();
        String documentId = Long.toString(artifact.getArtifactID());

        solrFields.put(Server.Schema.ID.toString(), documentId);

        // Set the IMAGE_ID field.
        solrFields.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));

        // Next create the index entry for the document content.
        // The content gets added to a single chunk. We may need to add chunking
        // support later.
        long chunkId = 1;

        documentId += "_" + Long.toString(chunkId);
        solrFields.replace(Server.Schema.ID.toString(), documentId);

        return IOUtils.toInputStream(artifactContents);
    }

    @Override
    Reader getReader(InputStream stream, BlackboardArtifact source, Void appendix) throws Ingester.IngesterException {
-       throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+       return new InputStreamReader(stream);
    }

    @Override
    long getID(BlackboardArtifact source) {
        return source.getArtifactID();
    }

    @Override
    ContentStream getContentStream(byte[] encodedBytes, int length, BlackboardArtifact source) {
        return new ByteArtifactStream(encodedBytes, length, source);
    }

    @Override
    ContentStream getNullStream(BlackboardArtifact source) {
        return new Ingester.NullArtifactStream(source);
    }

    @Override
    String getName(BlackboardArtifact source) {
        return source.getDisplayName();
    }
}
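Aside (not part of this commit): the 0x8000000000000000 comment above refers to the keyword-search convention that files are indexed under their positive object ids while artifacts are indexed under negative ids, so the two id ranges cannot collide. A minimal sketch of how the Solr document ids for one artifact line up under that scheme, mirroring the id handling in getInputStream() above (the artifact variable is assumed to be in scope):

    // Sketch only: artifact ids indexed here are expected to be negative.
    long artifactId = artifact.getArtifactID();
    String baseDocumentId = Long.toString(artifactId);    // index entry for the artifact itself
    String chunkDocumentId = baseDocumentId + "_" + 1;    // its single content chunk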
@@ -0,0 +1,100 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import org.apache.solr.common.util.ContentStream;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import static org.sleuthkit.autopsy.keywordsearch.Bundle.*;
import org.sleuthkit.datamodel.BlackboardArtifact;

/**
 * Stream of bytes representing a string with a specified encoding, fed into
 * Solr as a ContentStream.
 */
class ByteArtifactStream implements ContentStream {

    //input
    private final byte[] content; //extracted subcontent
    private final long contentSize;
    private final BlackboardArtifact aContent; //origin

    private final InputStream stream;

    private static final Logger logger = Logger.getLogger(ByteArtifactStream.class.getName());

    public ByteArtifactStream(byte[] content, long contentSize, BlackboardArtifact aContent) {
        this.content = content;
        this.contentSize = contentSize;
        this.aContent = aContent;
        stream = new ByteArrayInputStream(content, 0, (int) contentSize);
    }

    public byte[] getByteContent() {
        return content;
    }

    public BlackboardArtifact getSourceContent() {
        return aContent;
    }

    @Override
    public String getContentType() {
        return "text/plain;charset=" + Server.DEFAULT_INDEXED_TEXT_CHARSET.name(); //NON-NLS
    }

    @Override
    public String getName() {
        return aContent.getDisplayName();
    }

    @Override
    public Reader getReader() throws IOException {
        return new InputStreamReader(stream);
    }

    @Override
    public Long getSize() {
        return contentSize;
    }

    @Override
    @NbBundle.Messages("ByteArtifactStream.getSrcInfo.text=Artifact:{0}")
    public String getSourceInfo() {
        return ByteArtifactStream_getSrcInfo_text(aContent.getArtifactID());
    }

    @Override
    public InputStream getStream() throws IOException {
        return stream;
    }

    @Override
    protected void finalize() throws Throwable {
        super.finalize();

        stream.close();
    }

}
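A minimal usage sketch for the new stream class (illustrative only; the artifact variable and the sample text are assumptions, while the constructor signature and Server.DEFAULT_INDEXED_TEXT_CHARSET come from this commit):

    // Sketch: wrap extracted artifact text as a Solr ContentStream.
    String text = "TSK_URL : http://www.example.com";                    // hypothetical extracted attribute text
    byte[] encoded = text.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET); // same charset the index expects
    ByteArtifactStream bas = new ByteArtifactStream(encoded, encoded.length, artifact);
    // bas.getName() reports the artifact's display name; bas.getSize() the byte count.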
@@ -0,0 +1,124 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.util.Arrays;
import java.util.List;
import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.datamodel.AbstractFile;

/**
 * Common methods for utilities that extract text and content and divide into
 * chunks
 */
abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {

    /**
     * Common options that can be used by some extractors
     */
    enum ExtractOptions {

        EXTRACT_UTF16, ///< extract UTF16 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
        EXTRACT_UTF8, ///< extract UTF8 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
    };

    static final List<String> BLOB_MIME_TYPES
            = Arrays.asList(
                    //ignore binary blob data, for which string extraction will be used
                    "application/octet-stream", //NON-NLS
                    "application/x-msdownload"); //NON-NLS

    /** generally text extractors should ignore archives and let unpacking
     * modules take care of them */
    static final List<String> ARCHIVE_MIME_TYPES
            = Arrays.asList(
                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                    "application/x-7z-compressed", //NON-NLS
                    "application/x-ace-compressed", //NON-NLS
                    "application/x-alz-compressed", //NON-NLS
                    "application/x-arj", //NON-NLS
                    "application/vnd.ms-cab-compressed", //NON-NLS
                    "application/x-cfs-compressed", //NON-NLS
                    "application/x-dgc-compressed", //NON-NLS
                    "application/x-apple-diskimage", //NON-NLS
                    "application/x-gca-compressed", //NON-NLS
                    "application/x-dar", //NON-NLS
                    "application/x-lzx", //NON-NLS
                    "application/x-lzh", //NON-NLS
                    "application/x-rar-compressed", //NON-NLS
                    "application/x-stuffit", //NON-NLS
                    "application/x-stuffitx", //NON-NLS
                    "application/x-gtar", //NON-NLS
                    "application/x-archive", //NON-NLS
                    "application/x-executable", //NON-NLS
                    "application/x-gzip", //NON-NLS
                    "application/zip", //NON-NLS
                    "application/x-zoo", //NON-NLS
                    "application/x-cpio", //NON-NLS
                    "application/x-shar", //NON-NLS
                    "application/x-tar", //NON-NLS
                    "application/x-bzip", //NON-NLS
                    "application/x-bzip2", //NON-NLS
                    "application/x-lzip", //NON-NLS
                    "application/x-lzma", //NON-NLS
                    "application/x-lzop", //NON-NLS
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS

    /**
     * Determines whether the extractor works only for the types specified in
     * supportedTypes() or whether it is a generic content extractor (such as
     * the string extractor).
     *
     * @return true if the extractor is content-type specific, false otherwise
     */
    abstract boolean isContentTypeSpecific();

    /**
     * Determines if the file content is supported by the extractor if
     * isContentTypeSpecific() returns true.
     *
     * @param file           the file to test if its content should be supported
     * @param detectedFormat the MIME type with the detected format (such as
     *                       text/plain) or null if not detected
     *
     * @return true if the file content is supported, false otherwise
     */
    abstract boolean isSupported(AbstractFile file, String detectedFormat);

    @Override
    long getID(AbstractFile source) {
        return source.getId();
    }

    @Override
    ContentStream getContentStream(byte[] encodedBytes, int length, AbstractFile source) {
        return new ByteContentStream(encodedBytes, length, source);
    }

    @Override
    ContentStream getNullStream(AbstractFile source) {
        return new Ingester.NullContentStream(source);
    }

    @Override
    String getName(AbstractFile source) {
        return source.getName();
    }
}
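To make the division of labor concrete, here is a rough sketch of what a minimal concrete file extractor would have to supply (illustrative only, not part of this commit; it assumes the same package and the usual imports as the extractors in this change, including java.util.logging.Level, and the class name is hypothetical):

    // Sketch: minimal FileTextExtractor subclass shape.
    class PlainTextExtractor extends FileTextExtractor<Void> {

        private static final Logger logger = Logger.getLogger(PlainTextExtractor.class.getName());

        @Override
        boolean isContentTypeSpecific() {
            return true; // only handles the types declared in isSupported()
        }

        @Override
        boolean isSupported(AbstractFile file, String detectedFormat) {
            return "text/plain".equals(detectedFormat);
        }

        @Override
        boolean noExtractionOptionsAreEnabled() {
            return false; // nothing configurable to disable
        }

        @Override
        void logWarning(String msg, Exception ex) {
            logger.log(Level.WARNING, msg, ex);
        }

        @Override
        Void newAppendixProvider() {
            return null; // no appendix needed
        }

        @Override
        InputStream getInputStream(AbstractFile source) {
            return new ReadContentInputStream(source);
        }

        @Override
        Reader getReader(InputStream stream, AbstractFile source, Void appendix) {
            return new InputStreamReader(stream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
        }
    }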
@@ -39,7 +39,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 * divided into chunks and indexed with Solr. If HTML extraction succeeds,
 * chunks are indexed with Solr.
 */
-class HtmlTextExtractor extends TextExtractor<Void> {
+class HtmlTextExtractor extends FileTextExtractor<Void> {

    private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());

@@ -36,14 +36,16 @@ import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.datamodel.AbstractContent;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ContentVisitor;
import org.sleuthkit.datamodel.DerivedFile;
import org.sleuthkit.datamodel.Directory;
import org.sleuthkit.datamodel.File;
import org.sleuthkit.datamodel.LayoutFile;
import org.sleuthkit.datamodel.LocalFile;
import org.sleuthkit.datamodel.SlackFile;
import org.sleuthkit.datamodel.SleuthkitItemVisitor;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
import org.sleuthkit.datamodel.TskCoreException;

/**
@@ -99,6 +101,11 @@ class Ingester {
        indexContentStream(new NullContentStream(file), getContentFields(file), 0);
    }

    void indexMetaDataOnly(BlackboardArtifact artifact) throws IngesterException {
//        indexContentStream(new NullContentStream(artifact), getContentFields(file), 0);
    }

    /**
     * Sends a TextExtractor to Solr to have its content extracted and added to
     * the index. commit() should be called once you're done ingesting files.
@@ -117,6 +124,12 @@ class Ingester {
        indexContentStream(new NullContentStream(file), params, 0);
    }

    private void recordNumberOfChunks(BlackboardArtifact artifact, int numChunks) throws IngesterException {
        Map<String, String> params = getContentFields(artifact);
        params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
        indexContentStream(new NullArtifactStream(artifact), params, 0);
    }

    /**
     * Creates a field map from FsContent that is later sent to Solr
     *
@@ -124,19 +137,14 @@ class Ingester {
     *
     * @return the map
     */
-   Map<String, String> getContentFields(AbstractContent fsc) {
+   Map<String, String> getContentFields(SleuthkitVisitableItem fsc) {
        return fsc.accept(getContentFieldsV);
    }

    /**
     * Visitor used to create param list to send to SOLR index.
     */
-   static private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
-
-       @Override
-       protected Map<String, String> defaultVisit(Content cntnt) {
-           return new HashMap<>();
-       }
+   static private class GetContentFieldsV extends SleuthkitItemVisitor.Default<Map<String, String>> {

        @Override
        public Map<String, String> visit(File f) {
@@ -201,21 +209,46 @@ class Ingester {
            params.put(Server.Schema.FILE_NAME.toString(), af.getName());
            return params;
        }

        @Override
        public Map<String, String> visit(BlackboardArtifact artifact) {

            Map<String, String> params = new HashMap<>();
            params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
            try {
                Content dataSource = ArtifactExtractor.getDataSource(artifact);
                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
            } catch (TskCoreException ex) {
                logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact {0}", artifact.getArtifactID()); //NON-NLS
                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
            }

            return params;
        }

        @Override
        protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
            return new HashMap<>();
        }
    }

    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
    private static final int SINGLE_READ_CHARS = 1024;
    private static final int EXTRA_CHARS = 128; //for whitespace

-   public <T> boolean indexText(TextExtractor<T> extractor, AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
+   public <A, T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<A, T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
        int numChunks = 0; //unknown until chunking is done

        if (extractor.noExtractionOptionsAreEnabled()) {
            return true;
        }
-       T appendix = extractor.newAppendixProvider();
-       try (final InputStream stream = extractor.getInputStream(sourceFile);
-               Reader reader = extractor.getReader(stream, sourceFile, appendix);) {
+       final long sourceID = extractor.getID(source);
+       final String sourceName = extractor.getName(source);
+       Map<String, String> fields = getContentFields(source);
+
+       A appendix = extractor.newAppendixProvider();
+       try (final InputStream stream = extractor.getInputStream(source);
+               Reader reader = extractor.getReader(stream, source, appendix);) {

            //we read max 1024 chars at a time, this seems to max what this Reader would return
            char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
|
||||
|
||||
//encode to bytes as UTF-8 to index as byte stream
|
||||
byte[] encodedBytes = chunkString.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
|
||||
String chunkId = Server.getChunkIdString(sourceFile.getId(), numChunks + 1);
|
||||
|
||||
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
|
||||
try {
|
||||
ByteContentStream bcs = new ByteContentStream(encodedBytes, encodedBytes.length, sourceFile);
|
||||
Map<String, String> fields = getContentFields(sourceFile);
|
||||
ContentStream bcs = extractor.getContentStream(encodedBytes, encodedBytes.length, source);
|
||||
try {
|
||||
indexContentStream(bcs, fields, encodedBytes.length);
|
||||
} catch (Exception ex) {
|
||||
@ -277,20 +310,21 @@ class Ingester {
|
||||
numChunks++;
|
||||
} catch (Ingester.IngesterException ingEx) {
|
||||
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
|
||||
+ sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);//NON-NLS
|
||||
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
|
||||
|
||||
throw ingEx; //need to rethrow to signal error and move on
|
||||
}
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
extractor.logWarning("Unable to read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex);//NON-NLS
|
||||
extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
||||
return false;
|
||||
} catch (Exception ex) {
|
||||
extractor.logWarning("Unexpected error, can't read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex);//NON-NLS
|
||||
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
||||
return false;
|
||||
} finally {
|
||||
//after all chunks, ingest the parent file without content itself, and store numChunks
|
||||
recordNumberOfChunks(sourceFile, numChunks);
|
||||
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
|
||||
indexContentStream(extractor.getNullStream(source), fields, 0);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -442,7 +476,7 @@ class Ingester {
|
||||
/**
|
||||
* ContentStream associated with FsContent, but forced with no content
|
||||
*/
|
||||
private static class NullContentStream implements ContentStream {
|
||||
static class NullContentStream implements ContentStream {
|
||||
|
||||
AbstractContent aContent;
|
||||
|
||||
@ -482,6 +516,50 @@ class Ingester {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* ContentStream associated with Artifact, but forced with no content
|
||||
*/
|
||||
static class NullArtifactStream implements ContentStream {
|
||||
|
||||
BlackboardArtifact aContent;
|
||||
|
||||
NullArtifactStream(BlackboardArtifact aContent) {
|
||||
this.aContent = aContent;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return aContent.getDisplayName();
|
||||
}
|
||||
|
||||
@NbBundle.Messages("Ingester.NullArtifactStream.getSrcInfo.text=File:{0})\n")
|
||||
@Override
|
||||
public String getSourceInfo() {
|
||||
return Bundle.Ingester_NullArtifactStream_getSrcInfo_text(aContent.getArtifactID());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getContentType() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long getSize() {
|
||||
return 0L;
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputStream getStream() throws IOException {
|
||||
return new ByteArrayInputStream(new byte[0]);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader getReader() throws IOException {
|
||||
throw new UnsupportedOperationException(
|
||||
NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getReader"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates that there was an error with the specific ingest operation, but
|
||||
* it's still okay to continue ingesting files.
|
||||
|
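From the caller's side, a sketch of the generified indexText(...) path (illustrative only; it follows the three-argument signature introduced above, assumes abstractFile, artifact, and context are in scope, relies on the extractors' existing no-argument constructors, and omits IngesterException handling):

    // Sketch: one indexing path now serves both files and artifacts.
    Ingester ingester = Ingester.getDefault();
    // A file goes through a FileTextExtractor subclass chosen for its MIME type...
    ingester.indexText(new StringsTextExtractor(), abstractFile, context);
    // ...while an artifact goes through the new ArtifactExtractor.
    ingester.indexText(new ArtifactExtractor(), artifact, context);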
@@ -103,12 +103,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem

    private void reloadScriptsCheckBoxes() {
        boolean utf16
-               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));

        enableUTF16Checkbox.setSelected(utf16);

        boolean utf8
-               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
        enableUTF8Checkbox.setSelected(utf8);

        final List<SCRIPT> serviceScripts = KeywordSearchSettings.getStringExtractScripts();
@@ -127,12 +127,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
        reloadScriptsCheckBoxes();

        boolean utf16
-               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));

        enableUTF16Checkbox.setSelected(utf16);

        boolean utf8
-               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
        enableUTF8Checkbox.setSelected(utf8);
        final boolean extractEnabled = utf16 || utf8;

@@ -257,9 +257,9 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem

    @Override
    public void store() {
-       KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
+       KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
                Boolean.toString(enableUTF8Checkbox.isSelected()));
-       KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
+       KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
                Boolean.toString(enableUTF16Checkbox.isSelected()));

        if (toUpdate != null) {
@@ -89,7 +89,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
    //accessed read-only by searcher thread

    private boolean startedSearching = false;
-   private List<TextExtractor<?>> textExtractors;
+   private List<FileTextExtractor<?>> textExtractors;
    private StringsTextExtractor stringExtractor;
    private final KeywordSearchJobSettings settings;
    private boolean initialized = false;
@@ -415,10 +415,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     * @throws IngesterException exception thrown if indexing failed
     */
    private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-       TextExtractor extractor = null;
+       FileTextExtractor extractor = null;

        //go over available text extractors in order, and pick the first one (most specific one)
-       for (TextExtractor fe : textExtractors) {
+       for (FileTextExtractor fe : textExtractors) {
            if (fe.isSupported(aFile, detectedFormat)) {
                extractor = fe;
                break;
@@ -514,7 +514,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {

        // we skip archive formats that are opened by the archive module.
        // @@@ We could have a check here to see if the archive module was enabled though...
-       if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
+       if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
            try {
                if (context.fileIngestIsCancelled()) {
                    return;
@@ -101,8 +101,8 @@ public final class KeywordSearchJobSettingsPanel extends IngestModuleIngestJobSe
    }

    private void displayEncodings() {
-       String utf8 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
-       String utf16 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
+       String utf8 = KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
+       String utf16 = KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
        ArrayList<String> encodingsList = new ArrayList<>();
        if (utf8 == null || Boolean.parseBoolean(utf8)) {
            encodingsList.add("UTF8");
@@ -211,14 +211,14 @@ class KeywordSearchSettings {
            KeywordSearchSettings.setUpdateFrequency(UpdateFrequency.DEFAULT);
        }
        //setting default Extract UTF8
-       if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
+       if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
            logger.log(Level.INFO, "No configuration for UTF8 found, generating default..."); //NON-NLS
-           KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
+           KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
        }
        //setting default Extract UTF16
-       if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
+       if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
            logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS
-           KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
+           KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
        }
        //setting default Latin-1 Script
        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_SCRIPTS, SCRIPT.LATIN_1.name())) {
@@ -20,22 +20,14 @@ package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.net.InetAddress;
import java.util.HashMap;
import java.util.MissingResourceException;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.util.ContentStreamBase.StringStream;
import org.openide.util.NbBundle;
import org.openide.util.lookup.ServiceProvider;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;

/**
@@ -49,6 +41,8 @@ public class SolrSearchService implements KeywordSearchService {
    private static final String SERVER_REFUSED_CONNECTION = "server refused connection"; //NON-NLS
    private static final int IS_REACHABLE_TIMEOUT_MS = 1000;

+   ArtifactExtractor extractor = new ArtifactExtractor();

    @Override
    public void indexArtifact(BlackboardArtifact artifact) throws TskCoreException {
        if (artifact == null) {
@@ -57,109 +51,18 @@ public class SolrSearchService implements KeywordSearchService {

        // We only support artifact indexing for Autopsy versions that use
        // the negative range for artifact ids.
        long artifactId = artifact.getArtifactID();

-       if (artifactId > 0) {
+       if (artifact.getArtifactID() > 0) {
            return;
        }

        Case currentCase;
        try {
            currentCase = Case.getCurrentCase();
        } catch (IllegalStateException ignore) {
            // thrown by Case.getCurrentCase() if currentCase is null
            return;
        }

        SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
        if (sleuthkitCase == null) {
            return;
        }

        Content dataSource;
        AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
        if (abstractFile != null) {
            dataSource = abstractFile.getDataSource();
        } else {
            dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
        }

        if (dataSource == null) {
            return;
        }

        // Concatenate the string values of all attributes into a single
        // "content" string to be indexed.
        StringBuilder artifactContents = new StringBuilder();

        for (BlackboardAttribute attribute : artifact.getAttributes()) {
            artifactContents.append(attribute.getAttributeType().getDisplayName());
            artifactContents.append(" : ");

            // This is ugly since it will need to be updated any time a new
            // TSK_DATETIME_* attribute is added. A slightly less ugly
            // alternative would be to assume that all date time attributes
            // will have a name of the form "TSK_DATETIME*" and check
            // attribute.getAttributeTypeName().startsWith("TSK_DATETIME").
            // The major problem with that approach is that it would require
            // a round trip to the database to get the type name string.
            // We have also discussed modifying BlackboardAttribute.getDisplayString()
            // to magically format datetime attributes but that is complicated by
            // the fact that BlackboardAttribute exists in the Sleuthkit data model
            // while the utility to determine the timezone to use is in ContentUtils
            // in the Autopsy datamodel.
            if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_RCVD.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_SENT.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_START.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_END.getTypeID()) {

                artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
            } else {
                artifactContents.append(attribute.getDisplayString());
            }
            artifactContents.append(System.lineSeparator());
        }

        if (artifactContents.length() == 0) {
            return;
        }

        // To play by the rules of the existing text markup implementations,
        // we need to (a) index the artifact contents in a "chunk" and
        // (b) create a separate index entry for the base artifact.
        // We distinguish artifact content from file content by applying a
        // mask to the artifact id to make its value > 0x8000000000000000 (i.e. negative).
        // First, create an index entry for the base artifact.
        HashMap<String, String> solrFields = new HashMap<>();
        String documentId = Long.toString(artifactId);

        solrFields.put(Server.Schema.ID.toString(), documentId);

        // Set the IMAGE_ID field.
        solrFields.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));

        try {
-           Ingester.getDefault().indexContentStream(new StringStream(""), solrFields, 0);
+           Ingester.getDefault().indexMetaDataOnly(artifact);
        } catch (Ingester.IngesterException ex) {
            throw new TskCoreException(ex.getCause().getMessage(), ex);
        }

        // Next create the index entry for the document content.
        // The content gets added to a single chunk. We may need to add chunking
        // support later.
        long chunkId = 1;

        documentId += "_" + Long.toString(chunkId);
        solrFields.replace(Server.Schema.ID.toString(), documentId);

        StringStream contentStream = new StringStream(artifactContents.toString());

        try {
-           Ingester.getDefault().indexContentStream(contentStream, solrFields, contentStream.getSize());
+           Ingester.getDefault().indexText(extractor, artifact);
        } catch (Ingester.IngesterException ex) {
            throw new TskCoreException(ex.getCause().getMessage(), ex);
        }
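For context, callers reach this code through the KeywordSearchService lookup rather than by instantiating SolrSearchService directly; the ServiceProvider import above suggests it is registered that way. A rough sketch of the unchanged entry point, with TskCoreException handling trimmed and the artifact variable assumed to be in scope:

    // Sketch: the public API stays the same; only the indexing internals changed.
    KeywordSearchService searchService = Lookup.getDefault().lookup(KeywordSearchService.class);
    searchService.indexArtifact(artifact);   // now delegates to ArtifactExtractor + Ingester.indexText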
@@ -38,7 +38,7 @@ import org.sleuthkit.datamodel.TskException;
 * with the original source file) up to 1MB and then indexes chunks as text with
 * Solr.
 */
-class StringsTextExtractor extends TextExtractor<Void> {
+class StringsTextExtractor extends FileTextExtractor<Void> {

    private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
    private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
@@ -94,8 +94,8 @@ class StringsTextExtractor extends TextExtractor<Void> {

    @Override
    boolean noExtractionOptionsAreEnabled() {
-       boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
-       boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+       boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+       boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));

        return extractUTF8 == false && extractUTF16 == false;
    }
@@ -120,8 +120,8 @@ class StringsTextExtractor extends TextExtractor<Void> {
     */
    @Override
    InputStream getInputStream(AbstractFile sourceFile) {
-       boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
-       boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+       boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+       boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));

        //check which extract stream to use
        InputStream stringStream = extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)
@@ -1,7 +1,7 @@
/*
 * Autopsy Forensic Browser
 *
- * Copyright 2011-2016 Basis Technology Corp.
+ * Copyright 2011-16 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,89 +18,30 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

-import java.util.Arrays;
-import java.util.List;
-import org.sleuthkit.datamodel.AbstractFile;
+import java.io.InputStream;
+import java.io.Reader;
+import org.apache.solr.common.util.ContentStream;
+import org.sleuthkit.datamodel.SleuthkitVisitableItem;

/**
 * Common methods for utilities that extract text and content and divide into
 * chunks
 */
-abstract class TextExtractor<AppendixProvider> extends TextProvider<AppendixProvider, AbstractFile> {
-
-    /**
-     * Common options that can be used by some extractors
-     */
-    enum ExtractOptions {
-
-        EXTRACT_UTF16, ///< extract UTF16 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
-        EXTRACT_UTF8, ///< extract UTF8 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
-    };
-
-    static final List<String> BLOB_MIME_TYPES
-            = Arrays.asList(
-                    //ignore binary blob data, for which string extraction will be used
-                    "application/octet-stream", //NON-NLS
-                    "application/x-msdownload"); //NON-NLS
-
-    /** generally text extractors should ignore archives and let unpacking
-     * modules take care of them */
-    static final List<String> ARCHIVE_MIME_TYPES
-            = Arrays.asList(
-                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
-                    "application/x-7z-compressed", //NON-NLS
-                    "application/x-ace-compressed", //NON-NLS
-                    "application/x-alz-compressed", //NON-NLS
-                    "application/x-arj", //NON-NLS
-                    "application/vnd.ms-cab-compressed", //NON-NLS
-                    "application/x-cfs-compressed", //NON-NLS
-                    "application/x-dgc-compressed", //NON-NLS
-                    "application/x-apple-diskimage", //NON-NLS
-                    "application/x-gca-compressed", //NON-NLS
-                    "application/x-dar", //NON-NLS
-                    "application/x-lzx", //NON-NLS
-                    "application/x-lzh", //NON-NLS
-                    "application/x-rar-compressed", //NON-NLS
-                    "application/x-stuffit", //NON-NLS
-                    "application/x-stuffitx", //NON-NLS
-                    "application/x-gtar", //NON-NLS
-                    "application/x-archive", //NON-NLS
-                    "application/x-executable", //NON-NLS
-                    "application/x-gzip", //NON-NLS
-                    "application/zip", //NON-NLS
-                    "application/x-zoo", //NON-NLS
-                    "application/x-cpio", //NON-NLS
-                    "application/x-shar", //NON-NLS
-                    "application/x-tar", //NON-NLS
-                    "application/x-bzip", //NON-NLS
-                    "application/x-bzip2", //NON-NLS
-                    "application/x-lzip", //NON-NLS
-                    "application/x-lzma", //NON-NLS
-                    "application/x-lzop", //NON-NLS
-                    "application/x-z", //NON-NLS
-                    "application/x-compress"); //NON-NLS
-
-    /**
-     * Determines whether the extractor works only for the types specified in
-     * supportedTypes() or whether it is a generic content extractor (such as
-     * the string extractor).
-     *
-     * @return true if the extractor is content-type specific, false otherwise
-     */
-    abstract boolean isContentTypeSpecific();
-
-    /**
-     * Determines if the file content is supported by the extractor if
-     * isContentTypeSpecific() returns true.
-     *
-     * @param file           the file to test if its content should be supported
-     * @param detectedFormat the MIME type with the detected format (such as
-     *                       text/plain) or null if not detected
-     *
-     * @return true if the file content is supported, false otherwise
-     */
-    abstract boolean isSupported(AbstractFile file, String detectedFormat);
+abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisitableItem> {
+
+    abstract boolean noExtractionOptionsAreEnabled();
+
+    abstract void logWarning(final String msg, Exception ex);
+
+    void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider) {
+        //no-op
+    }
+
+    abstract AppendixProvider newAppendixProvider();
+
+    abstract InputStream getInputStream(TextSource source);
+
+    abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
+
+    abstract long getID(TextSource source);
+
+    abstract ContentStream getContentStream(byte[] encodedBytes, int length, TextSource source);
+    abstract String getName(TextSource source);
+    abstract ContentStream getNullStream(TextSource source);
+}
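A brief sketch of how the generic parameters line up across the refactored hierarchy (summary only; the declaration at the end is illustrative and assumes TikaTextExtractor's existing no-argument construction):

    // TextExtractor<AppendixProvider, TextSource> is the new root:
    //   ArtifactExtractor        extends TextExtractor<Void, BlackboardArtifact>
    //   FileTextExtractor<A>     extends TextExtractor<A, AbstractFile>
    //     HtmlTextExtractor      extends FileTextExtractor<Void>
    //     StringsTextExtractor   extends FileTextExtractor<Void>
    //     TikaTextExtractor      extends FileTextExtractor<Metadata>   // Tika Metadata as the appendix
    TextExtractor<Metadata, AbstractFile> tikaExtractor = new TikaTextExtractor();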
@@ -1,39 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2011-16 Basis Technology Corp.
- * Contact: carrier <at> sleuthkit <dot> org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.sleuthkit.autopsy.keywordsearch;
-
-import java.io.InputStream;
-import java.io.Reader;
-
-abstract class TextProvider<AppendixProvider, TextSource> {
-
-    abstract boolean noExtractionOptionsAreEnabled();
-
-    abstract void logWarning(final String msg, Exception ex);
-
-    void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider) {
-        //no-op
-    }
-
-    abstract AppendixProvider newAppendixProvider();
-
-    abstract InputStream getInputStream(TextSource source);
-
-    abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
-}
@@ -49,7 +49,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 * parsers-supported content type.
 *
 */
-class TikaTextExtractor extends TextExtractor<Metadata> {
+class TikaTextExtractor extends FileTextExtractor<Metadata> {

    private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
@@ -110,8 +110,8 @@ class TikaTextExtractor extends TextExtractor<Metadata> {
    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        if (detectedFormat == null
-               || TextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
-               || TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
+               || FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
+               || FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
                || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
                || detectedFormat.equals("application/x-font-ttf")) { // Tika currently has a bug in the ttf parser in fontbox; It will throw an out of memory exception//NON-NLS

@@ -123,6 +123,7 @@ class TikaTextExtractor extends TextExtractor<Metadata> {
        return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
    }

+   @Override
    InputStream getInputStream(AbstractFile sourceFile1) {
        return new ReadContentInputStream(sourceFile1);
    }
@@ -131,4 +132,5 @@ class TikaTextExtractor extends TextExtractor<Metadata> {
    boolean noExtractionOptionsAreEnabled() {
        return false;
    }

}