From 8b8196c21899a1c90730bdf585fd92b43fc6dc6c Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Thu, 15 Nov 2018 14:40:49 -0500
Subject: [PATCH 01/18] Move text extractors into Core and flesh out their
 configuration; seems to be working, but will test further

---
 Core/ivy.xml                                  |   1 +
 Core/nbproject/project.properties             |   1 +
 Core/nbproject/project.xml                    |  31 ++---
 .../textextractors}/ContentTextExtractor.java |  13 ++-
 .../textextractors/ExtractionContext.java     |  28 +++++
 .../textextractors}/HtmlTextExtractor.java    |  27 +++--
 .../textextractors}/SqliteTextExtractor.java  |  13 ++-
 .../autopsy/textextractors}/SqliteUtil.java   |   2 +-
 .../textextractors}/StringsTextExtractor.java |  85 ++++++--------
 .../textextractors}/TextExtractor.java        |   8 +-
 .../textextractors/TextExtractorFactory.java  |  48 ++++++++
 .../textextractors}/TikaTextExtractor.java    |  25 ++--
 .../HTMLExtractionConfig.java                 |  22 ++++
 .../ImageFileExtractionConfig.java            |  22 ++++
 .../StringsExtractionConfig.java              |  43 +++++++
 .../TextFileExtractionConfig.java             |  22 ++++
 .../keywordsearch/ArtifactTextExtractor.java  |   1 +
 .../autopsy/keywordsearch/Ingester.java       |   2 +-
 ...wordSearchGlobalLanguageSettingsPanel.java |  13 ++-
 .../KeywordSearchIngestModule.java            | 108 ++++++++++++------
 .../KeywordSearchJobSettingsPanel.java        |   5 +-
 .../keywordsearch/KeywordSearchSettings.java  |   8 +-
 .../keywordsearch/SolrSearchService.java      |   2 +
 .../keywordsearch/TextFileExtractor.java      |  10 +-
 24 files changed, 398 insertions(+), 142 deletions(-)
 rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/ContentTextExtractor.java (92%)
 create mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java
 rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/HtmlTextExtractor.java (88%)
 rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/SqliteTextExtractor.java (97%)
 rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/SqliteUtil.java (99%)
 rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/StringsTextExtractor.java (92%)
 rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/TextExtractor.java (91%)
 create mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
 rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/TikaTextExtractor.java (91%)
 create mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/HTMLExtractionConfig.java
 create mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
 create mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java
 create mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/TextFileExtractionConfig.java
diff --git a/Core/ivy.xml b/Core/ivy.xml
index cbe45d8c33..dc1896882f 100644
--- a/Core/ivy.xml
+++ b/Core/ivy.xml
@@ -33,6 +33,7 @@
         <dependency conf="core->default" org="com.googlecode.plist" name="dd-plist" rev="1.20"/>
          
         <dependency conf="core->default" org="commons-validator" name="commons-validator" rev="1.6"/>
+        <dependency conf="core->default" org="net.htmlparser.jericho" name="jericho-html" rev="3.3"/>
         
     </dependencies>
 </ivy-module>
diff --git a/Core/nbproject/project.properties b/Core/nbproject/project.properties
index 89bf4a93d7..8dcea69608 100644
--- a/Core/nbproject/project.properties
+++ b/Core/nbproject/project.properties
@@ -6,6 +6,7 @@ file.reference.commons-pool2-2.4.2.jar=release\\modules\\ext\\commons-pool2-2.4.
 file.reference.dd-plist-1.20.jar=release/modules/ext/dd-plist-1.20.jar
 file.reference.jdom-2.0.5-contrib.jar=release/modules/ext/jdom-2.0.5-contrib.jar
 file.reference.jdom-2.0.5.jar=release/modules/ext/jdom-2.0.5.jar
+file.reference.jericho-html-3.3.jar=release/modules/ext/jericho-html-3.3.jar
 file.reference.jgraphx-v3.8.0.jar=release/modules/ext/jgraphx-v3.8.0.jar
 file.reference.jsoup-1.10.3.jar=release/modules/ext/jsoup-1.10.3.jar
 file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
diff --git a/Core/nbproject/project.xml b/Core/nbproject/project.xml
index 7a89519e34..2158785ecc 100644
--- a/Core/nbproject/project.xml
+++ b/Core/nbproject/project.xml
@@ -338,13 +338,10 @@
                 <package>org.sleuthkit.autopsy.modules.vmextractor</package>
                 <package>org.sleuthkit.autopsy.progress</package>
                 <package>org.sleuthkit.autopsy.report</package>
-                <package>org.sleuthkit.autopsy.tabulardatareader</package>
+                <package>org.sleuthkit.autopsy.textextractors</package>
+                <package>org.sleuthkit.autopsy.textextractors.extractionconfigs</package>
                 <package>org.sleuthkit.datamodel</package>
             </public-packages>
-            <class-path-extension>
-                <runtime-relative-path>ext/jackcess-2.2.0.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/jackcess-2.2.0.jar</binary-origin>
-            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/zookeeper-3.4.6.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/zookeeper-3.4.6.jar</binary-origin>
@@ -357,6 +354,18 @@
                 <runtime-relative-path>ext/cxf-rt-transports-http-3.0.16.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/cxf-rt-transports-http-3.0.16.jar</binary-origin>
             </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jackcess-2.2.0.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jackcess-2.2.0.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/sleuthkit-postgresql-4.6.4.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/sleuthkit-postgresql-4.6.4.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jericho-html-3.3.jar</runtime-relative-path>
+                <binary-origin>release\modules\ext\jericho-html-3.3.jar</binary-origin>
+            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/commons-validator-1.6.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/commons-validator-1.6.jar</binary-origin>
@@ -393,14 +402,14 @@
                 <runtime-relative-path>ext/sevenzipjbinding.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/sevenzipjbinding.jar</binary-origin>
             </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/sleuthkit-postgresql-4.6.4.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/sleuthkit-postgresql-4.6.4.jar</binary-origin>
-            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/mchange-commons-java-0.2.9.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/mchange-commons-java-0.2.9.jar</binary-origin>
             </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jackcess-encrypt-2.1.4.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jackcess-encrypt-2.1.4.jar</binary-origin>
+            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/cxf-core-3.0.16.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/cxf-core-3.0.16.jar</binary-origin>
@@ -481,10 +490,6 @@
                 <runtime-relative-path>ext/commons-pool2-2.4.2.jar</runtime-relative-path>
                 <binary-origin>release\modules\ext\commons-pool2-2.4.2.jar</binary-origin>
             </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/jackcess-encrypt-2.1.4.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/jackcess-encrypt-2.1.4.jar</binary-origin>
-            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/jsoup-1.10.3.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/jsoup-1.10.3.jar</binary-origin>
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ContentTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
similarity index 92%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ContentTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
index bba2df2ced..b97c1e3ab3 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ContentTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
@@ -16,7 +16,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.sleuthkit.autopsy.keywordsearch;
+package org.sleuthkit.autopsy.textextractors;
 
 import java.io.Reader;
 import java.util.Arrays;
@@ -27,9 +27,8 @@ import org.sleuthkit.datamodel.Content;
  * Common methods for utilities that extract text and content and divide into
  * chunks
  */
-abstract class ContentTextExtractor implements TextExtractor<Content> {
-
-
+public abstract class ContentTextExtractor implements TextExtractor<Content> {
+    
     static final List<String> BINARY_MIME_TYPES
             = Arrays.asList(
                     //ignore binary blob data, for which string extraction will be used
@@ -80,7 +79,7 @@ abstract class ContentTextExtractor implements TextExtractor<Content> {
      *
      * @return
      */
-    abstract boolean isContentTypeSpecific();
+    public abstract boolean isContentTypeSpecific();
 
     /**
      * Determines if the file content is supported by the extractor if
@@ -92,7 +91,9 @@ abstract class ContentTextExtractor implements TextExtractor<Content> {
      *
      * @return true if the file content is supported, false otherwise
      */
-    abstract boolean isSupported(Content file, String detectedFormat);
+    public abstract boolean isSupported(Content file, String detectedFormat);
+    
+    public abstract void parseContext(ExtractionContext context);
 
     @Override
     public abstract Reader getReader(Content source) throws TextExtractorException;
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java b/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java
new file mode 100755
index 0000000000..d291164a2e
--- /dev/null
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java
@@ -0,0 +1,28 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.textextractors;
+
+import com.google.common.collect.ClassToInstanceMap;
+import com.google.common.collect.MutableClassToInstanceMap;
+
+/**
+ * Type-safe holder that maps an extraction config class to its one instance.
+ *
+ * @author dsmyda
+ */
+public class ExtractionContext {
+    private final ClassToInstanceMap<Object> extractionConfigs = MutableClassToInstanceMap.create();
+
+    /** Stores configInstance under configClass, replacing any earlier instance. */
+    public <T> void set(Class<T> configClass, T configInstance) {
+        extractionConfigs.putInstance(configClass, configInstance);
+    }
+
+    /** Returns the instance stored under configClass, or null if none was set. */
+    public <T> T get(Class<T> configClass) {
+        return extractionConfigs.getInstance(configClass);
+    }
+}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
similarity index 88%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
index 32842dbc03..f834359514 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
@@ -16,13 +16,14 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.sleuthkit.autopsy.keywordsearch;
+package org.sleuthkit.autopsy.textextractors;
 
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Objects;
 import java.util.logging.Level;
 import net.htmlparser.jericho.Attributes;
 import net.htmlparser.jericho.Config;
@@ -32,16 +33,17 @@ import net.htmlparser.jericho.Source;
 import net.htmlparser.jericho.StartTag;
 import net.htmlparser.jericho.StartTagType;
 import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.autopsy.textextractors.extractionconfigs.HTMLExtractionConfig;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
  * Extracts text from HTML content.
  */
-class HtmlTextExtractor extends ContentTextExtractor {
+public class HtmlTextExtractor extends ContentTextExtractor {
 
     static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
-    private static final int MAX_SIZE = 50_000_000; //50MB
+    private int maxSize = 50_000_000;
 
     static final List<String> WEB_MIME_TYPES = Arrays.asList(
             "application/javascript", //NON-NLS
@@ -58,15 +60,16 @@ class HtmlTextExtractor extends ContentTextExtractor {
     }
 
     @Override
-    boolean isContentTypeSpecific() {
+    public boolean isContentTypeSpecific() {
         return true;
     }
 
     @Override
-    boolean isSupported(Content content, String detectedFormat) {
-        return detectedFormat != null
-                && WEB_MIME_TYPES.contains(detectedFormat)
-                && content.getSize() <= MAX_SIZE;
+    public boolean isSupported(Content content, String detectedFormat) {
+        // Short-circuit so the size lookup only happens for web MIME types;
+        // maxSize is the limit parseContext() may override (default 50 MB).
+        return detectedFormat != null && WEB_MIME_TYPES.contains(detectedFormat)
+                && content.getSize() <= maxSize;
     }
 
     @Override
@@ -177,4 +180,12 @@ class HtmlTextExtractor extends ContentTextExtractor {
     public void logWarning(final String msg, Exception ex) {
         logger.log(Level.WARNING, msg, ex); //NON-NLS  }
     }
+
+    @Override
+    public void parseContext(ExtractionContext context) {
+        // Adopt the size limit of the HTML config, if the context carries one;
+        // otherwise the current limit stays in effect.
+        final HTMLExtractionConfig htmlConfig = context.get(HTMLExtractionConfig.class);
+        maxSize = (htmlConfig != null) ? htmlConfig.getContentSizeLimit() : maxSize;
+    }
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
similarity index 97%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
index c8bbe289e4..37cd2cca87 100755
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
@@ -16,7 +16,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.sleuthkit.autopsy.keywordsearch;
+package org.sleuthkit.autopsy.textextractors;
 
 import com.google.common.io.CharSource;
 import java.io.IOException;
@@ -45,14 +45,14 @@ import org.sleuthkit.datamodel.TskCoreException;
  * tables with spaces in table name 3) Tika fails to include the table names in
  * output (except for the first table it parses)
  */
-class SqliteTextExtractor extends ContentTextExtractor {
+public class SqliteTextExtractor extends ContentTextExtractor {
 
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
     private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
     private static final CharSequence EMPTY_CHARACTER_SEQUENCE = "";
 
     @Override
-    boolean isContentTypeSpecific() {
+    public boolean isContentTypeSpecific() {
         return true;
     }
 
@@ -75,7 +75,7 @@ class SqliteTextExtractor extends ContentTextExtractor {
      * @return true if x-sqlite3
      */
     @Override
-    boolean isSupported(Content file, String detectedFormat) {
+    public boolean isSupported(Content file, String detectedFormat) {
         return SQLITE_MIMETYPE.equals(detectedFormat);
     }
 
@@ -106,6 +106,11 @@ class SqliteTextExtractor extends ContentTextExtractor {
         }
     }
 
+    @Override
+    public void parseContext(ExtractionContext context) {
+        // Intentionally empty: this extractor reads no settings from the context.
+    }
+
     /**
      * Lazily loads tables from the database during reading to conserve memory.
      */
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteUtil.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteUtil.java
similarity index 99%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteUtil.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/SqliteUtil.java
index 08eefe7232..ae09972acf 100755
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteUtil.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteUtil.java
@@ -16,7 +16,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.sleuthkit.autopsy.keywordsearch;
+package org.sleuthkit.autopsy.textextractors;
 
 import java.io.File;
 import java.io.IOException;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
similarity index 92%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
index 391c7d5a7c..869ab8a992 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
@@ -16,19 +16,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.sleuthkit.autopsy.keywordsearch;
+package org.sleuthkit.autopsy.textextractors;
 
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
+import org.sleuthkit.autopsy.textextractors.extractionconfigs.StringsExtractionConfig;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskCoreException;
 import org.sleuthkit.datamodel.TskException;
@@ -36,21 +36,42 @@ import org.sleuthkit.datamodel.TskException;
 /**
  * Extracts raw strings from content.
  */
-class StringsTextExtractor extends ContentTextExtractor {
-
-    static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
-
+public class StringsTextExtractor extends ContentTextExtractor {
+    
+    private boolean extractUTF8;
+    private boolean extractUTF16;
+    
     /**
      * Options for this extractor
      */
-    enum ExtractOptions {
+    public enum ExtractOptions {
         EXTRACT_UTF16, ///< extract UTF16 text, true/false
         EXTRACT_UTF8, ///< extract UTF8 text, true/false
     };
 
-    private final List<SCRIPT> extractScripts = new ArrayList<>();
-    private Map<String, String> extractOptions = new HashMap<>();
+    static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
 
+    @Override
+    public void parseContext(ExtractionContext context) {
+        // Both encoding flags keep their current values when no strings config is present.
+        final StringsExtractionConfig stringsConfig = context.get(StringsExtractionConfig.class);
+        final boolean configured = stringsConfig != null;
+        extractUTF8 = configured ? stringsConfig.getExtractUTF8() : extractUTF8;
+        extractUTF16 = configured ? stringsConfig.getExtractUTF16() : extractUTF16;
+    }
+
+    @Override
+    public boolean isContentTypeSpecific() {
+        return false; // not tied to a content type: isSupported() accepts everything (value before the move)
+    }
+
+    @Override
+    public boolean isSupported(Content file, String detectedFormat) {
+        return true; // strings can be run on anything, whatever the detected format
+    }
+
+    private final List<SCRIPT> extractScripts = new ArrayList<>();
+    
     public StringsTextExtractor() {
         //LATIN_2 is the default script
         extractScripts.add(SCRIPT.LATIN_2);
@@ -75,25 +96,6 @@ class StringsTextExtractor extends ContentTextExtractor {
         return new ArrayList<>(extractScripts);
     }
 
-    /**
-     * Get current options
-     *
-     * @return currently used, extractor specific options, or null of not
-     *         supported
-     */
-    public Map<String, String> getOptions() {
-        return extractOptions;
-    }
-
-    /**
-     * Set extractor specific options
-     *
-     * @param options options to use
-     */
-    public void setOptions(Map<String, String> options) {
-        this.extractOptions = options;
-    }
-
     @Override
     public void logWarning(final String msg, Exception ex) {
         logger.log(Level.WARNING, msg, ex); //NON-NLS  }
@@ -101,16 +103,13 @@ class StringsTextExtractor extends ContentTextExtractor {
 
     @Override
     public boolean isDisabled() {
-        boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
-        boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
-
         return extractUTF8 == false && extractUTF16 == false;
     }
 
     @Override
     public InputStreamReader getReader(Content content) throws TextExtractorException {
         InputStream stringStream = getInputStream(content);
-        return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
+        return new InputStreamReader(stringStream, Charset.forName("UTF-8"));
     }
 
     InputStream getInputStream(Content content) {
@@ -118,24 +117,10 @@ class StringsTextExtractor extends ContentTextExtractor {
         if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
             return new EnglishOnlyStream(content);//optimal for english, english only
         } else {
-            boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
-            boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
-
             return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16);
         }
     }
-
-    @Override
-    public boolean isContentTypeSpecific() {
-        return false;
-    }
-
-    @Override
-    public boolean isSupported(Content content, String detectedFormat) {
-        // strings can be run on anything. 
-        return true;
-    }
-
+    
     /**
      * Content input string stream reader/converter - given Content,
      * extract strings from it and return encoded bytes via read()
@@ -328,7 +313,7 @@ class StringsTextExtractor extends ContentTextExtractor {
         private int copyToReturn(byte[] b, int off, long len) {
             final String curStringS = curString.toString();
             //logger.log(Level.INFO, curStringS);
-            byte[] stringBytes = curStringS.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
+            byte[] stringBytes = curStringS.getBytes(Charset.forName("UTF-8"));
             System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
             //logger.log(Level.INFO, curStringS);
             //copied all string, reset
@@ -499,7 +484,7 @@ class StringsTextExtractor extends ContentTextExtractor {
          */
         private void convert(int numBytes) {
             lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
-            convertBuff = lastExtractResult.getText().getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
+            convertBuff = lastExtractResult.getText().getBytes(Charset.forName("UTF-8"));
             //reset tracking vars
             if (lastExtractResult.getNumBytes() == 0) {
                 bytesInConvertBuff = 0;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
similarity index 91%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
index 94abb940eb..1efed82c05 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
@@ -16,7 +16,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.sleuthkit.autopsy.keywordsearch;
+package org.sleuthkit.autopsy.textextractors;
 
 import java.io.Reader;
 import org.sleuthkit.datamodel.SleuthkitVisitableItem;
@@ -28,7 +28,7 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
  * @param <TextSource> The subtype of SleuthkitVisitableItem an implementation
  *                     is able to process.
  */
-interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
+public interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
 
     /**
      * Is this extractor configured such that no extraction will/should be done?
@@ -48,12 +48,10 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
     /**
      * Get a reader that over the text extracted from the given source.
      *
-     * @param stream
      * @param source
      *
      * @return
-     *
-     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
+     * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
      */
     abstract Reader getReader(TextSource source) throws TextExtractorException;
 
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
new file mode 100755
index 0000000000..453c6900ee
--- /dev/null
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -0,0 +1,48 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.textextractors;
+
+import com.google.common.collect.ImmutableList;
+import org.sleuthkit.datamodel.AbstractFile;
+
+/**
+ * Picks the {@link ContentTextExtractor} that can handle a given file.
+ *
+ * @author dsmyda
+ */
+public class TextExtractorFactory {
+    private static final java.util.logging.Logger LOGGER = java.util.logging.Logger.getLogger(TextExtractorFactory.class.getName());
+    private static final ImmutableList<ContentTextExtractor> extractors = 
+                ImmutableList.of(new HtmlTextExtractor(), 
+                                 new SqliteTextExtractor(),
+                                 new TikaTextExtractor(),
+                                 new StringsTextExtractor());
+
+    /**
+     * Returns a new, configured extractor of the first type (HTML, SQLite, Tika,
+     * strings) that supports the file, or null when none could be provided.
+     * @param file     the file whose text is to be extracted
+     * @param mimeType the MIME type detected for the file
+     * @param context  configs handed to each extractor; must not be null
+     * @return a fresh extractor instance, or null
+     */
+    public static ContentTextExtractor getExtractor(AbstractFile file, String mimeType, ExtractionContext context) {
+        for (ContentTextExtractor candidate : extractors) {
+            candidate.parseContext(context); // isSupported() can depend on configured limits
+            if (candidate.isSupported(file, mimeType)) {
+                try {
+                    ContentTextExtractor extractorInstance = candidate.getClass().newInstance();
+                    extractorInstance.parseContext(context);
+                    return extractorInstance;
+                } catch (InstantiationException | IllegalAccessException ex) {
+                    // Do not fail silently: record the problem, then try the next extractor.
+                    LOGGER.log(java.util.logging.Level.WARNING, "Failed to instantiate " + candidate.getClass().getName(), ex); //NON-NLS
+                }
+            }
+        }
+        return null;
+    }
+}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
similarity index 91%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index 2b1d24f89b..ddfed5a776 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -16,7 +16,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.sleuthkit.autopsy.keywordsearch;
+package org.sleuthkit.autopsy.textextractors;
 
 import com.google.common.io.CharSource;
 import java.io.File;
@@ -25,6 +25,7 @@ import java.io.PushbackReader;
 import java.io.Reader;
 import java.nio.file.Paths;
 import java.util.List;
+import java.util.Objects;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
@@ -44,8 +45,8 @@ import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.openide.util.NbBundle;
 import org.openide.modules.InstalledFileLocator;
-import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.PlatformUtil;
+import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
@@ -53,9 +54,11 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * Extracts text from Tika supported content. Protects against Tika
  * parser hangs (for unexpected/corrupt content) using a timeout mechanism.
  */
-class TikaTextExtractor extends ContentTextExtractor {
+public class TikaTextExtractor extends ContentTextExtractor {
+    
+    private boolean OCREnabled;
 
-    static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
+    private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
 
@@ -73,7 +76,7 @@ class TikaTextExtractor extends ContentTextExtractor {
 
     @Override
     public void logWarning(final String msg, Exception ex) {
-        KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
+        tikaLogger.log(Level.WARNING, msg, ex);
     }
 
     @Override
@@ -92,7 +95,7 @@ class TikaTextExtractor extends ContentTextExtractor {
         parseContext.set(OfficeParserConfig.class, officeParserConfig);
         
         // configure OCR if it is enabled in KWS settings and installed on the machine
-        if (TESSERACT_PATH != null && KeywordSearchSettings.getOcrOption() && PlatformUtil.isWindowsOS() == true) {
+        if (TESSERACT_PATH != null && OCREnabled && PlatformUtil.isWindowsOS() == true) {
             
             // configure PDFParser. 
             PDFParserConfig pdfConfig = new PDFParserConfig();
@@ -138,7 +141,7 @@ class TikaTextExtractor extends ContentTextExtractor {
         } catch (TextExtractorException ex) {
             throw ex;
         } catch (Exception ex) {
-            KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
+            tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
             logWarning(msg, ex);
             throw new TextExtractorException(msg, ex);
@@ -233,6 +236,14 @@ class TikaTextExtractor extends ContentTextExtractor {
 
     }
 
+    @Override
+    public void parseContext(ExtractionContext context) {
+        ImageFileExtractionConfig configInstance = context.get(ImageFileExtractionConfig.class);
+        if(Objects.nonNull(configInstance)) { // no image config in the context: OCREnabled keeps its current value
+            this.OCREnabled = configInstance.getOCREnabled();
+        }
+    }
+
     /**
      * An implementation of CharSource that just wraps an existing reader and
      * returns it in openStream().
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/HTMLExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/HTMLExtractionConfig.java
new file mode 100755
index 0000000000..6a15d57165
--- /dev/null
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/HTMLExtractionConfig.java
@@ -0,0 +1,22 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.textextractors.extractionconfigs;
+
+/**
+ * Mutable holder for a content size limit that callers hand to text extractors through an ExtractionContext.
+ * @author dsmyda
+ */
+public class HTMLExtractionConfig {
+    private int contentSizeLimit; // unit presumably bytes (the ingest module passes 50000000 as "50 MB") - TODO confirm
+        
+    public void setContentSizeLimit(int size) {
+        this.contentSizeLimit = size;
+    }
+
+    public int getContentSizeLimit() {
+        return this.contentSizeLimit;
+    }
+}
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
new file mode 100755
index 0000000000..1aa320b84a
--- /dev/null
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
@@ -0,0 +1,22 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.textextractors.extractionconfigs;
+
+/**
+ * Mutable holder for the OCR on/off flag; TikaTextExtractor.parseContext reads it from the ExtractionContext.
+ * @author dsmyda
+ */
+public class ImageFileExtractionConfig {
+    private boolean OCREnabled; // false until setOCREnabled is called
+        
+    public void setOCREnabled(boolean enabled) {
+        this.OCREnabled = enabled;
+    }
+
+    public boolean getOCREnabled() {
+        return this.OCREnabled;
+    }
+}
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java
new file mode 100755
index 0000000000..1850552077
--- /dev/null
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java
@@ -0,0 +1,43 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.textextractors.extractionconfigs;
+
+import java.util.List;
+import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
+
+/**
+ * Mutable holder for string-extraction settings: the UTF-8 and UTF-16 flags and the list of scripts to extract.
+ * @author dsmyda
+ */
+public class StringsExtractionConfig {
+    private boolean extractUTF8;
+    private boolean extractUTF16;
+    private List<SCRIPT> extractScripts; // stays null until setExtractScripts is called
+
+    public void setExtractUTF8(boolean enabled) {
+        this.extractUTF8 = enabled;
+    }
+
+    public void setExtractUTF16(boolean enabled) {
+        this.extractUTF16 = enabled;
+    }
+
+    public boolean getExtractUTF8() {
+        return extractUTF8;
+    }
+
+    public boolean getExtractUTF16() { 
+        return extractUTF16;
+    }
+    
+    public void setExtractScripts(List<SCRIPT> scripts) {
+        this.extractScripts = scripts;
+    }
+    
+    public List<SCRIPT> getExtractScripts() {
+        return this.extractScripts;
+    }
+}
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/TextFileExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/TextFileExtractionConfig.java
new file mode 100755
index 0000000000..b2cb62d728
--- /dev/null
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/TextFileExtractionConfig.java
@@ -0,0 +1,22 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.textextractors.extractionconfigs;
+
+/**
+ * Mutable holder for the minimum charset-detection confidence setting (scale not shown here - TODO confirm).
+ * @author dsmyda
+ */
+public class TextFileExtractionConfig {
+    private int minConfidenceInCharsetDetection; // 0 until the setter is called
+        
+    public void setMinConfidenceInCharsetDetection(int conf) {
+        this.minConfidenceInCharsetDetection = conf;
+    }
+
+    public int getMinConfidenceInCharsetDetection() {
+        return this.minConfidenceInCharsetDetection;
+    }
+}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
index a2724e72da..cb01961c6a 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
@@ -28,6 +28,7 @@ import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.datamodel.ContentUtils;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.BlackboardAttribute;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index c855edfed6..459bf3710a 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -32,6 +32,7 @@ import org.sleuthkit.autopsy.healthmonitor.HealthMonitor;
 import org.sleuthkit.autopsy.healthmonitor.TimingMetric;
 import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
@@ -58,7 +59,6 @@ class Ingester {
     private final Server solrServer = KeywordSearch.getServer();
     private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
     private static Ingester instance;
-    private static final int SINGLE_READ_CHARS = 512;
 
     private Ingester() {
     }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
index 76f80026e0..8a5745c974 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
@@ -18,6 +18,7 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
+import org.sleuthkit.autopsy.textextractors.StringsTextExtractor;
 import java.awt.EventQueue;
 import java.awt.GridLayout;
 import java.awt.event.ActionEvent;
@@ -125,12 +126,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
 
     private void reloadScriptsCheckBoxes() {
         boolean utf16
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption("EXTRACT_UTF16"));
 
         enableUTF16Checkbox.setSelected(utf16);
 
         boolean utf8
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption("EXTRACT_UTF8"));
         enableUTF8Checkbox.setSelected(utf8);
 
         boolean ocr = KeywordSearchSettings.getOcrOption();
@@ -152,12 +153,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
         reloadScriptsCheckBoxes();
 
         boolean utf16
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption("EXTRACT_UTF16"));
 
         enableUTF16Checkbox.setSelected(utf16);
 
         boolean utf8
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption("EXTRACT_UTF8"));
         enableUTF8Checkbox.setSelected(utf8);
         final boolean extractEnabled = utf16 || utf8;
 
@@ -316,9 +317,9 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
 
     @Override
     public void store() {
-        KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
+        KeywordSearchSettings.setStringExtractOption("EXTRACT_UTF8",
                 Boolean.toString(enableUTF8Checkbox.isSelected()));
-        KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
+        KeywordSearchSettings.setStringExtractOption("EXTRACT_UTF16",
                 Boolean.toString(enableUTF16Checkbox.isSelected()));
         KeywordSearchSettings.setOcrOption(enableOcrCheckbox.isSelected());
 
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index 8b9a71b207..ff44def95b 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -18,7 +18,8 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
-import java.util.ArrayList;
+import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -40,6 +41,12 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
+import org.sleuthkit.autopsy.textextractors.ExtractionContext;
+import org.sleuthkit.autopsy.textextractors.StringsTextExtractor;
+import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
+import org.sleuthkit.autopsy.textextractors.extractionconfigs.HTMLExtractionConfig;
+import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
+import org.sleuthkit.autopsy.textextractors.extractionconfigs.StringsExtractionConfig;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.TskData;
 import org.sleuthkit.datamodel.TskData.FileKnown;
@@ -61,6 +68,47 @@ import org.sleuthkit.datamodel.TskData.FileKnown;
     "CannotRunFileTypeDetection=Unable to run file type detection."
 })
 public final class KeywordSearchIngestModule implements FileIngestModule {
+    
+    static final List<String> ARCHIVE_MIME_TYPES
+            = Arrays.asList(
+                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
+                    "application/x-7z-compressed", //NON-NLS
+                    "application/x-ace-compressed", //NON-NLS
+                    "application/x-alz-compressed", //NON-NLS
+                    "application/x-arj", //NON-NLS
+                    "application/vnd.ms-cab-compressed", //NON-NLS
+                    "application/x-cfs-compressed", //NON-NLS
+                    "application/x-dgc-compressed", //NON-NLS
+                    "application/x-apple-diskimage", //NON-NLS
+                    "application/x-gca-compressed", //NON-NLS
+                    "application/x-dar", //NON-NLS
+                    "application/x-lzx", //NON-NLS
+                    "application/x-lzh", //NON-NLS
+                    "application/x-rar-compressed", //NON-NLS
+                    "application/x-stuffit", //NON-NLS
+                    "application/x-stuffitx", //NON-NLS
+                    "application/x-gtar", //NON-NLS
+                    "application/x-archive", //NON-NLS
+                    "application/x-executable", //NON-NLS
+                    "application/x-gzip", //NON-NLS
+                    "application/zip", //NON-NLS
+                    "application/x-zoo", //NON-NLS
+                    "application/x-cpio", //NON-NLS
+                    "application/x-shar", //NON-NLS
+                    "application/x-tar", //NON-NLS
+                    "application/x-bzip", //NON-NLS
+                    "application/x-bzip2", //NON-NLS
+                    "application/x-lzip", //NON-NLS
+                    "application/x-lzma", //NON-NLS
+                    "application/x-lzop", //NON-NLS
+                    "application/x-z", //NON-NLS
+                    "application/x-compress"); //NON-NLS
+    
+    static final List<String> BINARY_MIME_TYPES
+            = Arrays.asList(
+                    //ignore binary blob data, for which string extraction will be used
+                    "application/octet-stream", //NON-NLS
+                    "application/x-msdownload"); //NON-NLS
 
     enum UpdateFrequency {
 
@@ -89,13 +137,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     //accessed read-only by searcher thread
 
     private boolean startedSearching = false;
-    private List<ContentTextExtractor> textExtractors;
     private StringsTextExtractor stringExtractor;
-    private TextFileExtractor txtFileExtractor;
     private final KeywordSearchJobSettings settings;
     private boolean initialized = false;
     private long jobId;
-    private long dataSourceId;
     private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
     private int instanceNum = 0;
     private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
@@ -152,7 +197,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     public void startUp(IngestJobContext context) throws IngestModuleException {
         initialized = false;
         jobId = context.getJobId();
-        dataSourceId = context.getDataSource().getId();
 
         Server server = KeywordSearch.getServer();
         if (server.coreIsOpen() == false) {
@@ -239,20 +283,18 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             }
         }
 
-        //initialize extractors
+        ExtractionContext extractionContext = new ExtractionContext();
+        
+        StringsExtractionConfig stringsConfig = new StringsExtractionConfig();
+        Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
+        stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get("EXTRACT_UTF8")));
+        stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get("EXTRACT_UTF16")));
+        stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts());
+        
+        extractionContext.set(StringsExtractionConfig.class, stringsConfig);
+        
         stringExtractor = new StringsTextExtractor();
-        stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
-        stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
-
-        txtFileExtractor = new TextFileExtractor();
-
-        textExtractors = new ArrayList<>();
-        //order matters, more specific extractors first
-        textExtractors.add(new HtmlTextExtractor());
-        //Add sqlite text extractor to be default for sqlite files, since tika stuggles 
-        //with them. See SqliteTextExtractor class for specifics
-        textExtractors.add(new SqliteTextExtractor());
-        textExtractors.add(new TikaTextExtractor());
+        stringExtractor.parseContext(extractionContext);
 
         indexer = new Indexer();
         initialized = true;
@@ -345,10 +387,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * Common cleanup code when module stops or final searcher completes
      */
     private void cleanup() {
-        textExtractors.clear();
-        textExtractors = null;
         stringExtractor = null;
-        txtFileExtractor = null;
         initialized = false;
     }
 
@@ -437,16 +476,18 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
          */
         private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
             ContentTextExtractor extractor = null;
-
-            //go over available text extractors in order, and pick the first one (most specific one)
-            for (ContentTextExtractor fe : textExtractors) {
-                if (fe.isSupported(aFile, detectedFormat)) {
-                    extractor = fe;
-                    break;
-                }
-            }
-
-            if (extractor == null) {
+            ExtractionContext extractionContext = new ExtractionContext();
+            
+            ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig();
+            imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
+            extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
+            
+            HTMLExtractionConfig htmlConfig = new HTMLExtractionConfig();
+            htmlConfig.setContentSizeLimit(50000000); //50 MB
+            extractionContext.set(HTMLExtractionConfig.class, htmlConfig);
+            
+            extractor = TextExtractorFactory.getExtractor(aFile, detectedFormat, extractionContext);
+            if (extractor == null || extractor instanceof StringsTextExtractor) { // getName() is fully qualified, so the old simple-name comparison never matched; the factory may also return null
                 // No text extractor found.
                 return false;
             }
@@ -529,7 +570,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
 
             // we skip archive formats that are opened by the archive module. 
             // @@@ We could have a check here to see if the archive module was enabled though...
-            if (ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
+            if (KeywordSearchIngestModule.ARCHIVE_MIME_TYPES.contains(fileType)) {
                 try {
                     if (context.fileIngestIsCancelled()) {
                         return;
@@ -577,7 +618,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                 //Carved Files should be the only type of unallocated files capable of a txt extension and 
                 //should be ignored by the TextFileExtractor because they may contain more than one text encoding
                 try {
-                    if (Ingester.getDefault().indexText(txtFileExtractor, aFile, context)) {
+                    TextFileExtractor textFileExtractor = new TextFileExtractor();
+                    if (Ingester.getDefault().indexText(textFileExtractor, aFile, context)) {
                         putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                         wasTextAdded = true;
                     }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchJobSettingsPanel.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchJobSettingsPanel.java
index c80e66d947..10ca43b68b 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchJobSettingsPanel.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchJobSettingsPanel.java
@@ -18,6 +18,7 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
+import org.sleuthkit.autopsy.textextractors.StringsTextExtractor;
 import java.beans.PropertyChangeEvent;
 import java.beans.PropertyChangeListener;
 import java.util.ArrayList;
@@ -102,8 +103,8 @@ public final class KeywordSearchJobSettingsPanel extends IngestModuleIngestJobSe
     }
 
     private void displayEncodings() {
-        String utf8 = KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
-        String utf16 = KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
+        String utf8 = KeywordSearchSettings.getStringExtractOption("EXTRACT_UTF8");
+        String utf16 = KeywordSearchSettings.getStringExtractOption("EXTRACT_UTF16");
         ArrayList<String> encodingsList = new ArrayList<>();
         if (utf8 == null || Boolean.parseBoolean(utf8)) {
             encodingsList.add("UTF8");
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchSettings.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchSettings.java
index 4350ce3085..d3a61cb876 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchSettings.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchSettings.java
@@ -234,14 +234,14 @@ class KeywordSearchSettings {
             KeywordSearchSettings.setUpdateFrequency(UpdateFrequency.DEFAULT);
         }
         //setting default Extract UTF8
-        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
+        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, "EXTRACT_UTF8")) {
             logger.log(Level.INFO, "No configuration for UTF8 found, generating default..."); //NON-NLS
-            KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
+            KeywordSearchSettings.setStringExtractOption("EXTRACT_UTF8", Boolean.TRUE.toString());
         }
         //setting default Extract UTF16
-        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
+        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, "EXTRACT_UTF16")) {
             logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS
-            KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
+            KeywordSearchSettings.setStringExtractOption("EXTRACT_UTF16", Boolean.TRUE.toString());
         }
         //setting OCR default (disabled by default)
         if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, OCR_ENABLED)) {
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
index 1313b9a3ef..8cf37859cd 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
@@ -45,6 +45,8 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
 import org.sleuthkit.autopsy.progress.ProgressIndicator;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
+import org.sleuthkit.autopsy.textextractors.StringsTextExtractor;
+import org.sleuthkit.autopsy.textextractors.TikaTextExtractor;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskCoreException;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
index b7f3a885b5..2156b35b7e 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
@@ -25,6 +25,8 @@ import java.util.logging.Level;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
 import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
+import org.sleuthkit.autopsy.textextractors.ExtractionContext;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
@@ -42,12 +44,12 @@ final class TextFileExtractor extends ContentTextExtractor {
     static final private Logger logger = Logger.getLogger(TextFileExtractor.class.getName());
 
     @Override
-    boolean isContentTypeSpecific() {
+    public boolean isContentTypeSpecific() {
         return true;
     }
 
     @Override
-    boolean isSupported(Content file, String detectedFormat) {
+    public boolean isSupported(Content file, String detectedFormat) {
         return true;
     }
 
@@ -79,4 +81,8 @@ final class TextFileExtractor extends ContentTextExtractor {
         logger.log(Level.WARNING, msg, ex);
     }
 
+    @Override
+    public void parseContext(ExtractionContext context) {
+        // Intentionally empty: this extractor reads nothing from the extraction context.
+    }
 }

From fec3e0360b908b2c66944cc2b628cbf3142c7a69 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Mon, 3 Dec 2018 13:04:45 -0500
Subject: [PATCH 02/18] Fixed a number of bugs with the refactor

---
 Core/nbproject/project.properties             |  4 ++-
 Core/nbproject/project.xml                    |  8 ++++++
 .../textextractors/StringsTextExtractor.java  |  1 +
 .../textextractors/TextExtractorFactory.java  | 27 +++++++++++++------
 .../textextractors/TikaTextExtractor.java     |  5 ++++
 .../KeywordSearchIngestModule.java            | 22 ++++++---------
 6 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/Core/nbproject/project.properties b/Core/nbproject/project.properties
index 8dcea69608..0833626f85 100644
--- a/Core/nbproject/project.properties
+++ b/Core/nbproject/project.properties
@@ -1,4 +1,6 @@
 file.reference.activemq-all-5.11.1.jar=release/modules/ext/activemq-all-5.11.1.jar
+file.reference.apache-mime4j-core-0.8.1.jar=release/modules/ext/apache-mime4j-core-0.8.1.jar
+file.reference.apache-mime4j-dom-0.8.1.jar=release/modules/ext/apache-mime4j-dom-0.8.1.jar
 file.reference.c3p0-0.9.5.jar=release/modules/ext/c3p0-0.9.5.jar
 file.reference.commons-compress-1.14.jar=release/modules/ext/commons-compress-1.14.jar
 file.reference.commons-dbcp2-2.1.1.jar=release\\modules\\ext\\commons-dbcp2-2.1.1.jar
@@ -6,7 +8,7 @@ file.reference.commons-pool2-2.4.2.jar=release\\modules\\ext\\commons-pool2-2.4.
 file.reference.dd-plist-1.20.jar=release/modules/ext/dd-plist-1.20.jar
 file.reference.jdom-2.0.5-contrib.jar=release/modules/ext/jdom-2.0.5-contrib.jar
 file.reference.jdom-2.0.5.jar=release/modules/ext/jdom-2.0.5.jar
-file.reference.jericho-html-3.3.jar=C:\\cygwin64\\home\\dsmyda\\autopsy\\Core\\release\\modules\\ext\\jericho-html-3.3.jar
+file.reference.jericho-html-3.3.jar=release\\modules\\ext\\jericho-html-3.3.jar
 file.reference.jgraphx-v3.8.0.jar=release/modules/ext/jgraphx-v3.8.0.jar
 file.reference.jsoup-1.10.3.jar=release/modules/ext/jsoup-1.10.3.jar
 file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
diff --git a/Core/nbproject/project.xml b/Core/nbproject/project.xml
index 37af0be24b..2cc68bf73b 100644
--- a/Core/nbproject/project.xml
+++ b/Core/nbproject/project.xml
@@ -347,6 +347,10 @@
                 <runtime-relative-path>ext/zookeeper-3.4.6.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/zookeeper-3.4.6.jar</binary-origin>
             </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/apache-mime4j-dom-0.8.1.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/apache-mime4j-dom-0.8.1.jar</binary-origin>
+            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/jdom-2.0.5.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/jdom-2.0.5.jar</binary-origin>
@@ -431,6 +435,10 @@
                 <runtime-relative-path>ext/metadata-extractor-2.10.1.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/metadata-extractor-2.10.1.jar</binary-origin>
             </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/apache-mime4j-core-0.8.1.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/apache-mime4j-core-0.8.1.jar</binary-origin>
+            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/tika-core-1.17.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/tika-core-1.17.jar</binary-origin>
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
index 869ab8a992..a3647197c6 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
@@ -57,6 +57,7 @@ public class StringsTextExtractor extends ContentTextExtractor {
         if(configInstance != null) {
             extractUTF8 = configInstance.getExtractUTF8();
             extractUTF16 = configInstance.getExtractUTF16();
+            setScripts(configInstance.getExtractScripts());
         }
     }
 
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index 453c6900ee..f61732ad97 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -16,8 +16,7 @@ public class TextExtractorFactory {
     private static final ImmutableList<ContentTextExtractor> extractors = 
                 ImmutableList.of(new HtmlTextExtractor(), 
                                  new SqliteTextExtractor(),
-                                 new TikaTextExtractor(),
-                                 new StringsTextExtractor());
+                                 new TikaTextExtractor());
         
     /**
      * Auto detects the corrent text extractor given the file and mimetype. Context 
@@ -26,23 +25,35 @@ public class TextExtractorFactory {
      * @param mimeType
      * @param context
      * @return 
+     * @throws org.sleuthkit.autopsy.textextractors.TextExtractorFactory.NoSpecializedExtractorException 
      */
-    public static ContentTextExtractor getExtractor(AbstractFile file, String mimeType, ExtractionContext context) {
-        ContentTextExtractor extractorInstance = null;
-
+    public static ContentTextExtractor getSpecializedExtractor(AbstractFile file, 
+            String mimeType, ExtractionContext context) throws NoSpecializedExtractorException { // returns the first supporting extractor, otherwise throws
         for(ContentTextExtractor candidate : extractors) {
             candidate.parseContext(context);
             if(candidate.isSupported(file, mimeType)) {
                 try {
-                    extractorInstance = candidate.getClass().newInstance();
+                    ContentTextExtractor extractorInstance = candidate.getClass().newInstance(); // fresh instance of the matching candidate's class
                     extractorInstance.parseContext(context);
-                    break;
+                    return extractorInstance; // first match in list order wins
                 } catch (InstantiationException | IllegalAccessException ex) {
                     
                 }
             }
         }
         
-        return extractorInstance;
+        throw new NoSpecializedExtractorException("Could not find a suitable extractor for mimetype ["+mimeType+"]"); // no candidate matched, or instantiation failed (the empty catch above swallows that)
+    }
+    
+    public static StringsTextExtractor getDefaultExtractor(ExtractionContext context) { // default extractor: always a new StringsTextExtractor, no file or MIME-type check
+        StringsTextExtractor instance = new StringsTextExtractor();
+        instance.parseContext(context); // hand the caller's context to the new instance
+        return instance;
+    }
+    
+    public static class NoSpecializedExtractorException extends Exception { // checked exception thrown by getSpecializedExtractor
+        public NoSpecializedExtractorException(String msg) { // msg is passed straight to Exception(String)
+            super(msg);
+        }
     }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index ddfed5a776..6ec09d2333 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -32,6 +32,7 @@ import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.logging.Level;
+import java.util.logging.Logger;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.tika.Tika;
@@ -46,6 +47,7 @@ import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.openide.util.NbBundle;
 import org.openide.modules.InstalledFileLocator;
 import org.sleuthkit.autopsy.coreutils.PlatformUtil;
+import org.sleuthkit.autopsy.ingest.IngestServices;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
@@ -57,6 +59,8 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 public class TikaTextExtractor extends ContentTextExtractor {
     
     private boolean OCREnabled;
+    private final IngestServices services = IngestServices.getInstance();
+    private final Logger logger = services.getLogger(TikaTextExtractor.class.getName());
 
     private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
@@ -141,6 +145,7 @@ public class TikaTextExtractor extends ContentTextExtractor {
         } catch (TextExtractorException ex) {
             throw ex;
         } catch (Exception ex) {
+            logger.log(Level.SEVERE, "",ex);
             tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
             logWarning(msg, ex);
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index ff44def95b..451e5300b2 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -25,6 +25,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.logging.Level;
+import org.openide.util.Exceptions;
 import org.openide.util.NbBundle;
 import org.openide.util.NbBundle.Messages;
 import org.sleuthkit.autopsy.casemodule.Case;
@@ -293,9 +294,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         
         extractionContext.set(StringsExtractionConfig.class, stringsConfig);
         
-        stringExtractor = new StringsTextExtractor();
-        stringExtractor.parseContext(extractionContext);
-
+        stringExtractor = TextExtractorFactory.getDefaultExtractor(extractionContext);
         indexer = new Indexer();
         initialized = true;
     }
@@ -482,19 +481,14 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
             extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
             
-            HTMLExtractionConfig htmlConfig = new HTMLExtractionConfig();
-            htmlConfig.setContentSizeLimit(50000000); //50 MB
-            extractionContext.set(HTMLExtractionConfig.class, htmlConfig);
-            
-            extractor = TextExtractorFactory.getExtractor(aFile, detectedFormat, extractionContext);
-            if (extractor.getClass().getName().equals("StringsTextExtractor")) {
-                // No text extractor found.
+            try {
+                extractor = TextExtractorFactory.getSpecializedExtractor(aFile, detectedFormat, extractionContext);
+                //divide into chunks and index
+                return Ingester.getDefault().indexText(extractor, aFile, context);
+            } catch (TextExtractorFactory.NoSpecializedExtractorException ex) {
+                //No text extractor found... run the default instead
                 return false;
             }
-
-            //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
-            //divide into chunks and index
-            return Ingester.getDefault().indexText(extractor, aFile, context);
         }
 
         /**

From 1ce768a8617cbab3eb30aa0dbc040714b51a42f9 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Mon, 3 Dec 2018 14:18:10 -0500
Subject: [PATCH 03/18] Migrated all tika dependencies from keywordsearch NBM
 to core

---
 Core/nbproject/project.properties |  52 +++++-
 Core/nbproject/project.xml        | 300 ++++++++++++++++++++++--------
 2 files changed, 261 insertions(+), 91 deletions(-)

diff --git a/Core/nbproject/project.properties b/Core/nbproject/project.properties
index 0833626f85..3cccdceb8e 100644
--- a/Core/nbproject/project.properties
+++ b/Core/nbproject/project.properties
@@ -1,28 +1,59 @@
 file.reference.activemq-all-5.11.1.jar=release/modules/ext/activemq-all-5.11.1.jar
-file.reference.apache-mime4j-core-0.8.1.jar=C:\\cygwin64\\home\\dsmyda\\autopsy\\Core\\release\\modules\\ext\\apache-mime4j-core-0.8.1.jar
-file.reference.apache-mime4j-dom-0.8.1.jar=C:\\cygwin64\\home\\dsmyda\\autopsy\\Core\\release\\modules\\ext\\apache-mime4j-dom-0.8.1.jar
+file.reference.apache-mime4j-core-0.8.1.jar=release/modules/ext/apache-mime4j-core-0.8.1.jar
+file.reference.apache-mime4j-dom-0.8.1.jar=release/modules/ext/apache-mime4j-dom-0.8.1.jar
+file.reference.asm-5.0.4.jar=release/modules/ext/asm-5.0.4.jar
+file.reference.bcmail-jdk15on-1.54.jar=release/modules/ext/bcmail-jdk15on-1.54.jar
+file.reference.bcprov-jdk15on-1.54.jar=release/modules/ext/bcprov-jdk15on-1.54.jar
+file.reference.boilerpipe-1.1.0.jar=release/modules/ext/boilerpipe-1.1.0.jar
 file.reference.c3p0-0.9.5.jar=release/modules/ext/c3p0-0.9.5.jar
+file.reference.cdm-4.5.5.jar=release/modules/ext/cdm-4.5.5.jar
+file.reference.commons-codec-1.6.jar=release/modules/ext/commons-codec-1.6.jar
 file.reference.commons-compress-1.14.jar=release/modules/ext/commons-compress-1.14.jar
-file.reference.commons-dbcp2-2.1.1.jar=release\\modules\\ext\\commons-dbcp2-2.1.1.jar
-file.reference.commons-pool2-2.4.2.jar=release\\modules\\ext\\commons-pool2-2.4.2.jar
+file.reference.commons-dbcp2-2.1.1.jar=release/modules/ext/commons-dbcp2-2.1.1.jar
+file.reference.commons-io-2.5.jar=release/modules/ext/commons-io-2.5.jar
+file.reference.commons-pool2-2.4.2.jar=release/modules/ext/commons-pool2-2.4.2.jar
 file.reference.dd-plist-1.20.jar=release/modules/ext/dd-plist-1.20.jar
+file.reference.geoapi-3.0.0.jar=release/modules/ext/geoapi-3.0.0.jar
+file.reference.grib-4.5.5.jar=release/modules/ext/grib-4.5.5.jar
+file.reference.gson-2.8.1.jar=release/modules/ext/gson-2.8.1.jar
+file.reference.httpservices-4.5.5.jar=release/modules/ext/httpservices-4.5.5.jar
+file.reference.isoparser-1.1.18.jar=release/modules/ext/isoparser-1.1.18.jar
+file.reference.jackcess-2.2.0.jar=release/modules/ext/jackcess-2.2.0.jar
+file.reference.jackcess-encrypt-2.1.4.jar=release/modules/ext/jackcess-encrypt-2.1.4.jar
+file.reference.jackson-core-2.9.2.jar=release/modules/ext/jackson-core-2.9.2.jar
+file.reference.java-libpst-0.8.1.jar=release/modules/ext/java-libpst-0.8.1.jar
+file.reference.jcl-over-slf4j-1.7.24.jar=release/modules/ext/jcl-over-slf4j-1.7.24.jar
 file.reference.jdom-2.0.5-contrib.jar=release/modules/ext/jdom-2.0.5-contrib.jar
 file.reference.jdom-2.0.5.jar=release/modules/ext/jdom-2.0.5.jar
-file.reference.jericho-html-3.3.jar=release\\modules\\ext\\jericho-html-3.3.jar
+file.reference.jericho-html-3.3.jar=release/modules/ext/jericho-html-3.3.jar
 file.reference.jgraphx-v3.8.0.jar=release/modules/ext/jgraphx-v3.8.0.jar
+file.reference.jhighlight-1.0.2.jar=release/modules/ext/jhighlight-1.0.2.jar
+file.reference.jmatio-1.2.jar=release/modules/ext/jmatio-1.2.jar
+file.reference.json-1.8.jar=release/modules/ext/json-1.8.jar
+file.reference.json-simple-1.1.1.jar=release/modules/ext/json-simple-1.1.1.jar
 file.reference.jsoup-1.10.3.jar=release/modules/ext/jsoup-1.10.3.jar
+file.reference.jul-to-slf4j-1.7.24.jar=release/modules/ext/jul-to-slf4j-1.7.24.jar
+file.reference.juniversalchardet-1.0.3.jar=release/modules/ext/juniversalchardet-1.0.3.jar
+file.reference.junrar-0.7.jar=release/modules/ext/junrar-0.7.jar
 file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
 file.reference.mchange-commons-java-0.2.9.jar=release/modules/ext/mchange-commons-java-0.2.9.jar
 file.reference.metadata-extractor-2.10.1.jar=release/modules/ext/metadata-extractor-2.10.1.jar
+file.reference.netcdf4-4.5.5.jar=release/modules/ext/netcdf4-4.5.5.jar
+file.reference.opennlp-tools-1.8.3.jar=release/modules/ext/opennlp-tools-1.8.3.jar
+file.reference.poi-3.17.jar=release/modules/ext/poi-3.17.jar
+file.reference.poi-ooxml-3.17.jar=release/modules/ext/poi-ooxml-3.17.jar
+file.reference.poi-scratchpad-3.17.jar=release/modules/ext/poi-scratchpad-3.17.jar
 file.reference.postgresql-9.4.1211.jre7.jar=release/modules/ext/postgresql-9.4.1211.jre7.jar
 file.reference.Rejistry-1.0-SNAPSHOT.jar=release/modules/ext/Rejistry-1.0-SNAPSHOT.jar
+file.reference.rome-1.5.1.jar=release/modules/ext/rome-1.5.1.jar
 file.reference.sevenzipjbinding-AllPlatforms.jar=release/modules/ext/sevenzipjbinding-AllPlatforms.jar
 file.reference.sevenzipjbinding.jar=release/modules/ext/sevenzipjbinding.jar
-file.reference.sqlite-jdbc-3.8.11.jar=release\\modules\\ext\\sqlite-jdbc-3.8.11.jar
+file.reference.sis-metadata-0.6.jar=release/modules/ext/sis-metadata-0.6.jar
+file.reference.sis-netcdf-0.6.jar=release/modules/ext/sis-netcdf-0.6.jar
+file.reference.sis-utility-0.6.jar=release/modules/ext/sis-utility-0.6.jar
+file.reference.slf4j-api-1.7.24.jar=release/modules/ext/slf4j-api-1.7.24.jar
+file.reference.sqlite-jdbc-3.8.11.jar=release/modules/ext/sqlite-jdbc-3.8.11.jar
 file.reference.StixLib.jar=release/modules/ext/StixLib.jar
-file.reference.bcprov-jdk15on-1.54.jar=release/modules/ext/bcprov-jdk15on-1.54.jar
-file.reference.jackcess-2.2.0.jar=release/modules/ext/jackcess-2.2.0.jar
-file.reference.jackcess-encrypt-2.1.4.jar=release/modules/ext/jackcess-encrypt-2.1.4.jar
 file.reference.jempbox-1.8.13.jar=release/modules/ext/jempbox-1.8.13.jar
 file.reference.javax.ws.rs-api-2.0.1.jar=release/modules/ext/javax.ws.rs-api-2.0.1.jar
 file.reference.cxf-core-3.0.16.jar=release/modules/ext/cxf-core-3.0.16.jar
@@ -33,11 +64,14 @@ file.reference.fontbox-2.0.8.jar=release/modules/ext/fontbox-2.0.8.jar
 file.reference.pdfbox-2.0.8.jar=release/modules/ext/pdfbox-2.0.8.jar
 file.reference.pdfbox-tools-2.0.8.jar=release/modules/ext/pdfbox-tools-2.0.8.jar
 file.reference.sleuthkit-postgresql-4.6.4.jar=release/modules/ext/sleuthkit-postgresql-4.6.4.jar
+file.reference.tagsoup-1.2.1.jar=release/modules/ext/tagsoup-1.2.1.jar
 file.reference.tika-core-1.17.jar=release/modules/ext/tika-core-1.17.jar
 file.reference.tika-parsers-1.17.jar=release/modules/ext/tika-parsers-1.17.jar
 file.reference.curator-client-2.8.0.jar=release/modules/ext/curator-client-2.8.0.jar
 file.reference.curator-framework-2.8.0.jar=release/modules/ext/curator-framework-2.8.0.jar
 file.reference.curator-recipes-2.8.0.jar=release/modules/ext/curator-recipes-2.8.0.jar
+file.reference.vorbis-java-core-0.8.jar=release/modules/ext/vorbis-java-core-0.8.jar
+file.reference.vorbis-java-tika-0.8.jar=release/modules/ext/vorbis-java-tika-0.8.jar
 file.reference.xmpcore-5.1.3.jar=release/modules/ext/xmpcore-5.1.3.jar
 file.reference.xz-1.6.jar=release/modules/ext/xz-1.6.jar
 file.reference.zookeeper-3.4.6.jar=release/modules/ext/zookeeper-3.4.6.jar
diff --git a/Core/nbproject/project.xml b/Core/nbproject/project.xml
index 2cc68bf73b..bc9de174b5 100644
--- a/Core/nbproject/project.xml
+++ b/Core/nbproject/project.xml
@@ -343,90 +343,54 @@
                 <package>org.sleuthkit.autopsy.texttranslation</package>
                 <package>org.sleuthkit.datamodel</package>
             </public-packages>
-            <class-path-extension>
-                <runtime-relative-path>ext/zookeeper-3.4.6.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/zookeeper-3.4.6.jar</binary-origin>
-            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/apache-mime4j-dom-0.8.1.jar</runtime-relative-path>
-                <binary-origin>C:\cygwin64\home\dsmyda\autopsy\Core\release\modules\ext\apache-mime4j-dom-0.8.1.jar</binary-origin>
-            </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/jdom-2.0.5.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/jdom-2.0.5.jar</binary-origin>
-            </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/cxf-rt-transports-http-3.0.16.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/cxf-rt-transports-http-3.0.16.jar</binary-origin>
+                <binary-origin>release/modules/ext/apache-mime4j-dom-0.8.1.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/jackcess-2.2.0.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/jackcess-2.2.0.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
-                <runtime-relative-path>ext/sleuthkit-postgresql-4.6.4.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/sleuthkit-postgresql-4.6.4.jar</binary-origin>
+                <runtime-relative-path>ext/jericho-html-3.3.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jericho-html-3.3.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
-                <runtime-relative-path>ext/jericho-html-3.3.jar</runtime-relative-path>
-                <binary-origin>release\modules\ext\jericho-html-3.3.jar</binary-origin>
+                <runtime-relative-path>ext/cdm-4.5.5.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/cdm-4.5.5.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/httpservices-4.5.5.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/httpservices-4.5.5.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/commons-validator-1.6.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/commons-validator-1.6.jar</binary-origin>
             </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/curator-framework-2.8.0.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/curator-framework-2.8.0.jar</binary-origin>
-            </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/bcprov-jdk15on-1.54.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/bcprov-jdk15on-1.54.jar</binary-origin>
-            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/commons-compress-1.14.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/commons-compress-1.14.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
-                <runtime-relative-path>ext/fontbox-2.0.8.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/fontbox-2.0.8.jar</binary-origin>
+                <runtime-relative-path>ext/geoapi-3.0.0.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/geoapi-3.0.0.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
-                <runtime-relative-path>ext/commons-dbcp2-2.1.1.jar</runtime-relative-path>
-                <binary-origin>release\modules\ext\commons-dbcp2-2.1.1.jar</binary-origin>
-            </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/jgraphx-v3.8.0.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/jgraphx-v3.8.0.jar</binary-origin>
-            </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/jython-standalone-2.7.0.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/jython-standalone-2.7.0.jar</binary-origin>
+                <runtime-relative-path>ext/boilerpipe-1.1.0.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/boilerpipe-1.1.0.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/sevenzipjbinding.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/sevenzipjbinding.jar</binary-origin>
             </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/bcmail-jdk15on-1.54.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/bcmail-jdk15on-1.54.jar</binary-origin>
+            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/mchange-commons-java-0.2.9.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/mchange-commons-java-0.2.9.jar</binary-origin>
             </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/jackcess-encrypt-2.1.4.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/jackcess-encrypt-2.1.4.jar</binary-origin>
-            </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/cxf-core-3.0.16.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/cxf-core-3.0.16.jar</binary-origin>
-            </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/javax.ws.rs-api-2.0.1.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/javax.ws.rs-api-2.0.1.jar</binary-origin>
-            </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/postgresql-9.4.1211.jre7.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/postgresql-9.4.1211.jre7.jar</binary-origin>
-            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/curator-recipes-2.8.0.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/curator-recipes-2.8.0.jar</binary-origin>
@@ -437,7 +401,11 @@
             </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/apache-mime4j-core-0.8.1.jar</runtime-relative-path>
-                <binary-origin>C:\cygwin64\home\dsmyda\autopsy\Core\release\modules\ext\apache-mime4j-core-0.8.1.jar</binary-origin>
+                <binary-origin>release/modules/ext/apache-mime4j-core-0.8.1.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/tagsoup-1.2.1.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/tagsoup-1.2.1.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/tika-core-1.17.jar</runtime-relative-path>
@@ -447,41 +415,37 @@
                 <runtime-relative-path>ext/StixLib.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/StixLib.jar</binary-origin>
             </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/curator-client-2.8.0.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/curator-client-2.8.0.jar</binary-origin>
-            </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/cxf-rt-frontend-jaxrs-3.0.16.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/cxf-rt-frontend-jaxrs-3.0.16.jar</binary-origin>
-            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/pdfbox-tools-2.0.8.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/pdfbox-tools-2.0.8.jar</binary-origin>
             </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/asm-5.0.4.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/asm-5.0.4.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jcl-over-slf4j-1.7.24.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jcl-over-slf4j-1.7.24.jar</binary-origin>
+            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/tika-parsers-1.17.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/tika-parsers-1.17.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/sqlite-jdbc-3.8.11.jar</runtime-relative-path>
-                <binary-origin>release\modules\ext\sqlite-jdbc-3.8.11.jar</binary-origin>
+                <binary-origin>release/modules/ext/sqlite-jdbc-3.8.11.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
-                <runtime-relative-path>ext/activemq-all-5.11.1.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/activemq-all-5.11.1.jar</binary-origin>
+                <runtime-relative-path>ext/json-simple-1.1.1.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/json-simple-1.1.1.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
-                <runtime-relative-path>ext/xz-1.6.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/xz-1.6.jar</binary-origin>
+                <runtime-relative-path>ext/sis-utility-0.6.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/sis-utility-0.6.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
-                <runtime-relative-path>ext/Rejistry-1.0-SNAPSHOT.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/Rejistry-1.0-SNAPSHOT.jar</binary-origin>
-            </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/dd-plist-1.20.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/dd-plist-1.20.jar</binary-origin>
+                <runtime-relative-path>ext/jhighlight-1.0.2.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jhighlight-1.0.2.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/jempbox-1.8.13.jar</runtime-relative-path>
@@ -491,17 +455,9 @@
                 <runtime-relative-path>ext/cxf-rt-rs-client-3.0.16.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/cxf-rt-rs-client-3.0.16.jar</binary-origin>
             </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/sevenzipjbinding-AllPlatforms.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/sevenzipjbinding-AllPlatforms.jar</binary-origin>
-            </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/commons-pool2-2.4.2.jar</runtime-relative-path>
-                <binary-origin>release\modules\ext\commons-pool2-2.4.2.jar</binary-origin>
-            </class-path-extension>
-            <class-path-extension>
-                <runtime-relative-path>ext/jsoup-1.10.3.jar</runtime-relative-path>
-                <binary-origin>release/modules/ext/jsoup-1.10.3.jar</binary-origin>
+                <binary-origin>release/modules/ext/commons-pool2-2.4.2.jar</binary-origin>
             </class-path-extension>
             <class-path-extension>
                 <runtime-relative-path>ext/jdom-2.0.5-contrib.jar</runtime-relative-path>
@@ -523,6 +479,186 @@
                 <runtime-relative-path>ext/xmpcore-5.1.3.jar</runtime-relative-path>
                 <binary-origin>release/modules/ext/xmpcore-5.1.3.jar</binary-origin>
             </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/zookeeper-3.4.6.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/zookeeper-3.4.6.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jdom-2.0.5.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jdom-2.0.5.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/cxf-rt-transports-http-3.0.16.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/cxf-rt-transports-http-3.0.16.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/sis-metadata-0.6.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/sis-metadata-0.6.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/isoparser-1.1.18.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/isoparser-1.1.18.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/sleuthkit-postgresql-4.6.4.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/sleuthkit-postgresql-4.6.4.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/vorbis-java-core-0.8.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/vorbis-java-core-0.8.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/commons-codec-1.6.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/commons-codec-1.6.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/netcdf4-4.5.5.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/netcdf4-4.5.5.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/slf4j-api-1.7.24.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/slf4j-api-1.7.24.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/java-libpst-0.8.1.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/java-libpst-0.8.1.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jul-to-slf4j-1.7.24.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jul-to-slf4j-1.7.24.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/gson-2.8.1.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/gson-2.8.1.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/poi-3.17.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/poi-3.17.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/poi-scratchpad-3.17.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/poi-scratchpad-3.17.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/sis-netcdf-0.6.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/sis-netcdf-0.6.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/commons-io-2.5.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/commons-io-2.5.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/curator-framework-2.8.0.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/curator-framework-2.8.0.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/bcprov-jdk15on-1.54.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/bcprov-jdk15on-1.54.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/fontbox-2.0.8.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/fontbox-2.0.8.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/commons-dbcp2-2.1.1.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/commons-dbcp2-2.1.1.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jgraphx-v3.8.0.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jgraphx-v3.8.0.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/juniversalchardet-1.0.3.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/juniversalchardet-1.0.3.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jython-standalone-2.7.0.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jython-standalone-2.7.0.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jackcess-encrypt-2.1.4.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jackcess-encrypt-2.1.4.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/cxf-core-3.0.16.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/cxf-core-3.0.16.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/javax.ws.rs-api-2.0.1.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/javax.ws.rs-api-2.0.1.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/opennlp-tools-1.8.3.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/opennlp-tools-1.8.3.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/junrar-0.7.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/junrar-0.7.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/postgresql-9.4.1211.jre7.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/postgresql-9.4.1211.jre7.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/poi-ooxml-3.17.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/poi-ooxml-3.17.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/curator-client-2.8.0.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/curator-client-2.8.0.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/cxf-rt-frontend-jaxrs-3.0.16.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/cxf-rt-frontend-jaxrs-3.0.16.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/grib-4.5.5.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/grib-4.5.5.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jackson-core-2.9.2.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jackson-core-2.9.2.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/activemq-all-5.11.1.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/activemq-all-5.11.1.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/xz-1.6.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/xz-1.6.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/Rejistry-1.0-SNAPSHOT.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/Rejistry-1.0-SNAPSHOT.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/dd-plist-1.20.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/dd-plist-1.20.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/rome-1.5.1.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/rome-1.5.1.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/sevenzipjbinding-AllPlatforms.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/sevenzipjbinding-AllPlatforms.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jmatio-1.2.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jmatio-1.2.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jsoup-1.10.3.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jsoup-1.10.3.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/vorbis-java-tika-0.8.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/vorbis-java-tika-0.8.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/json-1.8.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/json-1.8.jar</binary-origin>
+            </class-path-extension>
         </data>
     </configuration>
 </project>

From 982ea74a96f96592d5f30addaee896493d8a0a0d Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Tue, 4 Dec 2018 10:59:10 -0500
Subject: [PATCH 04/18] Cleaned up some unnecessary config files and added a
 heap of comments

---
 .../textextractors/ContentTextExtractor.java  |  44 ++++++-
 .../textextractors/ExtractionContext.java     |  60 +++++++--
 .../textextractors/HtmlTextExtractor.java     |  66 ++++++++--
 .../textextractors/SqliteTextExtractor.java   | 116 ++++++++++-------
 .../textextractors/StringsTextExtractor.java  |  86 +++++++++----
 .../autopsy/textextractors/TextExtractor.java |  33 ++---
 .../textextractors/TextExtractorFactory.java  | 119 +++++++++++++-----
 .../textextractors/TikaTextExtractor.java     |  76 ++++++++---
 .../HTMLExtractionConfig.java                 |  22 ----
 .../ImageFileExtractionConfig.java            |  35 +++++-
 .../StringsExtractionConfig.java              |  56 ++++++++-
 .../TextFileExtractionConfig.java             |  22 ----
 .../texttranslation/TranslationException.java |   2 +-
 .../KeywordSearchIngestModule.java            |   1 -
 .../keywordsearch/TextFileExtractor.java      |   6 -
 15 files changed, 525 insertions(+), 219 deletions(-)
 delete mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/HTMLExtractionConfig.java
 delete mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/TextFileExtractionConfig.java

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
index b97c1e3ab3..54f55d6a86 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
@@ -29,6 +29,8 @@ import org.sleuthkit.datamodel.Content;
  */
 public abstract class ContentTextExtractor implements TextExtractor<Content> {
     
+    //Mimetype groups to assist extractor implementations in ignoring binary and
+    //archive files.
     static final List<String> BINARY_MIME_TYPES
             = Arrays.asList(
                     //ignore binary blob data, for which string extraction will be used
@@ -72,6 +74,24 @@ public abstract class ContentTextExtractor implements TextExtractor<Content> {
                     "application/x-z", //NON-NLS
                     "application/x-compress"); //NON-NLS
 
+    /**
+     * Accepts a context instance to allow for extractor configuration on specific
+     * file types. 
+     * 
+     * See extractionconfigs package for available file type configurations.
+     * 
+     * @param context Instance that contains config classes 
+     */
+    public ContentTextExtractor(ExtractionContext context) {
+        
+    }
+    
+    /**
+     * Default constructor to create extractor instances with default configurations.
+     */
+    public ContentTextExtractor() {
+    }
+    
     /**
      * Determines if the extractor works only for specified types is
      * supportedTypes() or whether is a generic content extractor (such as
@@ -85,25 +105,41 @@ public abstract class ContentTextExtractor implements TextExtractor<Content> {
      * Determines if the file content is supported by the extractor if
      * isContentTypeSpecific() returns true.
      *
-     * @param content           to test if its content should be supported
+     * @param file           to test if its content should be supported
      * @param detectedFormat mime-type with detected format (such as text/plain)
      *                       or null if not detected
      *
      * @return true if the file content is supported, false otherwise
      */
     public abstract boolean isSupported(Content file, String detectedFormat);
-    
-    public abstract void parseContext(ExtractionContext context);
 
+    /**
+     * Returns a reader that will iterate over the text of the source content.
+     * 
+     * @param source Content source to read
+     * @return A reader that contains all source text.
+     * @throws TextExtractorException Error encountered during extraction
+     */
     @Override
     public abstract Reader getReader(Content source) throws TextExtractorException;
 
+    /**
+     * Get the object id of the content source.
+     * 
+     * @param source source content
+     * @return object id associated with this source content
+     */
     @Override
     public long getID(Content source) {
         return source.getId();
     }
 
-
+    /**
+     * Returns the human-readable name of the given content source.
+     * 
+     * @param source source content
+     * @return name of source content
+     */
     @Override
     public String getName(Content source) {
         return source.getName();
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java b/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java
index d291164a2e..03c33c1ad9 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java
@@ -1,7 +1,20 @@
 /*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2018-2018 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 package org.sleuthkit.autopsy.textextractors;
 
@@ -9,20 +22,51 @@ import com.google.common.collect.ClassToInstanceMap;
 import com.google.common.collect.MutableClassToInstanceMap;
 
 /**
+ * Stores extraction config instances for media types.
  *
- * @author dsmyda
+ * TextExtractors parse this class when configuring their own extraction
+ * settings.
  */
 public class ExtractionContext {
+
     ClassToInstanceMap<Object> extractionConfigs;
-    
+
     public ExtractionContext() {
         extractionConfigs = MutableClassToInstanceMap.create();
     }
-        
+
+    /**
+     * Internally stores a class-instance pair.
+     *
+     * @param <T>            Class type that will be stored.
+     * @param configClass    The class object of the instance
+     * @param configInstance Config instance of type T
+     */
     public <T> void set(Class<T> configClass, T configInstance) {
         extractionConfigs.put(configClass, configInstance);
-    }	
+    }
+
+    /**
+     * Retrieves the config instance associated with this key.
+     *
+     * @param <T>         Type of the stored instance
+     * @param configClass The class object of the instance
+     *
+     * @return The config instance of type T
+     */
     public <T> T get(Class<T> configClass) {
-        return  (T) extractionConfigs.get(configClass);
+        return configClass.cast(extractionConfigs.get(configClass));
+    }
+
+    /**
+     * Indicates if this class key has been stored.
+     *
+     * @param <T>         Type of the stored instance
+     * @param configClass The class object of the instance
+     *
+     * @return flag indicating the presence of this instance
+     */
+    public <T> boolean contains(Class<T> configClass) {
+        return get(configClass) != null;
     }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
index f834359514..6f9088bfc1 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
@@ -23,7 +23,6 @@ import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 import java.util.List;
-import java.util.Objects;
 import java.util.logging.Level;
 import net.htmlparser.jericho.Attributes;
 import net.htmlparser.jericho.Config;
@@ -33,17 +32,16 @@ import net.htmlparser.jericho.Source;
 import net.htmlparser.jericho.StartTag;
 import net.htmlparser.jericho.StartTagType;
 import org.sleuthkit.autopsy.coreutils.Logger;
-import org.sleuthkit.autopsy.textextractors.extractionconfigs.HTMLExtractionConfig;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
  * Extracts text from HTML content.
  */
-public class HtmlTextExtractor extends ContentTextExtractor {
+public final class HtmlTextExtractor extends ContentTextExtractor {
 
     static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
-    private int maxSize = 50_000_000;
+    private int maxSize;
 
     static final List<String> WEB_MIME_TYPES = Arrays.asList(
             "application/javascript", //NON-NLS
@@ -59,21 +57,66 @@ public class HtmlTextExtractor extends ContentTextExtractor {
         Config.LoggerProvider = LoggerProvider.DISABLED;
     }
 
+    /**
+     * Accepts an ExtractionContext instance for run-time configuration and
+     * delegates to the default constructor.
+     * 
+     * As of now, there are no configurable features of this extractor.
+     * 
+     * @param context Instance containing config classes
+     */
+    public HtmlTextExtractor(ExtractionContext context) {
+        this();
+    }
+    
+    /**
+     * Creates a default instance of HtmlTextExtractor. Supported file size
+     * is 50MB.
+     */
+    public HtmlTextExtractor() {
+        //Set default to be 50 MB.
+        maxSize = 50_000_000;
+    }
+
+    /**
+     * Determines if this extractor is responsible for extracting only a specific 
+     * type of media.
+     * 
+     * In this case, only HTML documents can be read successfully.
+     * 
+     * @return true 
+     */
     @Override
     public boolean isContentTypeSpecific() {
         return true;
     }
 
+    /**
+     * Determines if this content type is supported by this extractor.
+     * 
+     * @param content Content instance to be analyzed
+     * @param detectedFormat Mimetype of content instance
+     * @return flag indicating whether the content is supported
+     */
     @Override
     public boolean isSupported(Content content, String detectedFormat) {
         boolean notNull = detectedFormat != null;
         boolean supported = WEB_MIME_TYPES.contains(detectedFormat);
-        boolean size = content.getSize() <= maxSize;;
+        boolean size = content.getSize() <= maxSize;
         return notNull && supported && size;
     }
 
+    /**
+     * Returns a reader that will iterate over the text of an Html document.
+     * 
+     * @param content Html document source
+     * @return A reader instance containing the document source text
+     * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException 
+     */
     @Override
     public Reader getReader(Content content) throws TextExtractorException {
+        //TODO JIRA-4467, there is only harm in excluding HTML documents greater
+        //than 50MB due to our troubled approach of extraction.
         ReadContentInputStream stream = new ReadContentInputStream(content);
 
         //Parse the stream with Jericho and put the results in a Reader
@@ -171,6 +214,11 @@ public class HtmlTextExtractor extends ContentTextExtractor {
         }
     }
 
+    /**
+     * Indicates if this extractor is disabled.
+     *
+     * @return false; this extractor is never disabled.
+     */
     @Override
     public boolean isDisabled() {
         return false;
@@ -180,12 +228,4 @@ public class HtmlTextExtractor extends ContentTextExtractor {
     public void logWarning(final String msg, Exception ex) {
         logger.log(Level.WARNING, msg, ex); //NON-NLS  }
     }
-
-    @Override
-    public void parseContext(ExtractionContext context) {
-        HTMLExtractionConfig configInstance = context.get(HTMLExtractionConfig.class);
-        if(Objects.nonNull(configInstance)) {
-            this.maxSize = configInstance.getContentSizeLimit();
-        }
-    }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
index 68355756f0..a57654b4b9 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
@@ -1,20 +1,20 @@
-/*	
- * Autopsy Forensic Browser	
- *	
- * Copyright 2018-2018 Basis Technology Corp.	
- * Contact: carrier <at> sleuthkit <dot> org	
- *	
- * Licensed under the Apache License, Version 2.0 (the "License");	
- * you may not use this file except in compliance with the License.	
- * You may obtain a copy of the License at	
- *	
- *     http://www.apache.org/licenses/LICENSE-2.0	
- *	
- * Unless required by applicable law or agreed to in writing, software	
- * distributed under the License is distributed on an "AS IS" BASIS,	
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.	
- * See the License for the specific language governing permissions and	
- * limitations under the License.	
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2018-2018 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 package org.sleuthkit.autopsy.textextractors;
 
@@ -32,26 +32,62 @@ import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.AbstractFile;
 
 /**
- * Dedicated SqliteTextExtractor to solve the problems associated with Tika's
- * Sqlite parser.
+ * Extracts text from SQLite database files.
  *
- * Tika problems: 1) Tika fails to open virtual tables 2) Tika fails to open
- * tables with spaces in table name 3) Tika fails to include the table names in
- * output (except for the first table it parses)
+ * This is a dedicated solution to address the problems associated with 
+ * Tika's sqlite parser (version 1.17), which include the following:
+ *  1) Virtual tables cause the parser to bail
+ *  2) Tables that contain spaces in their name are not extracted
+ *  3) Table names are not included in its output text
  */
-public class SqliteTextExtractor extends ContentTextExtractor {
+public final class SqliteTextExtractor extends ContentTextExtractor {
 
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
     private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
 
+    /**
+     * Accepts a context instance for run-time configuration.
+     * 
+     * As of now, this constructor is a no-op as it does not support 
+     * any type of configuration.
+     * 
+     * @param context Instance that contains config classes
+     */
+    public SqliteTextExtractor(ExtractionContext context) {
+    }
+    
+    /**
+     * Creates a default SqliteTextExtractor instance.
+     */
+    public SqliteTextExtractor() {
+    }
+
+    /**
+     * This extractor only works for sqlite files, so it is indeed content type
+     * specific. 
+     * 
+     * @return true
+     */
     @Override
     public boolean isContentTypeSpecific() {
         return true;
     }
 
+    /**
+     * Determines if this extractor is disabled.
+     *
+     * @return true if the sqlite JDBC driver class cannot be loaded, false otherwise.
+     */
     @Override
     public boolean isDisabled() {
-        return false;
+        try {
+            Class.forName("org.sqlite.JDBC");
+            return false;
+        } catch (ClassNotFoundException ex) {
+            logger.log(Level.SEVERE, "Sqlite JDBC class could not be found, "
+                    + "SqliteTextExtractor is automatically disabling.", ex); //NON-NLS
+            return true;
+        }
     }
 
     @Override
@@ -73,14 +109,14 @@ public class SqliteTextExtractor extends ContentTextExtractor {
     }
 
     /**
-     * Returns a stream that will read from a sqlite database.
+     * Returns a reader that will iterate over the text of a sqlite database.
      *
      * @param source Content file
      *
      * @return An InputStream that reads from a Sqlite database.
      *
      * @throws
-     * org.sleuthkit.autopsy.keywordsearch.TextExtractor.TextExtractorException
+     * org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
      */
     @Override
     public Reader getReader(Content source) throws TextExtractorException {
@@ -89,28 +125,23 @@ public class SqliteTextExtractor extends ContentTextExtractor {
             try {
                 return CharSource.wrap("").openStream();
             } catch (IOException ex) {
-                throw new TextExtractorException("", ex);
+                throw new TextExtractorException("Could not open CharSource stream", ex);
             }
         }
 
         return new SQLiteStreamReader((AbstractFile) source);
     }
 
-    @Override
-    public void parseContext(ExtractionContext context) {
-        //No settings.
-    }
-
     /**
      * Produces a continuous stream of characters from a database file. To
      * achieve this, all table names are queues up and a SQLiteTableReader is
      * used to do the actual queries and table iteration.
      */
-    public class SQLiteStreamReader extends Reader {
+    private class SQLiteStreamReader extends Reader {
 
         private final SQLiteTableReader reader;
         private final AbstractFile file;
-        
+
         private Iterator<String> tableNames;
         private String currentTableName;
 
@@ -222,9 +253,10 @@ public class SqliteTextExtractor extends ContentTextExtractor {
         }
 
         /**
-         * Reads database values into the buffer. This function is responsible for 
-         * getting the next table in the queue, initiating calls to the SQLiteTableReader,
-         * and filling in any excess bytes that are lingering from the previous call.
+         * Reads database values into the buffer. This function is responsible
+         * for getting the next table in the queue, initiating calls to the
+         * SQLiteTableReader, and filling in any excess bytes that are lingering
+         * from the previous call.
          *
          * @throws IOException
          */
@@ -260,9 +292,9 @@ public class SqliteTextExtractor extends ContentTextExtractor {
                             reader.read(currentTableName, () -> bufIndex == len);
                         } catch (SQLiteTableReaderException ex) {
                             logger.log(Level.WARNING, String.format(
-                                "Error attempting to read file table: [%s]" //NON-NLS
-                                + " for file: [%s] (id=%d).", currentTableName, //NON-NLS
-                                file.getName(), file.getId()), ex.getMessage());
+                                    "Error attempting to read file table: [%s]" //NON-NLS
+                                    + " for file: [%s] (id=%d).", currentTableName, //NON-NLS
+                                    file.getName(), file.getId()), ex.getMessage());
                         }
                     } else {
                         if (bufIndex == off) {
@@ -295,8 +327,8 @@ public class SqliteTextExtractor extends ContentTextExtractor {
         }
 
         /**
-         * Wrapper that holds the excess bytes that were left over from the previous
-         * call to read().
+         * Wrapper that holds the excess bytes that were left over from the
+         * previous call to read().
          */
         private class ExcessBytes {
 
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
index a3647197c6..450868e2e0 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
@@ -24,6 +24,7 @@ import java.io.InputStreamReader;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Objects;
 import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
@@ -36,8 +37,9 @@ import org.sleuthkit.datamodel.TskException;
 /**
  * Extracts raw strings from content.
  */
-public class StringsTextExtractor extends ContentTextExtractor {
-    
+public final class StringsTextExtractor extends ContentTextExtractor {
+      
+    static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
     private boolean extractUTF8;
     private boolean extractUTF16;
     
@@ -49,23 +51,28 @@ public class StringsTextExtractor extends ContentTextExtractor {
         EXTRACT_UTF8, ///< extract UTF8 text, true/false
     };
 
-    static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
-
-    @Override
-    public void parseContext(ExtractionContext context) {
-        StringsExtractionConfig configInstance = context.get(StringsExtractionConfig.class);
-        if(configInstance != null) {
-            extractUTF8 = configInstance.getExtractUTF8();
-            extractUTF16 = configInstance.getExtractUTF16();
-            setScripts(configInstance.getExtractScripts());
-        }
-    }
-
+    /**
+     * Determines if this extractor may only read particular types of content.
+     * 
+     * Since Strings may be run on any content type, it is not content specific.
+     * 
+     * @return false, since this extractor is not content type specific
+     */
     @Override
     public boolean isContentTypeSpecific() {
-        return true;
+        return false;
     }
 
+    /**
+     * Determines if this extractor can read the content type. 
+     * 
+     * Note: Strings can be run on any type of content, so all types
+     * will return true.
+     * 
+     * @param file Content source to read
+     * @param detectedFormat Mimetype of source file.
+     * @return true
+     */
     @Override
     public boolean isSupported(Content file, String detectedFormat) {
         return true;
@@ -73,9 +80,34 @@ public class StringsTextExtractor extends ContentTextExtractor {
 
     private final List<SCRIPT> extractScripts = new ArrayList<>();
     
+    /**
+     * Accepts a context instance for run-time configuration.
+     * 
+     * See StringsExtractionConfig.java for available extraction settings. 
+     * 
+     * @param context Instance that contains config classes.
+     */
+    public StringsTextExtractor(ExtractionContext context) {
+        this();
+        if(context != null && context.contains(StringsExtractionConfig.class)) {
+            StringsExtractionConfig configInstance = context.get(StringsExtractionConfig.class);
+            extractUTF8 = configInstance.getExtractUTF8();
+            extractUTF16 = configInstance.getExtractUTF16();
+            if(Objects.nonNull(configInstance.getExtractScripts())) {
+                setScripts(configInstance.getExtractScripts());
+            }
+        }
+    }
+    
+    /**
+     * Creates a default StringsTextExtractor instance. The instance will be 
+     * configured to run only LATIN_2 as its default extraction script and 
+     * UTF-8 as its default encoding.
+     */
     public StringsTextExtractor() {
         //LATIN_2 is the default script
         extractScripts.add(SCRIPT.LATIN_2);
+        extractUTF8 = true;
     }
 
     /**
@@ -83,7 +115,7 @@ public class StringsTextExtractor extends ContentTextExtractor {
      *
      * @param extractScripts scripts to use
      */
-    public void setScripts(List<SCRIPT> extractScripts) {
+    public final void setScripts(List<SCRIPT> extractScripts) {
         this.extractScripts.clear();
         this.extractScripts.addAll(extractScripts);
     }
@@ -102,11 +134,26 @@ public class StringsTextExtractor extends ContentTextExtractor {
         logger.log(Level.WARNING, msg, ex); //NON-NLS  }
     }
 
+    /**
+     * Determines if this extractor is disabled and should not be run.
+     * 
+     * At least one of the extraction encodings in StringsExtractionConfig must 
+     * be set for this extractor to run.
+     * 
+     * @return true if neither UTF-8 nor UTF-16 extraction is enabled
+     */
     @Override
     public boolean isDisabled() {
         return extractUTF8 == false && extractUTF16 == false;
     }
 
+    /**
+     * Returns a reader that will iterate over the text of the content source.
+     * 
+     * @param content Content source of any type
+     * @return A reader instance that content text can be obtained from
+     * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException 
+     */
     @Override
     public InputStreamReader getReader(Content content) throws TextExtractorException {
         InputStream stringStream = getInputStream(content);
@@ -230,12 +277,7 @@ public class StringsTextExtractor extends ContentTextExtractor {
                 }
                 //get char from cur read buf
                 char c = (char) curReadBuf[readBufOffset++];
-                if (c == 0 && singleConsecZero == false) {
-                    //preserve the current sequence if max consec. 1 zero char
-                    singleConsecZero = true;
-                } else {
-                    singleConsecZero = false;
-                }
+                singleConsecZero = c == 0 && singleConsecZero == false; //preserve the current sequence if max consec. 1 zero char
                 if (StringExtract.isPrintableAscii(c)) {
                     tempString.append(c);
                     ++tempStringLen;
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
index 1efed82c05..598a4cef20 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
@@ -1,7 +1,7 @@
 /*
  * Autopsy Forensic Browser
  *
- * Copyright 2011-16 Basis Technology Corp.
+ * Copyright 2011-18 Basis Technology Corp.
  * Contact: carrier <at> sleuthkit <dot> org
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -25,11 +25,11 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
  * Extracts text out of a SleuthkitVisitableItem, and exposes it is a Reader.
  * This Reader is given to the Ingester to chunk and index in Solr.
  *
- * @param <TextSource> The subtype of SleuthkitVisitableItem an implementation
+ * @param <T> The subtype of SleuthkitVisitableItem an implementation
  *                     is able to process.
  */
-public interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
-
+public interface TextExtractor<T extends SleuthkitVisitableItem> {
+    
     /**
      * Is this extractor configured such that no extraction will/should be done?
      *
@@ -40,39 +40,42 @@ public interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
     /**
      * Log the given message and exception as a warning.
      *
-     * @param msg
-     * @param ex
+     * @param msg Log message
+     * @param ex Exception associated with the incoming message
      */
     abstract void logWarning(String msg, Exception ex);
 
     /**
-     * Get a reader that over the text extracted from the given source.
+     * Get a reader that will iterate over the text extracted from the given source.
      *
-     * @param source
+     * @param source Source content of type T
      *
      * @return
      * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
      */
-    abstract Reader getReader(TextSource source) throws TextExtractorException;
+    abstract Reader getReader(T source) throws TextExtractorException;
 
     /**
      * Get the 'object' id of the given source.
      *
-     * @param source
+     * @param source Source content of type T
      *
-     * @return
+     * @return Object id of the source content
      */
-    abstract long getID(TextSource source);
+    abstract long getID(T source);
 
     /**
      * Get a human readable name for the given source.
      *
-     * @param source
+     * @param source Source content of type T
      *
-     * @return
+     * @return Name of the content source
      */
-    abstract String getName(TextSource source);
+    abstract String getName(T source);
 
+    /**
+     * System exception for dealing with errors encountered during extraction.
+     */
     class TextExtractorException extends Exception {
 
         public TextExtractorException(String message) {
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index f61732ad97..48c9f3f9bb 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -1,57 +1,110 @@
 /*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2018-2018 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 package org.sleuthkit.autopsy.textextractors;
 
 import com.google.common.collect.ImmutableList;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.logging.Level;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
 
 /**
+ * Factory for creating text extractors given a source file and a mimetype.
  *
- * @author dsmyda
+ * See ContentTextExtractor.java for the generic structure of such extractors.
  */
 public class TextExtractorFactory {
-    private static final ImmutableList<ContentTextExtractor> extractors = 
-                ImmutableList.of(new HtmlTextExtractor(), 
-                                 new SqliteTextExtractor(),
-                                 new TikaTextExtractor());
-        
+
+    private static final Logger logger = Logger.getLogger(TextExtractorFactory.class.getName());
+
     /**
-     * Auto detects the corrent text extractor given the file and mimetype. Context 
-     * 
-     * @param file
-     * @param mimeType
-     * @param context
-     * @return 
-     * @throws org.sleuthkit.autopsy.textextractors.TextExtractorFactory.NoSpecializedExtractorException 
+     * The order of these extractors is important. It is a must that more
+     * specialized solutions are placed before the TikaTextExtractor to ensure
+     * these solutions are chosen over Tika.
      */
-    public static ContentTextExtractor getSpecializedExtractor(AbstractFile file, 
+    private static final ImmutableList<Class<?>> extractors
+            = ImmutableList.of(HtmlTextExtractor.class,
+                    SqliteTextExtractor.class,
+                    TikaTextExtractor.class);
+
+    /**
+     * Auto detects the correct text extractor given the file and mimetype.
+     *
+     * TextExtractors can be configured using the ExtractionContext object.
+     * Passing in null or a new unmodified instance of ExtractionContext will
+     * keep the extractors at default settings. Refer to the extractionconfigs
+     * package for available file configurations.
+     *
+     * @param file     AbstractFile that will be read from
+     * @param mimeType Mimetype of source file
+     * @param context  Contains extraction configurations for certain file types
+     *
+     * @return A ContentTextExtractor instance that is properly configured and
+     *         can be read from the getReader() method.
+     *
+     * @throws NoSpecializedExtractorException In the event that the inputted
+     *                                         file and mimetype have no
+     *                                         corresponding extractor
+     */
+    public static ContentTextExtractor getSpecializedExtractor(AbstractFile file,
             String mimeType, ExtractionContext context) throws NoSpecializedExtractorException {
-        for(ContentTextExtractor candidate : extractors) {
-            candidate.parseContext(context);
-            if(candidate.isSupported(file, mimeType)) {
-                try {
-                    ContentTextExtractor extractorInstance = candidate.getClass().newInstance();
-                    extractorInstance.parseContext(context);
-                    return extractorInstance;
-                } catch (InstantiationException | IllegalAccessException ex) {
-                    
+        for (Class<?> candidate : extractors) {
+            try {
+                Constructor<?> constructor = candidate.getDeclaredConstructor(ExtractionContext.class);
+                constructor.setAccessible(true);
+                ContentTextExtractor newInstance = (ContentTextExtractor) constructor.newInstance(context);
+                if (newInstance.isSupported(file, mimeType)) {
+                    return newInstance;
                 }
+            } catch (NoSuchMethodException | SecurityException
+                    | InstantiationException | IllegalAccessException
+                    | IllegalArgumentException | InvocationTargetException ex) {
+                logger.log(Level.SEVERE, String.format("Could not instantiate ContentTextExtractor "
+                        + "instance for file %s, objId=%d and mimeType=%s", file.getName(),
+                        file.getId(), mimeType), ex);
             }
         }
-        
-        throw new NoSpecializedExtractorException("Could not find a suitable extractor for mimetype ["+mimeType+"]");
+
+        throw new NoSpecializedExtractorException("Could not find a suitable extractor for "
+                + "mimetype [" + mimeType + "]");
     }
-    
+
+    /**
+     * Returns the default extractor that can be run on any content type. This
+     * extractor should be used as a backup in the event that no specialized
+     * extractor can be found.
+     *
+     * @param context Contains extraction configurations for certain file types
+     *
+     * @return A StringsTextExtractor instance
+     */
     public static StringsTextExtractor getDefaultExtractor(ExtractionContext context) {
-        StringsTextExtractor instance = new StringsTextExtractor();
-        instance.parseContext(context);
-        return instance;
+        return new StringsTextExtractor(context);
     }
-    
+
+    /**
+     * System level exception for handling content types that have no specific 
+     * strategy defined for extracting their text.
+     */
     public static class NoSpecializedExtractorException extends Exception {
+
         public NoSpecializedExtractorException(String msg) {
             super(msg);
         }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index 6ec09d2333..3549bb9a6e 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -25,14 +25,12 @@ import java.io.PushbackReader;
 import java.io.Reader;
 import java.nio.file.Paths;
 import java.util.List;
-import java.util.Objects;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.logging.Level;
-import java.util.logging.Logger;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.tika.Tika;
@@ -47,7 +45,6 @@ import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.openide.util.NbBundle;
 import org.openide.modules.InstalledFileLocator;
 import org.sleuthkit.autopsy.coreutils.PlatformUtil;
-import org.sleuthkit.autopsy.ingest.IngestServices;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
@@ -56,18 +53,15 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * Extracts text from Tika supported content. Protects against Tika
  * parser hangs (for unexpected/corrupt content) using a timeout mechanism.
  */
-public class TikaTextExtractor extends ContentTextExtractor {
+public final class TikaTextExtractor extends ContentTextExtractor {
     
-    private boolean OCREnabled;
-    private final IngestServices services = IngestServices.getInstance();
-    private final Logger logger = services.getLogger(TikaTextExtractor.class.getName());
-
     private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
 
     private final AutoDetectParser parser = new AutoDetectParser();
     
+    private boolean tesseractOCREnabled;
     private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
     private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
     private static final File TESSERACT_PATH = locateTesseractExecutable();
@@ -78,11 +72,43 @@ public class TikaTextExtractor extends ContentTextExtractor {
                     .map(mt -> mt.getType() + "/" + mt.getSubtype())
                     .collect(Collectors.toList());
 
+    /**
+     * Accepts a context instance for run-time configuration.
+     * 
+     * The only configuration that is available to date is the 
+     * ImageFileExtractionConfig.java. You may refer to this class for
+     * supported settings.
+     * 
+     * @param context Instance that contains config classes
+     */
+    public TikaTextExtractor(ExtractionContext context) {
+        if(context != null && context.contains(ImageFileExtractionConfig.class)) {
+            ImageFileExtractionConfig configInstance = context.get(ImageFileExtractionConfig.class);
+            this.tesseractOCREnabled = configInstance.getOCREnabled();
+        }
+    }
+    
+    /**
+     * Creates a default TikaTextExtractor. OCR is turned off by default due to speed
+     * concerns.
+     */
+    public TikaTextExtractor() {    
+    }
+
     @Override
     public void logWarning(final String msg, Exception ex) {
         tikaLogger.log(Level.WARNING, msg, ex);
     }
 
+    /**
+     * Returns a reader that will iterate over the text extracted from Apache 
+     * Tika. 
+     * 
+     * @param content Supported source content to extract
+     * @return Reader that contains Apache Tika extracted text
+     * 
+     * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException 
+     */
     @Override
     public Reader getReader(Content content) throws TextExtractorException {
         ReadContentInputStream stream = new ReadContentInputStream(content);
@@ -99,7 +125,7 @@ public class TikaTextExtractor extends ContentTextExtractor {
         parseContext.set(OfficeParserConfig.class, officeParserConfig);
         
         // configure OCR if it is enabled in KWS settings and installed on the machine
-        if (TESSERACT_PATH != null && OCREnabled && PlatformUtil.isWindowsOS() == true) {
+        if (TESSERACT_PATH != null && tesseractOCREnabled && PlatformUtil.isWindowsOS() == true) {
             
             // configure PDFParser. 
             PDFParserConfig pdfConfig = new PDFParserConfig();
@@ -145,7 +171,6 @@ public class TikaTextExtractor extends ContentTextExtractor {
         } catch (TextExtractorException ex) {
             throw ex;
         } catch (Exception ex) {
-            logger.log(Level.SEVERE, "",ex);
             tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
             logWarning(msg, ex);
@@ -195,11 +220,26 @@ public class TikaTextExtractor extends ContentTextExtractor {
                         ));
     }
 
+    /**
+     * Determines if this extractor only understands a specific type of content.
+     * 
+     * Although Apache Tika is defined for many input types, it is still a content
+     * specific approach to extraction.
+     * 
+     * @return true
+     */
     @Override
     public boolean isContentTypeSpecific() {
         return true;
     }
 
+    /**
+     * Determines if Tika is supported for this content type and mimetype.
+     * 
+     * @param content Source content to read
+     * @param detectedFormat Mimetype of content
+     * @return Flag indicating support for reading content type
+     */
     @Override
     public boolean isSupported(Content content, String detectedFormat) {
         if (detectedFormat == null
@@ -213,6 +253,14 @@ public class TikaTextExtractor extends ContentTextExtractor {
         return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
     }
 
+    /**
+     * Determines if this extractor is disabled.
+     * 
+     * So long as Tika's dependencies are present, this extractor can run 
+     * no matter the circumstance.
+     * 
+     * @return false, since this extractor is never disabled
+     */
     @Override
     public boolean isDisabled() {
         return false;
@@ -241,14 +289,6 @@ public class TikaTextExtractor extends ContentTextExtractor {
 
     }
 
-    @Override
-    public void parseContext(ExtractionContext context) {
-        ImageFileExtractionConfig configInstance = context.get(ImageFileExtractionConfig.class);
-        if(Objects.nonNull(configInstance)) {
-            this.OCREnabled = configInstance.getOCREnabled();
-        }
-    }
-
     /**
      * An implementation of CharSource that just wraps an existing reader and
      * returns it in openStream().
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/HTMLExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/HTMLExtractionConfig.java
deleted file mode 100755
index 6a15d57165..0000000000
--- a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/HTMLExtractionConfig.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
- */
-package org.sleuthkit.autopsy.textextractors.extractionconfigs;
-
-/**
- *
- * @author dsmyda
- */
-public class HTMLExtractionConfig {
-    private int contentSizeLimit;
-        
-    public void setContentSizeLimit(int size) {
-        this.contentSizeLimit = size;
-    }
-
-    public int getContentSizeLimit() {
-        return this.contentSizeLimit;
-    }
-}
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
index 1aa320b84a..4f5a16220c 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
@@ -1,21 +1,44 @@
 /*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2018-2018 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 package org.sleuthkit.autopsy.textextractors.extractionconfigs;
 
 /**
- *
- * @author dsmyda
+ * Allows for configuration of image file extraction.
  */
 public class ImageFileExtractionConfig {
     private boolean OCREnabled;
-        
+    
+    /**
+     * Enables OCR to be run on the text extractor responsible for handling
+     * image files.
+     * 
+     * @param enabled Flag indicating if OCR is enabled.
+     */
     public void setOCREnabled(boolean enabled) {
         this.OCREnabled = enabled;
     }
 
+    /**
+     * Gets the OCR flag that has been set. By default this flag is turned off.
+     * 
+     * @return Flag indicating if OCR is enabled.
+     */
     public boolean getOCREnabled() {
         return this.OCREnabled;
     }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java
index 1850552077..8c46a80b1e 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java
@@ -1,7 +1,20 @@
 /*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2018-2018 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 package org.sleuthkit.autopsy.textextractors.extractionconfigs;
 
@@ -9,35 +22,66 @@ import java.util.List;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
 
 /**
- *
- * @author dsmyda
+ * Allows for configuration of the StringsTextExtractor (the default extractor
+ * for all content types).
  */
 public class StringsExtractionConfig {
     private boolean extractUTF8;
     private boolean extractUTF16;
     private List<SCRIPT> extractScripts;
 
+    /**
+     * Enables the UTF-8 encoding to be used during strings extraction.
+     * 
+     * @param enabled Flag indicating if UTF-8 should be turned on
+     */
     public void setExtractUTF8(boolean enabled) {
         this.extractUTF8 = enabled;
     }
 
+    /**
+     * Enables the UTF-16 encoding to be used during strings extraction.
+     * 
+     * @param enabled Flag indicating if UTF-16 should be turned on
+     */
     public void setExtractUTF16(boolean enabled) {
         this.extractUTF16 = enabled;
     }
 
+    /**
+     * Returns whether extracting with UTF-8 encoding should be done.
+     * 
+     * @return Flag indicating if UTF-8 has been turned on/off
+     */
     public boolean getExtractUTF8() {
         return extractUTF8;
     }
 
+    /**
+     * Return whether extracting with UTF-16 encoding should be done.
+     * 
+     * @return Flag indicating if UTF-16 has been turned on/off
+     */
     public boolean getExtractUTF16() { 
         return extractUTF16;
     }
     
+    /**
+     * Sets the type of extraction scripts that will be used during this 
+     * extraction.
+     * 
+     * @param scripts Desired set of scripts to be used during extraction
+     */
     public void setExtractScripts(List<SCRIPT> scripts) {
         this.extractScripts = scripts;
     }
     
+    /**
+     * Gets the desired set of scripts to be used during extraction.
+     * 
+     * @return Set of extraction scripts to be used
+     */
     public List<SCRIPT> getExtractScripts() {
         return this.extractScripts;
     }
-}
+}
\ No newline at end of file
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/TextFileExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/TextFileExtractionConfig.java
deleted file mode 100755
index b2cb62d728..0000000000
--- a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/TextFileExtractionConfig.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
- */
-package org.sleuthkit.autopsy.textextractors.extractionconfigs;
-
-/**
- *
- * @author dsmyda
- */
-public class TextFileExtractionConfig {
-    private int minConfidenceInCharsetDetection;
-        
-    public void setMinConfidenceInCharsetDetection(int conf) {
-        this.minConfidenceInCharsetDetection = conf;
-    }
-
-    public int getMinConfidenceInCharsetDetection() {
-        return this.minConfidenceInCharsetDetection;
-    }
-}
diff --git a/Core/src/org/sleuthkit/autopsy/texttranslation/TranslationException.java b/Core/src/org/sleuthkit/autopsy/texttranslation/TranslationException.java
index 9c03f322dd..4d8703e585 100755
--- a/Core/src/org/sleuthkit/autopsy/texttranslation/TranslationException.java
+++ b/Core/src/org/sleuthkit/autopsy/texttranslation/TranslationException.java
@@ -19,7 +19,7 @@
 package org.sleuthkit.autopsy.texttranslation;
 
 /**
- * Provides a system exception for the Text Translation errors
+ * Provides a system exception for Text Translation errors
  */
 public class TranslationException extends Exception {
     
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index 451e5300b2..8c25600099 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -45,7 +45,6 @@ import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
 import org.sleuthkit.autopsy.textextractors.ExtractionContext;
 import org.sleuthkit.autopsy.textextractors.StringsTextExtractor;
 import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
-import org.sleuthkit.autopsy.textextractors.extractionconfigs.HTMLExtractionConfig;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.StringsExtractionConfig;
 import org.sleuthkit.datamodel.AbstractFile;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
index 2156b35b7e..e8ddf1eea9 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
@@ -26,7 +26,6 @@ import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
-import org.sleuthkit.autopsy.textextractors.ExtractionContext;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
@@ -80,9 +79,4 @@ final class TextFileExtractor extends ContentTextExtractor {
     public void logWarning(String msg, Exception ex) {
         logger.log(Level.WARNING, msg, ex);
     }
-
-    @Override
-    public void parseContext(ExtractionContext context) {
-        //Nothing
-    }
 }

From 70768bb0e5358b46078bf630bbf285b2d51f1131 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Tue, 4 Dec 2018 11:37:53 -0500
Subject: [PATCH 05/18] Fixed typos, made strings internationalized and
 improved readability

---
 .../textextractors/ContentTextExtractor.java  | 13 +++--
 .../textextractors/ExtractionContext.java     |  2 +-
 .../textextractors/HtmlTextExtractor.java     | 56 ++++++++++---------
 .../textextractors/SqliteTextExtractor.java   |  5 +-
 .../textextractors/StringsTextExtractor.java  | 31 +++++-----
 .../autopsy/textextractors/TextExtractor.java | 18 +++---
 .../textextractors/TikaTextExtractor.java     |  7 ++-
 .../ImageFileExtractionConfig.java            |  2 +-
 .../StringsExtractionConfig.java              |  8 +--
 ...wordSearchGlobalLanguageSettingsPanel.java | 15 ++---
 .../KeywordSearchIngestModule.java            | 54 ++++--------------
 .../KeywordSearchJobSettingsPanel.java        |  6 +-
 .../keywordsearch/KeywordSearchSettings.java  |  9 +--
 13 files changed, 103 insertions(+), 123 deletions(-)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
index 54f55d6a86..956a740378 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
@@ -18,6 +18,7 @@
  */
 package org.sleuthkit.autopsy.textextractors;
 
+import com.google.common.collect.ImmutableList;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.List;
@@ -31,16 +32,16 @@ public abstract class ContentTextExtractor implements TextExtractor<Content> {
     
     //Mimetype groups to aassist extractor implementations in ignoring binary and 
     //archive files.
-    static final List<String> BINARY_MIME_TYPES
-            = Arrays.asList(
+    public static final List<String> BINARY_MIME_TYPES
+            = ImmutableList.of(
                     //ignore binary blob data, for which string extraction will be used
                     "application/octet-stream", //NON-NLS
                     "application/x-msdownload"); //NON-NLS
 
     /** generally text extractors should ignore archives and let unpacking
      * modules take care of them */
-    static final List<String> ARCHIVE_MIME_TYPES
-            = Arrays.asList(
+    public static final List<String> ARCHIVE_MIME_TYPES
+            = ImmutableList.of(
                     //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                     "application/x-7z-compressed", //NON-NLS
                     "application/x-ace-compressed", //NON-NLS
@@ -75,7 +76,7 @@ public abstract class ContentTextExtractor implements TextExtractor<Content> {
                     "application/x-compress"); //NON-NLS
 
     /**
-     * Accepts content instance to allow for extractor configuration on specific 
+     * Accepts context instance to allow for extractor configuration on specific 
      * file types. 
      * 
      * See extractionconfigs package for available file type configurations.
@@ -117,7 +118,7 @@ public abstract class ContentTextExtractor implements TextExtractor<Content> {
      * Returns a reader that will iterate over the text of the source content.
      * 
      * @param source Content source to read
-     * @return A reader that contains all source text.
+     * @return A reader that contains all source text
      * @throws TextExtractorException Error encountered during extraction
      */
     @Override
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java b/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java
index 03c33c1ad9..d1e267ea6c 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java
@@ -38,7 +38,7 @@ public class ExtractionContext {
     /**
      * Internally stores a class-instance pair.
      *
-     * @param <T>            Class type that will be stored.
+     * @param <T>            Class type that will be stored
      * @param configClass    The class object of the instance
      * @param configInstance Config instance of type T
      */
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
index 6f9088bfc1..2e301ac7a4 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
@@ -41,7 +41,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 public final class HtmlTextExtractor extends ContentTextExtractor {
 
     static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
-    private int maxSize;
+    private final int MAX_SIZE;
 
     static final List<String> WEB_MIME_TYPES = Arrays.asList(
             "application/javascript", //NON-NLS
@@ -51,40 +51,40 @@ public final class HtmlTextExtractor extends ContentTextExtractor {
             "text/html", //NON-NLS NON-NLS
             "text/javascript" //NON-NLS
     );
-    
+
     static {
         // Disable Jericho HTML Parser log messages.
         Config.LoggerProvider = LoggerProvider.DISABLED;
     }
 
     /**
-     * Configures the extractor to use the settings HTMLExtractionConfig instance
-     * stored in the ExtractionContext object. 
-     * 
-     * As of now, there are no configurable features of this extractor.
-     * 
+     * Configures the extractor to use the settings in the HTMLExtractionConfig
+     * instance stored in the ExtractionContext object.
+     *
+     * As of now, there are no configurable features for this extractor.
+     *
      * @param context Instance containing config classes
      */
     public HtmlTextExtractor(ExtractionContext context) {
         this();
     }
-    
+
     /**
-     * Creates a default instance of HtmlTextExtractor. Supported file size
+     * Creates a default instance of the HtmlTextExtractor. Supported file size
      * is 50MB.
      */
     public HtmlTextExtractor() {
         //Set default to be 50 MB.
-        maxSize = 50_000_000;
+        MAX_SIZE = 50_000_000;
     }
 
     /**
-     * Determines if this extractor is responsible for extracting only a specific 
-     * type of media.
-     * 
+     * Determines if this extractor is responsible for extracting only a
+     * specific type of media.
+     *
      * In this case, only HTML documents can be read successfully.
-     * 
-     * @return true 
+     *
+     * @return true
      */
     @Override
     public boolean isContentTypeSpecific() {
@@ -93,25 +93,27 @@ public final class HtmlTextExtractor extends ContentTextExtractor {
 
     /**
      * Determines if this content type is supported by this extractor.
-     * 
-     * @param content Content instance to be analyzed
+     *
+     * @param content        Content instance to be analyzed
      * @param detectedFormat Mimetype of content instance
-     * @return flag indicating supporting
+     *
+     * @return flag indicating support
      */
     @Override
     public boolean isSupported(Content content, String detectedFormat) {
-        boolean notNull = detectedFormat != null;
-        boolean supported = WEB_MIME_TYPES.contains(detectedFormat);
-        boolean size = content.getSize() <= maxSize;
-        return notNull && supported && size;
+        return detectedFormat != null
+                && WEB_MIME_TYPES.contains(detectedFormat)
+                && content.getSize() <= MAX_SIZE;
     }
 
     /**
-     * Returns a reader that will iterate over the text of an Html document.
-     * 
+     * Returns a reader that will iterate over the text of an HTML document.
+     *
      * @param content Html document source
+     *
      * @return A reader instance containing the document source text
-     * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException 
+     *
+     * @throws TextExtractorException
      */
     @Override
     public Reader getReader(Content content) throws TextExtractorException {
@@ -216,8 +218,8 @@ public final class HtmlTextExtractor extends ContentTextExtractor {
 
     /**
      * Indicates if this extractor can run.
-     * 
-     * @return Flag indicating if this extractor can run. 
+     *
+     * @return Flag indicating if this extractor can run.
      */
     @Override
     public boolean isDisabled() {
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
index a57654b4b9..908935d88f 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
@@ -113,10 +113,9 @@ public final class SqliteTextExtractor extends ContentTextExtractor {
      *
      * @param source Content file
      *
-     * @return An InputStream that reads from a Sqlite database.
+     * @return An InputStream that reads from a Sqlite database
      *
-     * @throws
-     * org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
+     * @throws TextExtractorException
      */
     @Override
     public Reader getReader(Content source) throws TextExtractorException {
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
index 450868e2e0..829f8d18c2 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
@@ -42,21 +42,14 @@ public final class StringsTextExtractor extends ContentTextExtractor {
     static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
     private boolean extractUTF8;
     private boolean extractUTF16;
-    
-    /**
-     * Options for this extractor
-     */
-    public enum ExtractOptions {
-        EXTRACT_UTF16, ///< extract UTF16 text, true/false
-        EXTRACT_UTF8, ///< extract UTF8 text, true/false
-    };
+    private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
 
     /**
      * Determines if this extractor may only read particular types of content.
      * 
      * Since Strings may be run on any content type, it is not content specific.
      * 
-     * @return 
+     * @return false 
      */
     @Override
     public boolean isContentTypeSpecific() {
@@ -67,7 +60,7 @@ public final class StringsTextExtractor extends ContentTextExtractor {
      * Determines if this extractor can read the content type. 
      * 
      * Note: Strings can be run on any type of content, so all types
-     * will return true;
+     * will return true.
      * 
      * @param file Content source to read
      * @param detectedFormat Mimetype of source file.
@@ -91,8 +84,12 @@ public final class StringsTextExtractor extends ContentTextExtractor {
         this();
         if(context != null && context.contains(StringsExtractionConfig.class)) {
             StringsExtractionConfig configInstance = context.get(StringsExtractionConfig.class);
-            extractUTF8 = configInstance.getExtractUTF8();
-            extractUTF16 = configInstance.getExtractUTF16();
+            if(Objects.nonNull(configInstance.getExtractUTF8())) {
+                extractUTF8 = configInstance.getExtractUTF8();
+            }
+            if(Objects.nonNull(configInstance.getExtractUTF16())) {
+                extractUTF16 = configInstance.getExtractUTF16();
+            }
             if(Objects.nonNull(configInstance.getExtractScripts())) {
                 setScripts(configInstance.getExtractScripts());
             }
@@ -116,6 +113,10 @@ public final class StringsTextExtractor extends ContentTextExtractor {
      * @param extractScripts scripts to use
      */
     public final void setScripts(List<SCRIPT> extractScripts) {
+        if(extractScripts == null) {
+            return;
+        }
+        
         this.extractScripts.clear();
         this.extractScripts.addAll(extractScripts);
     }
@@ -157,7 +158,7 @@ public final class StringsTextExtractor extends ContentTextExtractor {
     @Override
     public InputStreamReader getReader(Content content) throws TextExtractorException {
         InputStream stringStream = getInputStream(content);
-        return new InputStreamReader(stringStream, Charset.forName("UTF-8"));
+        return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
     }
 
     InputStream getInputStream(Content content) {
@@ -356,7 +357,7 @@ public final class StringsTextExtractor extends ContentTextExtractor {
         private int copyToReturn(byte[] b, int off, long len) {
             final String curStringS = curString.toString();
             //logger.log(Level.INFO, curStringS);
-            byte[] stringBytes = curStringS.getBytes(Charset.forName("UTF-8"));
+            byte[] stringBytes = curStringS.getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
             System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
             //logger.log(Level.INFO, curStringS);
             //copied all string, reset
@@ -527,7 +528,7 @@ public final class StringsTextExtractor extends ContentTextExtractor {
          */
         private void convert(int numBytes) {
             lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
-            convertBuff = lastExtractResult.getText().getBytes(Charset.forName("UTF-8"));
+            convertBuff = lastExtractResult.getText().getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
             //reset tracking vars
             if (lastExtractResult.getNumBytes() == 0) {
                 bytesInConvertBuff = 0;
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
index 598a4cef20..ae3a7c8f6a 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
@@ -25,11 +25,11 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
  * Extracts text out of a SleuthkitVisitableItem, and exposes it is a Reader.
  * This Reader is given to the Ingester to chunk and index in Solr.
  *
- * @param <T> The subtype of SleuthkitVisitableItem an implementation
- *                     is able to process.
+ * @param <T> The subtype of SleuthkitVisitableItem an implementation is able to
+ *            process.
  */
 public interface TextExtractor<T extends SleuthkitVisitableItem> {
-    
+
     /**
      * Is this extractor configured such that no extraction will/should be done?
      *
@@ -41,17 +41,19 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
      * Log the given message and exception as a warning.
      *
      * @param msg Log message
-     * @param ex Exception associated with the incoming message
+     * @param ex  Exception associated with the incoming message
      */
     abstract void logWarning(String msg, Exception ex);
 
     /**
-     * Get a reader that will iterate over the text extracted from the given source.
+     * Get a reader that will iterate over the text extracted from the given
+     * source.
      *
-     * @param source 
+     * @param source source content of type T
      *
-     * @return
-     * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
+     * @return Reader instance that contains the text of the source
+     *
+     * @throws TextExtractorException
      */
     abstract Reader getReader(T source) throws TextExtractorException;
 
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index 3549bb9a6e..6a8b267799 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -25,12 +25,14 @@ import java.io.PushbackReader;
 import java.io.Reader;
 import java.nio.file.Paths;
 import java.util.List;
+import java.util.Objects;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.logging.Level;
+import java.util.logging.Logger;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.tika.Tika;
@@ -56,6 +58,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 public final class TikaTextExtractor extends ContentTextExtractor {
     
     private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
+    
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
 
@@ -84,7 +87,9 @@ public final class TikaTextExtractor extends ContentTextExtractor {
     public TikaTextExtractor(ExtractionContext context) {
         if(context != null && context.contains(ImageFileExtractionConfig.class)) {
             ImageFileExtractionConfig configInstance = context.get(ImageFileExtractionConfig.class);
-            this.tesseractOCREnabled = configInstance.getOCREnabled();
+            if(Objects.nonNull(configInstance.getOCREnabled())) {
+                this.tesseractOCREnabled = configInstance.getOCREnabled();
+            }
         }
     }
     
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
index 4f5a16220c..8051c8c8c2 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
@@ -22,7 +22,7 @@ package org.sleuthkit.autopsy.textextractors.extractionconfigs;
  * Allows for configuration of image file extraction.
  */
 public class ImageFileExtractionConfig {
-    private boolean OCREnabled;
+    private Boolean OCREnabled;
     
     /**
      * Enables OCR to be run on the text extractor responsible for handling
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java
index 8c46a80b1e..962e972a7f 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java
@@ -26,8 +26,8 @@ import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.S
  * for all content types).
  */
 public class StringsExtractionConfig {
-    private boolean extractUTF8;
-    private boolean extractUTF16;
+    private Boolean extractUTF8;
+    private Boolean extractUTF16;
     private List<SCRIPT> extractScripts;
 
     /**
@@ -53,7 +53,7 @@ public class StringsExtractionConfig {
      * 
      * @return Flag indicating if UTF-8 has been turned on/off
      */
-    public boolean getExtractUTF8() {
+    public Boolean getExtractUTF8() {
         return extractUTF8;
     }
 
@@ -62,7 +62,7 @@ public class StringsExtractionConfig {
      * 
      * @return Flag indicating if UTF-16 has been turned on/off
      */
-    public boolean getExtractUTF16() { 
+    public Boolean getExtractUTF16() { 
         return extractUTF16;
     }
     
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
index 8a5745c974..1ef66e2a3a 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
@@ -36,6 +36,7 @@ import org.sleuthkit.autopsy.coreutils.PlatformUtil;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
 import org.sleuthkit.autopsy.ingest.IngestManager;
+import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.StringsExtractOptions;
 
 /**
  * Child panel of the global settings panel (Languages tab).
@@ -46,7 +47,7 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
     private final Map<String, StringExtract.StringExtractUnicodeTable.SCRIPT> scripts = new HashMap<>();
     private ActionListener updateLanguagesAction;
     private List<SCRIPT> toUpdate;
-
+    
     KeywordSearchGlobalLanguageSettingsPanel() {
         initComponents();
         customizeComponents();
@@ -126,12 +127,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
 
     private void reloadScriptsCheckBoxes() {
         boolean utf16
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption("EXTRACT_UTF16"));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString()));
 
         enableUTF16Checkbox.setSelected(utf16);
 
         boolean utf8
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption("EXTRACT_UTF8"));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString()));
         enableUTF8Checkbox.setSelected(utf8);
 
         boolean ocr = KeywordSearchSettings.getOcrOption();
@@ -153,12 +154,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
         reloadScriptsCheckBoxes();
 
         boolean utf16
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption("EXTRACT_UTF16"));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString()));
 
         enableUTF16Checkbox.setSelected(utf16);
 
         boolean utf8
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption("EXTRACT_UTF8"));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString()));
         enableUTF8Checkbox.setSelected(utf8);
         final boolean extractEnabled = utf16 || utf8;
 
@@ -317,9 +318,9 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
 
     @Override
     public void store() {
-        KeywordSearchSettings.setStringExtractOption("EXTRACT_UTF8",
+        KeywordSearchSettings.setStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString(),
                 Boolean.toString(enableUTF8Checkbox.isSelected()));
-        KeywordSearchSettings.setStringExtractOption("EXTRACT_UTF16",
+        KeywordSearchSettings.setStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString(),
                 Boolean.toString(enableUTF16Checkbox.isSelected()));
         KeywordSearchSettings.setOcrOption(enableOcrCheckbox.isSelected());
 
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index 8c25600099..8e663878fa 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -69,46 +69,14 @@ import org.sleuthkit.datamodel.TskData.FileKnown;
 })
 public final class KeywordSearchIngestModule implements FileIngestModule {
     
-    static final List<String> ARCHIVE_MIME_TYPES
-            = Arrays.asList(
-                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
-                    "application/x-7z-compressed", //NON-NLS
-                    "application/x-ace-compressed", //NON-NLS
-                    "application/x-alz-compressed", //NON-NLS
-                    "application/x-arj", //NON-NLS
-                    "application/vnd.ms-cab-compressed", //NON-NLS
-                    "application/x-cfs-compressed", //NON-NLS
-                    "application/x-dgc-compressed", //NON-NLS
-                    "application/x-apple-diskimage", //NON-NLS
-                    "application/x-gca-compressed", //NON-NLS
-                    "application/x-dar", //NON-NLS
-                    "application/x-lzx", //NON-NLS
-                    "application/x-lzh", //NON-NLS
-                    "application/x-rar-compressed", //NON-NLS
-                    "application/x-stuffit", //NON-NLS
-                    "application/x-stuffitx", //NON-NLS
-                    "application/x-gtar", //NON-NLS
-                    "application/x-archive", //NON-NLS
-                    "application/x-executable", //NON-NLS
-                    "application/x-gzip", //NON-NLS
-                    "application/zip", //NON-NLS
-                    "application/x-zoo", //NON-NLS
-                    "application/x-cpio", //NON-NLS
-                    "application/x-shar", //NON-NLS
-                    "application/x-tar", //NON-NLS
-                    "application/x-bzip", //NON-NLS
-                    "application/x-bzip2", //NON-NLS
-                    "application/x-lzip", //NON-NLS
-                    "application/x-lzma", //NON-NLS
-                    "application/x-lzop", //NON-NLS
-                    "application/x-z", //NON-NLS
-                    "application/x-compress"); //NON-NLS
-    
-    static final List<String> BINARY_MIME_TYPES
-            = Arrays.asList(
-                    //ignore binary blob data, for which string extraction will be used
-                    "application/octet-stream", //NON-NLS
-                    "application/x-msdownload"); //NON-NLS
+    /**
+     * Options for this extractor
+     */
+    enum StringsExtractOptions {
+        EXTRACT_UTF16, ///< extract UTF16 text, true/false
+        EXTRACT_UTF8, ///< extract UTF8 text, true/false
+    };
+
 
     enum UpdateFrequency {
 
@@ -287,8 +255,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         
         StringsExtractionConfig stringsConfig = new StringsExtractionConfig();
         Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
-        stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get("EXTRACT_UTF8")));
-        stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get("EXTRACT_UTF16")));
+        stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
+        stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
         stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts());
         
         extractionContext.set(StringsExtractionConfig.class, stringsConfig);
@@ -563,7 +531,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
 
             // we skip archive formats that are opened by the archive module. 
             // @@@ We could have a check here to see if the archive module was enabled though...
-            if (KeywordSearchIngestModule.ARCHIVE_MIME_TYPES.contains(fileType)) {
+            if (ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
                 try {
                     if (context.fileIngestIsCancelled()) {
                         return;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchJobSettingsPanel.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchJobSettingsPanel.java
index 10ca43b68b..379a4b37bd 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchJobSettingsPanel.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchJobSettingsPanel.java
@@ -18,7 +18,6 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
-import org.sleuthkit.autopsy.textextractors.StringsTextExtractor;
 import java.beans.PropertyChangeEvent;
 import java.beans.PropertyChangeListener;
 import java.util.ArrayList;
@@ -32,6 +31,7 @@ import javax.swing.table.TableColumn;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
 import org.sleuthkit.autopsy.ingest.IngestModuleIngestJobSettings;
 import org.sleuthkit.autopsy.ingest.IngestModuleIngestJobSettingsPanel;
+import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.StringsExtractOptions;
 
 /**
  * Ingest job settings panel for keyword search file ingest modules.
@@ -103,8 +103,8 @@ public final class KeywordSearchJobSettingsPanel extends IngestModuleIngestJobSe
     }
 
     private void displayEncodings() {
-        String utf8 = KeywordSearchSettings.getStringExtractOption("EXTRACT_UTF8");
-        String utf16 = KeywordSearchSettings.getStringExtractOption("EXTRACT_UTF16");
+        String utf8 = KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString());
+        String utf16 = KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString());
         ArrayList<String> encodingsList = new ArrayList<>();
         if (utf8 == null || Boolean.parseBoolean(utf8)) {
             encodingsList.add("UTF8");
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchSettings.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchSettings.java
index d3a61cb876..91043ee9a0 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchSettings.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchSettings.java
@@ -28,6 +28,7 @@ import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.ModuleSettings;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
+import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.StringsExtractOptions;
 import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.UpdateFrequency;
 
 //This file contains constants and settings for KeywordSearch
@@ -234,14 +235,14 @@ class KeywordSearchSettings {
             KeywordSearchSettings.setUpdateFrequency(UpdateFrequency.DEFAULT);
         }
         //setting default Extract UTF8
-        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, "EXTRACT_UTF8")) {
+        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsExtractOptions.EXTRACT_UTF8.toString())) {
             logger.log(Level.INFO, "No configuration for UTF8 found, generating default..."); //NON-NLS
-            KeywordSearchSettings.setStringExtractOption("EXTRACT_UTF8", Boolean.TRUE.toString());
+            KeywordSearchSettings.setStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
         }
         //setting default Extract UTF16
-        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, "EXTRACT_UTF16")) {
+        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsExtractOptions.EXTRACT_UTF16.toString())) {
             logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS
-            KeywordSearchSettings.setStringExtractOption("EXTRACT_UTF16", Boolean.TRUE.toString());
+            KeywordSearchSettings.setStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
         }
         //setting OCR default (disabled by default)
         if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, OCR_ENABLED)) {

From fc64b508e5370f3ad40ccac91d83f1c9427c125d Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Wed, 5 Dec 2018 09:28:14 -0500
Subject: [PATCH 06/18] Restructured the text extractors in a way that they could
 be better configured, non-public, and more consistent.

---
 .../textextractors/ContentTextExtractor.java  |  19 ----
 .../textextractors/HtmlTextExtractor.java     |  27 ++---
 .../textextractors/SqliteTextExtractor.java   |  32 +++---
 .../textextractors/StringsTextExtractor.java  | 104 +++++++++---------
 .../autopsy/textextractors/TextExtractor.java |  21 +++-
 .../textextractors/TextExtractorFactory.java  |  79 +++++++------
 .../textextractors/TikaTextExtractor.java     |  47 ++++----
 ...nfig.java => DefaultExtractionConfig.java} |   2 +-
 .../keywordsearch/ArtifactTextExtractor.java  |   5 +
 ...wordSearchGlobalLanguageSettingsPanel.java |   1 -
 .../KeywordSearchIngestModule.java            |  19 ++--
 .../keywordsearch/SolrSearchService.java      |  12 +-
 .../keywordsearch/TextFileExtractor.java      |   5 +
 13 files changed, 193 insertions(+), 180 deletions(-)
 rename Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/{StringsExtractionConfig.java => DefaultExtractionConfig.java} (98%)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
index 956a740378..49d727e636 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
@@ -20,7 +20,6 @@ package org.sleuthkit.autopsy.textextractors;
 
 import com.google.common.collect.ImmutableList;
 import java.io.Reader;
-import java.util.Arrays;
 import java.util.List;
 import org.sleuthkit.datamodel.Content;
 
@@ -74,24 +73,6 @@ public abstract class ContentTextExtractor implements TextExtractor<Content> {
                     "application/x-lzop", //NON-NLS
                     "application/x-z", //NON-NLS
                     "application/x-compress"); //NON-NLS
-
-    /**
-     * Accepts context instance to allow for extractor configuration on specific 
-     * file types. 
-     * 
-     * See extractionconfigs package for available file type configurations.
-     * 
-     * @param context Instance that contains config classes 
-     */
-    public ContentTextExtractor(ExtractionContext context) {
-        
-    }
-    
-    /**
-     * Default constructor to create extractor instances with default configurations.
-     */
-    public ContentTextExtractor() {
-    }
     
     /**
      * Determines if the extractor works only for specified types is
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
index 2e301ac7a4..3aa61700f5 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
@@ -38,7 +38,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 /**
  * Extracts text from HTML content.
  */
-public final class HtmlTextExtractor extends ContentTextExtractor {
+final class HtmlTextExtractor extends ContentTextExtractor {
 
     static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
     private final int MAX_SIZE;
@@ -57,18 +57,6 @@ public final class HtmlTextExtractor extends ContentTextExtractor {
         Config.LoggerProvider = LoggerProvider.DISABLED;
     }
 
-    /**
-     * Configures the extractor to use the settings in the HTMLExtractionConfig
-     * instance stored in the ExtractionContext object.
-     *
-     * As of now, there are no configurable features for this extractor.
-     *
-     * @param context Instance containing config classes
-     */
-    public HtmlTextExtractor(ExtractionContext context) {
-        this();
-    }
-
     /**
      * Creates a default instance of the HtmlTextExtractor. Supported file size
      * is 50MB.
@@ -230,4 +218,17 @@ public final class HtmlTextExtractor extends ContentTextExtractor {
     public void logWarning(final String msg, Exception ex) {
         logger.log(Level.WARNING, msg, ex); //NON-NLS  }
     }
+
+    /**
+     * Determines how the extraction process will proceed given the settings 
+     * stored in this context instance.
+     * 
+     * As of now, there are no configurable settings for the HtmlTextExtractor.
+     * See the extractionconfigs package for available file configurations.
+     * 
+     * @param context Instance containing config classes
+     */
+    @Override
+    public void setExtractionSettings(ExtractionContext context) {
+    }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
index 908935d88f..ec7cce0a85 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
@@ -40,28 +40,11 @@ import org.sleuthkit.datamodel.AbstractFile;
  *  2) Tables that contain spaces in their name are not extracted
  *  3) Table names are not included in its output text
  */
-public final class SqliteTextExtractor extends ContentTextExtractor {
+final class SqliteTextExtractor extends ContentTextExtractor {
 
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
     private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
 
-    /**
-     * Accepts a context instance for run-time configuration.
-     * 
-     * As of now, this constructor is a no-op as it does not support 
-     * any type of configuration.
-     * 
-     * @param context Instance that contains config classes
-     */
-    public SqliteTextExtractor(ExtractionContext context) {
-    }
-    
-    /**
-     * Creates a default SqliteTextExtractor instance.
-     */
-    public SqliteTextExtractor() {
-    }
-
     /**
      * This extractor only works for sqlite files, so it is indeed content type
      * specific. 
@@ -131,6 +114,19 @@ public final class SqliteTextExtractor extends ContentTextExtractor {
         return new SQLiteStreamReader((AbstractFile) source);
     }
 
+    /**
+     * Determines how the extraction process will proceed given the settings 
+     * stored in this context instance.
+     * 
+     * As of now, there are no configurable settings for the SqliteTextExtractor.
+     * See the extractionconfigs package for available file configurations.
+     * 
+     * @param context Instance containing config classes
+     */
+    @Override
+    public void setExtractionSettings(ExtractionContext context) {
+    }
+
     /**
      * Produces a continuous stream of characters from a database file. To
      * achieve this, all table names are queues up and a SQLiteTableReader is
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
index 829f8d18c2..2df9ce4c31 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
@@ -29,7 +29,7 @@ import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
-import org.sleuthkit.autopsy.textextractors.extractionconfigs.StringsExtractionConfig;
+import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskCoreException;
 import org.sleuthkit.datamodel.TskException;
@@ -37,8 +37,8 @@ import org.sleuthkit.datamodel.TskException;
 /**
  * Extracts raw strings from content.
  */
-public final class StringsTextExtractor extends ContentTextExtractor {
-      
+final class StringsTextExtractor extends ContentTextExtractor {
+
     static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
     private boolean extractUTF8;
     private boolean extractUTF16;
@@ -46,10 +46,10 @@ public final class StringsTextExtractor extends ContentTextExtractor {
 
     /**
      * Determines if this extractor may only read particular types of content.
-     * 
+     *
      * Since Strings may be run on any content type, it is not content specific.
-     * 
-     * @return false 
+     *
+     * @return false
      */
     @Override
     public boolean isContentTypeSpecific() {
@@ -57,13 +57,14 @@ public final class StringsTextExtractor extends ContentTextExtractor {
     }
 
     /**
-     * Determines if this extractor can read the content type. 
-     * 
-     * Note: Strings can be run on any type of content, so all types
-     * will return true.
-     * 
-     * @param file Content source to read
+     * Determines if this extractor can read the content type.
+     *
+     * Note: Strings can be run on any type of content, so all types will return
+     * true.
+     *
+     * @param file           Content source to read
      * @param detectedFormat Mimetype of source file.
+     *
      * @return true
      */
     @Override
@@ -72,34 +73,11 @@ public final class StringsTextExtractor extends ContentTextExtractor {
     }
 
     private final List<SCRIPT> extractScripts = new ArrayList<>();
-    
+
     /**
-     * Accepts a context instance for run-time configuration.
-     * 
-     * See StringsExtractionConfig.java for available extraction settings. 
-     * 
-     * @param context Instance that contains config classes.
-     */
-    public StringsTextExtractor(ExtractionContext context) {
-        this();
-        if(context != null && context.contains(StringsExtractionConfig.class)) {
-            StringsExtractionConfig configInstance = context.get(StringsExtractionConfig.class);
-            if(Objects.nonNull(configInstance.getExtractUTF8())) {
-                extractUTF8 = configInstance.getExtractUTF8();
-            }
-            if(Objects.nonNull(configInstance.getExtractUTF16())) {
-                extractUTF16 = configInstance.getExtractUTF16();
-            }
-            if(Objects.nonNull(configInstance.getExtractScripts())) {
-                setScripts(configInstance.getExtractScripts());
-            }
-        }
-    }
-    
-    /**
-     * Creates a default StringsTextExtractor instance. The instance will be 
-     * configured to run only LATIN_2 as its default extraction script and 
-     * UTF-8 as its default encoding.
+     * Creates a default StringsTextExtractor instance. The instance will be
+     * configured to run only LATIN_2 as its default extraction script and UTF-8
+     * as its default encoding.
      */
     public StringsTextExtractor() {
         //LATIN_2 is the default script
@@ -113,10 +91,10 @@ public final class StringsTextExtractor extends ContentTextExtractor {
      * @param extractScripts scripts to use
      */
     public final void setScripts(List<SCRIPT> extractScripts) {
-        if(extractScripts == null) {
+        if (extractScripts == null) {
             return;
         }
-        
+
         this.extractScripts.clear();
         this.extractScripts.addAll(extractScripts);
     }
@@ -137,10 +115,10 @@ public final class StringsTextExtractor extends ContentTextExtractor {
 
     /**
      * Determines if this extractor should be run or not.
-     * 
-     * Atleast one of the extraction encodings in StringsExtractionConfig must 
+     *
+     * Atleast one of the extraction encodings in DefaultExtractionConfig must
      * be set for this extractor to run.
-     * 
+     *
      * @return Flag indicating if this extractor should be run.
      */
     @Override
@@ -150,10 +128,13 @@ public final class StringsTextExtractor extends ContentTextExtractor {
 
     /**
      * Returns a reader that will iterate over the text of the content source.
-     * 
+     *
      * @param content Content source of any type
+     *
      * @return A reader instance that content text can be obtained from
-     * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException 
+     *
+     * @throws
+     * org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
      */
     @Override
     public InputStreamReader getReader(Content content) throws TextExtractorException {
@@ -169,10 +150,35 @@ public final class StringsTextExtractor extends ContentTextExtractor {
             return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16);
         }
     }
-    
+
     /**
-     * Content input string stream reader/converter - given Content,
-     * extract strings from it and return encoded bytes via read()
+     * Determines how the extraction process will proceed given the settings 
+     * stored in this context instance.
+     *
+     * See the DefaultExtractionConfig class in the extractionconfigs package
+     * for available settings.
+     *
+     * @param context Instance containing config classes
+     */
+    @Override
+    public void setExtractionSettings(ExtractionContext context) {
+        if (context != null && context.contains(DefaultExtractionConfig.class)) {
+            DefaultExtractionConfig configInstance = context.get(DefaultExtractionConfig.class);
+            if (Objects.nonNull(configInstance.getExtractUTF8())) {
+                extractUTF8 = configInstance.getExtractUTF8();
+            }
+            if (Objects.nonNull(configInstance.getExtractUTF16())) {
+                extractUTF16 = configInstance.getExtractUTF16();
+            }
+            if (Objects.nonNull(configInstance.getExtractScripts())) {
+                setScripts(configInstance.getExtractScripts());
+            }
+        }
+    }
+
+    /**
+     * Content input string stream reader/converter - given Content, extract
+     * strings from it and return encoded bytes via read()
      *
      * Note: the utility supports extraction of only LATIN script and UTF8,
      * UTF16LE, UTF16BE encodings and uses a brute force encoding detection -
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
index ae3a7c8f6a..b92c25118f 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
@@ -35,7 +35,7 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
      *
      * @return True if this extractor will/should not perform any extraction.
      */
-    abstract boolean isDisabled();
+    boolean isDisabled();
 
     /**
      * Log the given message and exception as a warning.
@@ -43,7 +43,7 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
      * @param msg Log message
      * @param ex  Exception associated with the incoming message
      */
-    abstract void logWarning(String msg, Exception ex);
+    void logWarning(String msg, Exception ex);
 
     /**
      * Get a reader that will iterate over the text extracted from the given
@@ -55,7 +55,7 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
      *
      * @throws TextExtractorException
      */
-    abstract Reader getReader(T source) throws TextExtractorException;
+    Reader getReader(T source) throws TextExtractorException;
 
     /**
      * Get the 'object' id of the given source.
@@ -64,7 +64,7 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
      *
      * @return Object id of the source content
      */
-    abstract long getID(T source);
+    long getID(T source);
 
     /**
      * Get a human readable name for the given source.
@@ -73,7 +73,18 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
      *
      * @return Name of the content source
      */
-    abstract String getName(T source);
+    String getName(T source);
+    
+       
+    /**
+     * Determines how the extraction process will proceed given the settings 
+     * stored in this context instance.
+     * 
+     * See the extractionconfigs package for available file configurations.
+     * 
+     * @param context Instance containing file config classes
+     */
+    void setExtractionSettings(ExtractionContext context);
 
     /**
      * System exception for dealing with errors encountered during extraction.
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index 48c9f3f9bb..e899b75112 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -19,16 +19,16 @@
 package org.sleuthkit.autopsy.textextractors;
 
 import com.google.common.collect.ImmutableList;
-import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
 import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.BlackboardArtifact;
+import org.sleuthkit.datamodel.Content;
 
 /**
  * Factory for creating text extractors given a source file and a mimetype.
  *
- * See ContentTextExtractor.java for the generic structure of such extractors.
+ * See TextExtractor.java for the generic structure of such extractors.
  */
 public class TextExtractorFactory {
 
@@ -52,38 +52,51 @@ public class TextExtractorFactory {
      * keep the extractors at default settings. Refer to the extractionconfigs
      * package for available file configurations.
      *
-     * @param file     AbstractFile that will be read from
-     * @param mimeType Mimetype of source file
-     * @param context  Contains extraction configurations for certain file types
+     * @param file    Content source that will be read from
+     * @param context Contains extraction configurations for certain file types
      *
      * @return A ContentTextExtractor instance that is properly configured and
      *         can be read from the getReader() method.
      *
-     * @throws NoSpecializedExtractorException In the event that the inputted
-     *                                         file and mimetype have no
-     *                                         corresponding extractor
+     * @throws NoContentSpecificExtractorException In the event that the
+     *                                             inputted file and mimetype
+     *                                             have no corresponding
+     *                                             extractor
      */
-    public static ContentTextExtractor getSpecializedExtractor(AbstractFile file,
-            String mimeType, ExtractionContext context) throws NoSpecializedExtractorException {
-        for (Class<?> candidate : extractors) {
-            try {
-                Constructor<?> constructor = candidate.getDeclaredConstructor(ExtractionContext.class);
-                constructor.setAccessible(true);
-                ContentTextExtractor newInstance = (ContentTextExtractor) constructor.newInstance(context);
-                if (newInstance.isSupported(file, mimeType)) {
-                    return newInstance;
+    public static TextExtractor getContentSpecificExtractor(Content file,
+            ExtractionContext context) throws NoContentSpecificExtractorException {
+        if (file instanceof AbstractFile) {
+            String mimeType = ((AbstractFile) file).getMIMEType();
+            for (Class<?> candidate : extractors) {
+                try {
+                    ContentTextExtractor newInstance = (ContentTextExtractor) candidate.newInstance();
+                    newInstance.setExtractionSettings(context);
+                    if (newInstance.isSupported(file, mimeType)) {
+                        return newInstance;
+                    }
+                } catch (SecurityException | InstantiationException | IllegalAccessException
+                        | IllegalArgumentException ex) {
+                    logger.log(Level.SEVERE, String.format("Could not instantiate ContentTextExtractor "
+                            + "instance for file %s, objId=%d and mimeType=%s", file.getName(),
+                            file.getId(), mimeType), ex);
                 }
-            } catch (NoSuchMethodException | SecurityException
-                    | InstantiationException | IllegalAccessException
-                    | IllegalArgumentException | InvocationTargetException ex) {
-                logger.log(Level.SEVERE, String.format("Could not instantiate ContentTextExtractor "
-                        + "instance for file %s, objId=%d and mimeType=%s", file.getName(),
-                        file.getId(), mimeType), ex);
             }
+        } else if (!(file instanceof BlackboardArtifact)) {
+            TikaTextExtractor tikaExtractor = new TikaTextExtractor();
+            tikaExtractor.setExtractionSettings(context);
+            return tikaExtractor;
         }
-
-        throw new NoSpecializedExtractorException("Could not find a suitable extractor for "
-                + "mimetype [" + mimeType + "]");
+        /*
+         * TODO JIRA-4468 - There should be an additional check for
+         * BlackboardArtifact instances. We should be returning the
+         * ArtifactTextExtractor rather than throwing an exception.
+         */
+        throw new NoContentSpecificExtractorException(
+                String.format("Could not find a suitable extractor for "
+                        + "file with name [%s] and id=[%d]. Use the default, "
+                        + "non content specific extractor as an alternative.",
+                        file.getName(), file.getId())
+        );
     }
 
     /**
@@ -95,17 +108,19 @@ public class TextExtractorFactory {
      *
      * @return A StringsTextExtractor instance
      */
-    public static StringsTextExtractor getDefaultExtractor(ExtractionContext context) {
-        return new StringsTextExtractor(context);
+    public static TextExtractor getDefaultExtractor(ExtractionContext context) {
+        TextExtractor stringsInstance = new StringsTextExtractor();
+        stringsInstance.setExtractionSettings(context);
+        return stringsInstance;
     }
 
     /**
-     * System level exception for handling content types that have no specific 
+     * System level exception for handling content types that have no specific
      * strategy defined for extracting their text.
      */
-    public static class NoSpecializedExtractorException extends Exception {
+    public static class NoContentSpecificExtractorException extends Exception {
 
-        public NoSpecializedExtractorException(String msg) {
+        public NoContentSpecificExtractorException(String msg) {
             super(msg);
         }
     }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index 6a8b267799..f8053f324b 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -32,7 +32,6 @@ import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.logging.Level;
-import java.util.logging.Logger;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.tika.Tika;
@@ -55,7 +54,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * Extracts text from Tika supported content. Protects against Tika
  * parser hangs (for unexpected/corrupt content) using a timeout mechanism.
  */
-public final class TikaTextExtractor extends ContentTextExtractor {
+final class TikaTextExtractor extends ContentTextExtractor {
     
     private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
     
@@ -75,31 +74,6 @@ public final class TikaTextExtractor extends ContentTextExtractor {
                     .map(mt -> mt.getType() + "/" + mt.getSubtype())
                     .collect(Collectors.toList());
 
-    /**
-     * Accepts a context instance for run-time configuration.
-     * 
-     * The only configuration that is available to date is the 
-     * ImageFileExtractionConfig.java. You may refer to this class for
-     * supported settings.
-     * 
-     * @param context Instance that contains config classes
-     */
-    public TikaTextExtractor(ExtractionContext context) {
-        if(context != null && context.contains(ImageFileExtractionConfig.class)) {
-            ImageFileExtractionConfig configInstance = context.get(ImageFileExtractionConfig.class);
-            if(Objects.nonNull(configInstance.getOCREnabled())) {
-                this.tesseractOCREnabled = configInstance.getOCREnabled();
-            }
-        }
-    }
-    
-    /**
-     * Creates a default TikaTextExtractor. OCR is turned off by default due to speed
-     * concerns.
-     */
-    public TikaTextExtractor() {    
-    }
-
     @Override
     public void logWarning(final String msg, Exception ex) {
         tikaLogger.log(Level.WARNING, msg, ex);
@@ -294,6 +268,25 @@ public final class TikaTextExtractor extends ContentTextExtractor {
 
     }
 
+    /**
+     * Determines how the extraction process will proceed given the settings 
+     * stored in this context instance.
+     *
+     * See the ImageFileExtractionConfig class in the extractionconfigs package
+     * for available settings.
+     *
+     * @param context Instance containing config classes
+     */
+    @Override
+    public void setExtractionSettings(ExtractionContext context) {
+        if(context != null && context.contains(ImageFileExtractionConfig.class)) {
+            ImageFileExtractionConfig configInstance = context.get(ImageFileExtractionConfig.class);
+            if(Objects.nonNull(configInstance.getOCREnabled())) {
+                this.tesseractOCREnabled = configInstance.getOCREnabled();
+            }
+        }    
+    }
+
     /**
      * An implementation of CharSource that just wraps an existing reader and
      * returns it in openStream().
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/DefaultExtractionConfig.java
similarity index 98%
rename from Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/DefaultExtractionConfig.java
index 962e972a7f..c852d7baa1 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/DefaultExtractionConfig.java
@@ -25,7 +25,7 @@ import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.S
  * Allows for configuration of the StringsTextExtractor (the default extractor
  * for all content types).
  */
-public class StringsExtractionConfig {
+public class DefaultExtractionConfig {
     private Boolean extractUTF8;
     private Boolean extractUTF16;
     private List<SCRIPT> extractScripts;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
index cb01961c6a..c7d0de1f32 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
@@ -28,6 +28,7 @@ import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.datamodel.ContentUtils;
+import org.sleuthkit.autopsy.textextractors.ExtractionContext;
 import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
@@ -148,4 +149,8 @@ class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
     public String getName(BlackboardArtifact source) {
         return source.getDisplayName() + "_" + source.getArtifactID();
     }
+
+    @Override
+    public void setExtractionSettings(ExtractionContext context) {
+    }
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
index 1ef66e2a3a..e11cdd1565 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
@@ -18,7 +18,6 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
-import org.sleuthkit.autopsy.textextractors.StringsTextExtractor;
 import java.awt.EventQueue;
 import java.awt.GridLayout;
 import java.awt.event.ActionEvent;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index 8e663878fa..7a5e946c34 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -19,13 +19,11 @@
 package org.sleuthkit.autopsy.keywordsearch;
 
 import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
-import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.logging.Level;
-import org.openide.util.Exceptions;
 import org.openide.util.NbBundle;
 import org.openide.util.NbBundle.Messages;
 import org.sleuthkit.autopsy.casemodule.Case;
@@ -43,11 +41,12 @@ import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
 import org.sleuthkit.autopsy.textextractors.ExtractionContext;
-import org.sleuthkit.autopsy.textextractors.StringsTextExtractor;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
-import org.sleuthkit.autopsy.textextractors.extractionconfigs.StringsExtractionConfig;
+import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
 import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskData;
 import org.sleuthkit.datamodel.TskData.FileKnown;
 
@@ -105,7 +104,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     //accessed read-only by searcher thread
 
     private boolean startedSearching = false;
-    private StringsTextExtractor stringExtractor;
+    private TextExtractor<AbstractFile> stringExtractor;
     private final KeywordSearchJobSettings settings;
     private boolean initialized = false;
     private long jobId;
@@ -253,13 +252,13 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
 
         ExtractionContext extractionContext = new ExtractionContext();
         
-        StringsExtractionConfig stringsConfig = new StringsExtractionConfig();
+        DefaultExtractionConfig stringsConfig = new DefaultExtractionConfig();
         Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
         stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
         stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
         stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts());
         
-        extractionContext.set(StringsExtractionConfig.class, stringsConfig);
+        extractionContext.set(DefaultExtractionConfig.class, stringsConfig);
         
         stringExtractor = TextExtractorFactory.getDefaultExtractor(extractionContext);
         indexer = new Indexer();
@@ -441,7 +440,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
          * @throws IngesterException exception thrown if indexing failed
          */
         private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-            ContentTextExtractor extractor = null;
+            TextExtractor<AbstractFile> extractor = null;
             ExtractionContext extractionContext = new ExtractionContext();
             
             ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig();
@@ -449,10 +448,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
             
             try {
-                extractor = TextExtractorFactory.getSpecializedExtractor(aFile, detectedFormat, extractionContext);
+                extractor = TextExtractorFactory.getContentSpecificExtractor(aFile, extractionContext);
                 //divide into chunks and index
                 return Ingester.getDefault().indexText(extractor, aFile, context);
-            } catch (TextExtractorFactory.NoSpecializedExtractorException ex) {
+            } catch (TextExtractorFactory.NoContentSpecificExtractorException ex) {
                 //No text extractor found... run the default instead
                 return false;
             }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
index 8cf37859cd..69c463b546 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
@@ -33,6 +33,7 @@ import org.apache.commons.lang.math.NumberUtils;
 import org.apache.commons.io.FileUtils;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.HttpSolrServer;
+import org.openide.util.Exceptions;
 import org.openide.util.NbBundle;
 import org.openide.util.lookup.ServiceProvider;
 import org.openide.util.lookup.ServiceProviders;
@@ -45,8 +46,8 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
 import org.sleuthkit.autopsy.progress.ProgressIndicator;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
-import org.sleuthkit.autopsy.textextractors.StringsTextExtractor;
-import org.sleuthkit.autopsy.textextractors.TikaTextExtractor;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
+import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskCoreException;
@@ -121,11 +122,12 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
             }
         } else {
             try {
-                ingester.indexText(new TikaTextExtractor(), content, null);
-            } catch (Ingester.IngesterException ex) {
+                TextExtractor contentSpecificExtractor = TextExtractorFactory.getContentSpecificExtractor(content, null);
+                ingester.indexText(contentSpecificExtractor, content, null);
+            } catch (TextExtractorFactory.NoContentSpecificExtractorException | Ingester.IngesterException ex) {
                 try {
                     // Try the StringsTextExtractor if Tika extractions fails.
-                    ingester.indexText(new StringsTextExtractor(), content, null);
+                    ingester.indexText(TextExtractorFactory.getDefaultExtractor(null), content, null);
                 } catch (Ingester.IngesterException ex1) {
                     throw new TskCoreException(ex.getCause().getMessage(), ex1);
                 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
index e8ddf1eea9..e36926f82f 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
@@ -26,6 +26,7 @@ import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
+import org.sleuthkit.autopsy.textextractors.ExtractionContext;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
@@ -79,4 +80,8 @@ final class TextFileExtractor extends ContentTextExtractor {
     public void logWarning(String msg, Exception ex) {
         logger.log(Level.WARNING, msg, ex);
     }
+
+    @Override
+    public void setExtractionSettings(ExtractionContext context) {
+    }
 }

From 02bef176850ec5a8adde253f65708af57f695ec2 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Wed, 5 Dec 2018 09:48:36 -0500
Subject: [PATCH 07/18] Fixed warnings on build

---
 .../autopsy/textextractors/TextExtractorFactory.java        | 6 +++---
 .../autopsy/keywordsearch/KeywordSearchIngestModule.java    | 4 ++--
 .../sleuthkit/autopsy/keywordsearch/SolrSearchService.java  | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index e899b75112..ceda7e6cee 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -63,7 +63,7 @@ public class TextExtractorFactory {
      *                                             have no corresponding
      *                                             extractor
      */
-    public static TextExtractor getContentSpecificExtractor(Content file,
+    public static TextExtractor<Content> getContentSpecificExtractor(Content file,
             ExtractionContext context) throws NoContentSpecificExtractorException {
         if (file instanceof AbstractFile) {
             String mimeType = ((AbstractFile) file).getMIMEType();
@@ -108,8 +108,8 @@ public class TextExtractorFactory {
      *
      * @return A StringsTextExtractor instance
      */
-    public static TextExtractor getDefaultExtractor(ExtractionContext context) {
-        TextExtractor stringsInstance = new StringsTextExtractor();
+    public static TextExtractor<Content> getDefaultExtractor(ExtractionContext context) {
+        TextExtractor<Content> stringsInstance = new StringsTextExtractor();
         stringsInstance.setExtractionSettings(context);
         return stringsInstance;
     }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index 7a5e946c34..a2f0b108eb 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -104,7 +104,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     //accessed read-only by searcher thread
 
     private boolean startedSearching = false;
-    private TextExtractor<AbstractFile> stringExtractor;
+    private TextExtractor<Content> stringExtractor;
     private final KeywordSearchJobSettings settings;
     private boolean initialized = false;
     private long jobId;
@@ -440,7 +440,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
          * @throws IngesterException exception thrown if indexing failed
          */
         private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-            TextExtractor<AbstractFile> extractor = null;
+            TextExtractor<Content> extractor = null;
             ExtractionContext extractionContext = new ExtractionContext();
             
             ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig();
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
index 69c463b546..d8566808e6 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
@@ -122,7 +122,7 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
             }
         } else {
             try {
-                TextExtractor contentSpecificExtractor = TextExtractorFactory.getContentSpecificExtractor(content, null);
+                TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory.getContentSpecificExtractor(content, null);
                 ingester.indexText(contentSpecificExtractor, content, null);
             } catch (TextExtractorFactory.NoContentSpecificExtractorException | Ingester.IngesterException ex) {
                 try {

From 38ab533dc1cc534050f46813bbf2cec66af376d6 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Wed, 5 Dec 2018 12:28:31 -0500
Subject: [PATCH 08/18] Refactored ArtifactTextExtractor out of KWS

---
 .../ArtifactTextExtractor.java                | 50 +------------------
 .../textextractors/ContentTextExtractor.java  |  2 +-
 .../autopsy/textextractors/TextExtractor.java |  2 +-
 .../textextractors/TextExtractorFactory.java  | 19 +++----
 .../autopsy/keywordsearch/Ingester.java       |  6 +--
 .../keywordsearch/SolrSearchService.java      | 19 ++++---
 6 files changed, 29 insertions(+), 69 deletions(-)
 rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/ArtifactTextExtractor.java (71%)

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
similarity index 71%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
index c7d0de1f32..3b424cc21e 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
@@ -16,7 +16,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.sleuthkit.autopsy.keywordsearch;
+package org.sleuthkit.autopsy.textextractors;
 
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -24,17 +24,11 @@ import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 import java.util.logging.Level;
 import org.apache.commons.io.IOUtils;
-import org.sleuthkit.autopsy.casemodule.Case;
-import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.datamodel.ContentUtils;
-import org.sleuthkit.autopsy.textextractors.ExtractionContext;
-import org.sleuthkit.autopsy.textextractors.TextExtractor;
-import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.BlackboardAttribute;
 import org.sleuthkit.datamodel.Content;
-import org.sleuthkit.datamodel.SleuthkitCase;
 import org.sleuthkit.datamodel.TskCoreException;
 
 /**
@@ -45,46 +39,6 @@ class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
 
     static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
 
-    /**
-     * Get the Content that is the data source for the given artifact. //JMTODO:
-     * is there a prexisting method to do this?
-     *
-     * @param artifact
-     *
-     * @return The data source for the given artifact as a Content object, or
-     *         null if it could not be found.
-     *
-     * @throws TskCoreException if there is a problem accessing the case db.
-     */
-    static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
-
-        Case currentCase;
-        try {
-            currentCase = Case.getCurrentCaseThrows();
-        } catch (NoCurrentCaseException ignore) {
-            // thorown by Case.getCurrentOpenCase() if currentCase is null
-            return null;
-        }
-
-        SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
-        if (sleuthkitCase == null) {
-            return null;
-
-        }
-        Content dataSource;
-        AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
-        if (abstractFile != null) {
-            dataSource = abstractFile.getDataSource();
-        } else {
-            dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
-        }
-
-        if (dataSource == null) {
-            return null;
-        }
-        return dataSource;
-    }
-
     @Override
     public boolean isDisabled() {
         return false;
@@ -102,7 +56,7 @@ class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
 
         Content dataSource = null;
         try {
-            dataSource = getDataSource(artifact);
+            dataSource = artifact.getDataSource();
         } catch (TskCoreException tskCoreException) {
             throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
         }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
index 49d727e636..c97f93c06e 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
@@ -114,7 +114,7 @@ public abstract class ContentTextExtractor implements TextExtractor<Content> {
     @Override
     public long getID(Content source) {
         return source.getId();
-    }
+}
 
     /**
      * Returns the human-readable name of the given content source.
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
index b92c25118f..34ec0e147b 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
@@ -56,7 +56,7 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
      * @throws TextExtractorException
      */
     Reader getReader(T source) throws TextExtractorException;
-
+       
     /**
      * Get the 'object' id of the given source.
      *
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index ceda7e6cee..ba172ce02d 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -24,11 +24,12 @@ import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
+import org.sleuthkit.datamodel.Report;
 
 /**
  * Factory for creating text extractors given a source file and a mimetype.
  *
- * See TextExtractor.java for the generic structure of such extractors.
+ * See TextExtractor interface for the generic structure of such extractors.
  */
 public class TextExtractorFactory {
 
@@ -81,19 +82,19 @@ public class TextExtractorFactory {
                             file.getId(), mimeType), ex);
                 }
             }
-        } else if (!(file instanceof BlackboardArtifact)) {
-            TikaTextExtractor tikaExtractor = new TikaTextExtractor();
+        } else if (file instanceof BlackboardArtifact) {
+            TextExtractor artifactExtractor = new ArtifactTextExtractor();
+            artifactExtractor.setExtractionSettings(context);
+            return artifactExtractor;
+        } else if (file instanceof Report) {
+            TextExtractor tikaExtractor = new TikaTextExtractor();
             tikaExtractor.setExtractionSettings(context);
             return tikaExtractor;
         }
-        /*
-         * TODO JIRA-4468 - There should be an additional check for
-         * BlackboardArtifact instances. We should be returning the
-         * ArtifactTextExtractor rather than throwing an exception.
-         */
+
         throw new NoContentSpecificExtractorException(
                 String.format("Could not find a suitable extractor for "
-                        + "file with name [%s] and id=[%d]. Use the default, "
+                        + "file with name [%s] and id=[%d]. Try using the default, "
                         + "non content specific extractor as an alternative.",
                         file.getName(), file.getId())
         );
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index 459bf3710a..2a669772d9 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -106,8 +106,8 @@ class Ingester {
      * @throws IngesterException if there was an error processing a specific
      *                           artifact, but the Solr server is probably fine.
      */
-    void indexMetaDataOnly(BlackboardArtifact artifact) throws IngesterException {
-        indexChunk("", new ArtifactTextExtractor().getName(artifact), getContentFields(artifact));
+    void indexMetaDataOnly(BlackboardArtifact artifact, TextExtractor<Content> extractor) throws IngesterException {
+        indexChunk("", extractor.getName(artifact), getContentFields(artifact));
     }
 
     /**
@@ -371,7 +371,7 @@ class Ingester {
             Map<String, String> params = new HashMap<>();
             params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
             try {
-                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(ArtifactTextExtractor.getDataSource(artifact).getId()));
+                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(artifact.getDataSource().getId()));
             } catch (TskCoreException ex) {
                 logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
                 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
index d8566808e6..5661c6b4e4 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
@@ -115,14 +115,17 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
                 return;
             }
             try {
-                ingester.indexMetaDataOnly(artifact);
-                ingester.indexText(new ArtifactTextExtractor(), artifact, null);
-            } catch (Ingester.IngesterException ex) {
+                TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory
+                        .getContentSpecificExtractor(content, null);
+                ingester.indexMetaDataOnly(artifact, contentSpecificExtractor);
+                ingester.indexText(contentSpecificExtractor, artifact, null);
+            } catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) {
                 throw new TskCoreException(ex.getCause().getMessage(), ex);
             }
         } else {
             try {
-                TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory.getContentSpecificExtractor(content, null);
+                TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory
+                        .getContentSpecificExtractor(content, null);
                 ingester.indexText(contentSpecificExtractor, content, null);
             } catch (TextExtractorFactory.NoContentSpecificExtractorException | Ingester.IngesterException ex) {
                 try {
@@ -441,9 +444,11 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
         final Ingester ingester = Ingester.getDefault();
 
         try {
-            ingester.indexMetaDataOnly(artifact);
-            ingester.indexText(new ArtifactTextExtractor(), artifact, null);
-        } catch (Ingester.IngesterException ex) {
+            TextExtractor<Content> contentSpecificExtractor = 
+                    TextExtractorFactory.getContentSpecificExtractor((Content) artifact, null);
+            ingester.indexMetaDataOnly(artifact, contentSpecificExtractor);
+            ingester.indexText(contentSpecificExtractor, artifact, null);
+        } catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) {
             throw new TskCoreException(ex.getCause().getMessage(), ex);
         }
     }

From eaaf3428a70d4a000b145262cc61fc7354484804 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Thu, 6 Dec 2018 08:37:28 -0500
Subject: [PATCH 09/18] Made progress on fixing the warnings

---
 Core/ivy.xml                                          |  4 ++++
 .../autopsy/textextractors/TextExtractorFactory.java  | 11 ++++++-----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/Core/ivy.xml b/Core/ivy.xml
index dc1896882f..d57cc0cf5c 100644
--- a/Core/ivy.xml
+++ b/Core/ivy.xml
@@ -35,5 +35,9 @@
         <dependency conf="core->default" org="commons-validator" name="commons-validator" rev="1.6"/>
         <dependency conf="core->default" org="net.htmlparser.jericho" name="jericho-html" rev="3.3"/>
         
+        <!-- Tika 1.14 seems to declare a (transitive?) dependency on cleartk-util 3.2.2, but the most recent
+        version available is 2.0.0  Overriding the version worked-->
+        <override  org="org.cleartk" module="cleartk-util" rev="2.0.0"/>
+        
     </dependencies>
 </ivy-module>
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index ba172ce02d..112fc7a44b 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -53,6 +53,7 @@ public class TextExtractorFactory {
      * keep the extractors at default settings. Refer to the extractionconfigs
      * package for available file configurations.
      *
+     * @param <T>
      * @param file    Content source that will be read from
      * @param context Contains extraction configurations for certain file types
      *
@@ -64,7 +65,7 @@ public class TextExtractorFactory {
      *                                             have no corresponding
      *                                             extractor
      */
-    public static TextExtractor<Content> getContentSpecificExtractor(Content file,
+    public static <T extends Content> TextExtractor<T> getContentSpecificExtractor(T file,
             ExtractionContext context) throws NoContentSpecificExtractorException {
         if (file instanceof AbstractFile) {
             String mimeType = ((AbstractFile) file).getMIMEType();
@@ -73,7 +74,7 @@ public class TextExtractorFactory {
                     ContentTextExtractor newInstance = (ContentTextExtractor) candidate.newInstance();
                     newInstance.setExtractionSettings(context);
                     if (newInstance.isSupported(file, mimeType)) {
-                        return newInstance;
+                        return (TextExtractor<T>) newInstance;
                     }
                 } catch (SecurityException | InstantiationException | IllegalAccessException
                         | IllegalArgumentException ex) {
@@ -83,11 +84,11 @@ public class TextExtractorFactory {
                 }
             }
         } else if (file instanceof BlackboardArtifact) {
-            TextExtractor artifactExtractor = new ArtifactTextExtractor();
+            TextExtractor<BlackboardArtifact> artifactExtractor = new ArtifactTextExtractor();
             artifactExtractor.setExtractionSettings(context);
-            return artifactExtractor;
+            return (TextExtractor<T>) artifactExtractor;
         } else if (file instanceof Report) {
-            TextExtractor tikaExtractor = new TikaTextExtractor();
+            TextExtractor<T> tikaExtractor = (TextExtractor<T>) new TikaTextExtractor();
             tikaExtractor.setExtractionSettings(context);
             return tikaExtractor;
         }

From ce548fb978f246e0e2917d101d39861600884ba5 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Fri, 7 Dec 2018 10:01:59 -0500
Subject: [PATCH 10/18] Fixed warnings completely

---
 .../textextractors/ArtifactTextExtractor.java | 32 +++++--
 .../textextractors/ContentTextExtractor.java  | 13 +--
 .../textextractors/HtmlTextExtractor.java     |  3 +-
 .../textextractors/SqliteTextExtractor.java   | 39 +++++----
 .../textextractors/StringsTextExtractor.java  |  2 +-
 .../textextractors/TextExtractorFactory.java  | 83 +++++++++----------
 .../textextractors/TikaTextExtractor.java     |  2 +-
 .../KeywordSearchIngestModule.java            |  2 +-
 .../keywordsearch/TextFileExtractor.java      |  8 +-
 9 files changed, 97 insertions(+), 87 deletions(-)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
index 3b424cc21e..00101b01ec 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
@@ -35,7 +35,7 @@ import org.sleuthkit.datamodel.TskCoreException;
  * Extracts text from artifacts by concatenating the values of all of the
  * artifact's attributes.
  */
-class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
+class ArtifactTextExtractor<T extends Content> extends ContentTextExtractor<T> {
 
     static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
 
@@ -49,14 +49,16 @@ class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
         logger.log(Level.WARNING, msg, ex); //NON-NLS  }
     }
 
-    private InputStream getInputStream(BlackboardArtifact artifact) throws TextExtractorException {
+    private InputStream getInputStream(Content artifact) throws TextExtractorException {
+                        BlackboardArtifact art = (BlackboardArtifact)artifact;
+
         // Concatenate the string values of all attributes into a single
         // "content" string to be indexed.
         StringBuilder artifactContents = new StringBuilder();
 
         Content dataSource = null;
         try {
-            dataSource = artifact.getDataSource();
+            dataSource = art.getDataSource();
         } catch (TskCoreException tskCoreException) {
             throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
         }
@@ -65,7 +67,7 @@ class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
         }
 
         try {
-            for (BlackboardAttribute attribute : artifact.getAttributes()) {
+            for (BlackboardAttribute attribute : art.getAttributes()) {
                 artifactContents.append(attribute.getAttributeType().getDisplayName());
                 artifactContents.append(" : ");
                 // We have also discussed modifying BlackboardAttribute.getDisplayString()
@@ -90,21 +92,33 @@ class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
     }
 
     @Override
-    public Reader getReader(BlackboardArtifact source) throws TextExtractorException {
+    public Reader getReader(Content source) throws TextExtractorException {
         return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
     }
 
     @Override
-    public long getID(BlackboardArtifact source) {
-        return source.getArtifactID();
+    public long getID(Content source) {
+        BlackboardArtifact art = (BlackboardArtifact)source;
+        return art.getArtifactID();
     }
 
     @Override
-    public String getName(BlackboardArtifact source) {
-        return source.getDisplayName() + "_" + source.getArtifactID();
+    public String getName(Content source) {
+                BlackboardArtifact art = (BlackboardArtifact)source;
+        return art.getDisplayName() + "_" + art.getArtifactID();
     }
 
     @Override
     public void setExtractionSettings(ExtractionContext context) {
     }
+
+    @Override
+    public boolean isContentTypeSpecific() {
+        return true;
+    }
+
+    @Override
+    public boolean isSupported(Content file, String detectedFormat) {
+        return true;
+    }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
index c97f93c06e..52713facc8 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
@@ -26,8 +26,9 @@ import org.sleuthkit.datamodel.Content;
 /**
  * Common methods for utilities that extract text and content and divide into
  * chunks
+ * @param <T>
  */
-public abstract class ContentTextExtractor implements TextExtractor<Content> {
+public abstract class ContentTextExtractor<T extends Content> implements TextExtractor<T> {
     
     //Mimetype groups to aassist extractor implementations in ignoring binary and 
     //archive files.
@@ -93,7 +94,7 @@ public abstract class ContentTextExtractor implements TextExtractor<Content> {
      *
      * @return true if the file content is supported, false otherwise
      */
-    public abstract boolean isSupported(Content file, String detectedFormat);
+    public abstract boolean isSupported(T file, String detectedFormat);
 
     /**
      * Returns a reader that will iterate over the text of the source content.
@@ -103,7 +104,7 @@ public abstract class ContentTextExtractor implements TextExtractor<Content> {
      * @throws TextExtractorException Error encountered during extraction
      */
     @Override
-    public abstract Reader getReader(Content source) throws TextExtractorException;
+    public abstract Reader getReader(T source) throws TextExtractorException;
 
     /**
      * Get the object id of the content source.
@@ -112,9 +113,9 @@ public abstract class ContentTextExtractor implements TextExtractor<Content> {
      * @return object id associated with this source content
      */
     @Override
-    public long getID(Content source) {
+    public long getID(T source) {
         return source.getId();
-}
+    }
 
     /**
      * Returns the human-readable name of the given content source.
@@ -123,7 +124,7 @@ public abstract class ContentTextExtractor implements TextExtractor<Content> {
      * @return name of source content
      */
     @Override
-    public String getName(Content source) {
+    public String getName(T source) {
         return source.getName();
     }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
index 3aa61700f5..3fb1ba2d1d 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
@@ -32,13 +32,14 @@ import net.htmlparser.jericho.Source;
 import net.htmlparser.jericho.StartTag;
 import net.htmlparser.jericho.StartTagType;
 import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
  * Extracts text from HTML content.
  */
-final class HtmlTextExtractor extends ContentTextExtractor {
+final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T> {
 
     static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
     private final int MAX_SIZE;
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
index ec7cce0a85..a8f2ccaec0 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
@@ -18,7 +18,6 @@
  */
 package org.sleuthkit.autopsy.textextractors;
 
-import com.google.common.io.CharSource;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.Iterator;
@@ -28,8 +27,8 @@ import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
-import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.Content;
 
 /**
  * Extracts text from SQLite database files.
@@ -40,10 +39,22 @@ import org.sleuthkit.datamodel.AbstractFile;
  *  2) Tables that contain spaces in their name are not extracted
  *  3) Table names are not included in its output text
  */
-final class SqliteTextExtractor extends ContentTextExtractor {
+final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<T> {
 
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
     private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
+    private static boolean isDisabled;
+    
+    static {
+        try {
+            Class.forName("org.sqlite.JDBC");
+            isDisabled = false;
+        } catch (ClassNotFoundException ex) {
+            logger.log(Level.SEVERE, "Sqlite JDBC class could not be found, "
+                + "SqliteTextExtractor is automatically disabling.", ex); //NON-NLS
+            isDisabled = true;
+        }
+    }
 
     /**
      * This extractor only works for sqlite files, so it is indeed content type
@@ -63,14 +74,7 @@ final class SqliteTextExtractor extends ContentTextExtractor {
      */
     @Override
     public boolean isDisabled() {
-        try {
-            Class.forName("org.sqlite.JDBC");
-            return false;
-        } catch (ClassNotFoundException ex) {
-            logger.log(Level.SEVERE, "Sqlite JDBC class could not be found, "
-                    + "SqliteTextExtractor is automatically disabling.", ex); //NON-NLS
-            return true;
-        }
+        return isDisabled;
     }
 
     @Override
@@ -102,16 +106,11 @@ final class SqliteTextExtractor extends ContentTextExtractor {
      */
     @Override
     public Reader getReader(Content source) throws TextExtractorException {
-        //Firewall for any content that is not an AbstractFile
-        if (!AbstractFile.class.isInstance(source)) {
-            try {
-                return CharSource.wrap("").openStream();
-            } catch (IOException ex) {
-                throw new TextExtractorException("Could not open CharSource stream", ex);
-            }
+        if(source instanceof AbstractFile) {
+            return new SQLiteStreamReader((AbstractFile)source);
         }
-
-        return new SQLiteStreamReader((AbstractFile) source);
+        throw new TextExtractorException(String.format("Source content with name [%s] and id=[%d] was not of type"
+                + " AbstractFile.", source.getName(), source.getId()));
     }
 
     /**
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
index 2df9ce4c31..ba7b913178 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
@@ -37,7 +37,7 @@ import org.sleuthkit.datamodel.TskException;
 /**
  * Extracts raw strings from content.
  */
-final class StringsTextExtractor extends ContentTextExtractor {
+final class StringsTextExtractor<T extends Content> extends ContentTextExtractor<T> {
 
     static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
     private boolean extractUTF8;
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index 112fc7a44b..c55721b634 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -18,42 +18,30 @@
  */
 package org.sleuthkit.autopsy.textextractors;
 
-import com.google.common.collect.ImmutableList;
-import java.util.logging.Level;
-import org.sleuthkit.autopsy.coreutils.Logger;
+import java.util.Arrays;
+import java.util.List;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.Report;
 
 /**
- * Factory for creating text extractors given a source file and a mimetype.
+ * Factory for creating text extractors given a source file
  *
- * See TextExtractor interface for the generic structure of such extractors.
+ * See ContentTextExtractor interface for the generic structure of such
+ * extractors.
  */
 public class TextExtractorFactory {
 
-    private static final Logger logger = Logger.getLogger(TextExtractorFactory.class.getName());
-
     /**
-     * The order of these extractors is important. It is a must that more
-     * specialized solutions are placed before the TikaTextExtractor to ensure
-     * these solutions are chosen over Tika.
-     */
-    private static final ImmutableList<Class<?>> extractors
-            = ImmutableList.of(HtmlTextExtractor.class,
-                    SqliteTextExtractor.class,
-                    TikaTextExtractor.class);
-
-    /**
-     * Auto detects the correct text extractor given the file and mimetype.
+     * Auto detects the correct text extractor given the file.
      *
-     * TextExtractors can be configured using the ExtractionContext object.
-     * Passing in null or a new unmodified instance of ExtractionContext will
-     * keep the extractors at default settings. Refer to the extractionconfigs
-     * package for available file configurations.
+     * ContentTextExtractor can be configured using the ExtractionContext
+     * object. Passing in null or a new unmodified instance of ExtractionContext
+     * will keep the extractors at default settings. Refer to the
+     * extractionconfigs package for available file configurations.
      *
-     * @param <T>
+     * @param <T>     Type of source content
      * @param file    Content source that will be read from
      * @param context Contains extraction configurations for certain file types
      *
@@ -65,32 +53,25 @@ public class TextExtractorFactory {
      *                                             have no corresponding
      *                                             extractor
      */
-    public static <T extends Content> TextExtractor<T> getContentSpecificExtractor(T file,
+    public static <T extends Content> ContentTextExtractor<T> getContentSpecificExtractor(T file,
             ExtractionContext context) throws NoContentSpecificExtractorException {
         if (file instanceof AbstractFile) {
+            List<ContentTextExtractor<T>> fileExtractors = getAbstractFileExtractors();
             String mimeType = ((AbstractFile) file).getMIMEType();
-            for (Class<?> candidate : extractors) {
-                try {
-                    ContentTextExtractor newInstance = (ContentTextExtractor) candidate.newInstance();
-                    newInstance.setExtractionSettings(context);
-                    if (newInstance.isSupported(file, mimeType)) {
-                        return (TextExtractor<T>) newInstance;
-                    }
-                } catch (SecurityException | InstantiationException | IllegalAccessException
-                        | IllegalArgumentException ex) {
-                    logger.log(Level.SEVERE, String.format("Could not instantiate ContentTextExtractor "
-                            + "instance for file %s, objId=%d and mimeType=%s", file.getName(),
-                            file.getId(), mimeType), ex);
+            for (ContentTextExtractor<T> candidate : fileExtractors) {
+                candidate.setExtractionSettings(context);
+                if (candidate.isSupported(file, mimeType)) {
+                    return candidate;
                 }
             }
         } else if (file instanceof BlackboardArtifact) {
-            TextExtractor<BlackboardArtifact> artifactExtractor = new ArtifactTextExtractor();
+            ContentTextExtractor<T> artifactExtractor = new ArtifactTextExtractor<>();
             artifactExtractor.setExtractionSettings(context);
-            return (TextExtractor<T>) artifactExtractor;
+            return artifactExtractor;
         } else if (file instanceof Report) {
-            TextExtractor<T> tikaExtractor = (TextExtractor<T>) new TikaTextExtractor();
-            tikaExtractor.setExtractionSettings(context);
-            return tikaExtractor;
+            ContentTextExtractor<T> reportExtractor = new TikaTextExtractor<>();
+            reportExtractor.setExtractionSettings(context);
+            return reportExtractor;
         }
 
         throw new NoContentSpecificExtractorException(
@@ -101,6 +82,20 @@ public class TextExtractorFactory {
         );
     }
 
+    /**
+     * Instantiates and returns a list of all of the known abstract file
+     * extractors.
+     *
+     * @return A list of specialized ContentTextExtractors
+     */
+    private static <T extends Content> List<ContentTextExtractor<T>> getAbstractFileExtractors() {
+        return Arrays.asList(
+                new HtmlTextExtractor<>(),
+                new SqliteTextExtractor<>(),
+                new TikaTextExtractor<>()
+        );
+    }
+
     /**
      * Returns the default extractor that can be run on any content type. This
      * extractor should be used as a backup in the event that no specialized
@@ -108,10 +103,10 @@ public class TextExtractorFactory {
      *
      * @param context Contains extraction configurations for certain file types
      *
-     * @return A StringsTextExtractor instance
+     * @return A DefaultExtractor instance
      */
-    public static TextExtractor<Content> getDefaultExtractor(ExtractionContext context) {
-        TextExtractor<Content> stringsInstance = new StringsTextExtractor();
+    public static ContentTextExtractor<Content> getDefaultExtractor(ExtractionContext context) {
+        ContentTextExtractor<Content> stringsInstance = new StringsTextExtractor<>();
         stringsInstance.setExtractionSettings(context);
         return stringsInstance;
     }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index f8053f324b..482ed398ab 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -54,7 +54,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * Extracts text from Tika supported content. Protects against Tika
  * parser hangs (for unexpected/corrupt content) using a timeout mechanism.
  */
-final class TikaTextExtractor extends ContentTextExtractor {
+final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T> {
     
     private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
     
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index a2f0b108eb..753e672a87 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -448,7 +448,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
             
             try {
-                extractor = TextExtractorFactory.getContentSpecificExtractor(aFile, extractionContext);
+                extractor = TextExtractorFactory.getContentSpecificExtractor(aFile,extractionContext);
                 //divide into chunks and index
                 return Ingester.getDefault().indexText(extractor, aFile, context);
             } catch (TextExtractorFactory.NoContentSpecificExtractorException ex) {
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
index e36926f82f..5d9eda971d 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
@@ -27,13 +27,13 @@ import org.apache.tika.parser.txt.CharsetMatch;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
 import org.sleuthkit.autopsy.textextractors.ExtractionContext;
-import org.sleuthkit.datamodel.Content;
+import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
  * Extract text from .txt files
  */
-final class TextFileExtractor extends ContentTextExtractor {
+final class TextFileExtractor extends ContentTextExtractor<AbstractFile> {
 
     //Set a Minimum confidence value to reject matches that may not have a valid text encoding
     //Values of valid text encodings were generally 100, xml code sometimes had a value around 50, 
@@ -49,12 +49,12 @@ final class TextFileExtractor extends ContentTextExtractor {
     }
 
     @Override
-    public boolean isSupported(Content file, String detectedFormat) {
+    public boolean isSupported(AbstractFile file, String detectedFormat) {
         return true;
     }
 
     @Override
-    public Reader getReader(Content source) throws TextExtractorException {
+    public Reader getReader(AbstractFile source) throws TextExtractorException {
         CharsetDetector detector = new CharsetDetector();
         //wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
         InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));

From ece50a3a00b18bc4f4311be1fb7eb0b4ad75bad4 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Fri, 7 Dec 2018 13:26:52 -0500
Subject: [PATCH 11/18] Complete overhaul of how extractors are currently
 implemented, hopefully converging to a more sensible solution

---
 .../textextractors/ArtifactTextExtractor.java |  56 +++-----
 .../textextractors/ContentTextExtractor.java  | 130 ------------------
 .../textextractors/HtmlTextExtractor.java     |  38 +----
 .../textextractors/SqliteTextExtractor.java   |  53 +------
 .../textextractors/StringsTextExtractor.java  |  63 +--------
 .../autopsy/textextractors/TextExtractor.java |  69 +++-------
 ...tExtractorFactory.java => TextReader.java} |  83 ++++++-----
 .../textextractors/TikaTextExtractor.java     |  96 +++++++------
 .../autopsy/keywordsearch/Ingester.java       |  29 ++--
 .../KeywordSearchIngestModule.java            |  72 +++++++---
 .../keywordsearch/SolrSearchService.java      |  37 ++---
 .../keywordsearch/TextFileExtractor.java      |  45 ++----
 12 files changed, 241 insertions(+), 530 deletions(-)
 delete mode 100644 Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
 rename Core/src/org/sleuthkit/autopsy/textextractors/{TextExtractorFactory.java => TextReader.java} (61%)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
index 00101b01ec..0cca74aef7 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
@@ -22,7 +22,6 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.StandardCharsets;
-import java.util.logging.Level;
 import org.apache.commons.io.IOUtils;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.datamodel.ContentUtils;
@@ -35,39 +34,27 @@ import org.sleuthkit.datamodel.TskCoreException;
  * Extracts text from artifacts by concatenating the values of all of the
  * artifact's attributes.
  */
-class ArtifactTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+class ArtifactTextExtractor<T extends BlackboardArtifact> implements TextExtractor<T> {
 
     static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
 
-    @Override
-    public boolean isDisabled() {
-        return false;
-    }
-
-    @Override
-    public void logWarning(final String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex); //NON-NLS  }
-    }
-
-    private InputStream getInputStream(Content artifact) throws TextExtractorException {
-                        BlackboardArtifact art = (BlackboardArtifact)artifact;
-
+    private InputStream getInputStream(BlackboardArtifact artifact) throws InitReaderException {
         // Concatenate the string values of all attributes into a single
         // "content" string to be indexed.
         StringBuilder artifactContents = new StringBuilder();
 
         Content dataSource = null;
         try {
-            dataSource = art.getDataSource();
+            dataSource = artifact.getDataSource();
         } catch (TskCoreException tskCoreException) {
-            throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
+            throw new InitReaderException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
         }
         if (dataSource == null) {
-            throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString());
+            throw new InitReaderException("Datasource was null for artifact: " + artifact.toString());
         }
 
         try {
-            for (BlackboardAttribute attribute : art.getAttributes()) {
+            for (BlackboardAttribute attribute : artifact.getAttributes()) {
                 artifactContents.append(attribute.getAttributeType().getDisplayName());
                 artifactContents.append(" : ");
                 // We have also discussed modifying BlackboardAttribute.getDisplayString()
@@ -85,40 +72,31 @@ class ArtifactTextExtractor<T extends Content> extends ContentTextExtractor<T> {
                 artifactContents.append(System.lineSeparator());
             }
         } catch (TskCoreException tskCoreException) {
-            throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
+            throw new InitReaderException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
         }
 
         return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
     }
 
     @Override
-    public Reader getReader(Content source) throws TextExtractorException {
+    public Reader getReader(BlackboardArtifact source) throws InitReaderException {
         return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
     }
 
-    @Override
-    public long getID(Content source) {
-        BlackboardArtifact art = (BlackboardArtifact)source;
-        return art.getArtifactID();
-    }
-
-    @Override
-    public String getName(Content source) {
-                BlackboardArtifact art = (BlackboardArtifact)source;
-        return art.getDisplayName() + "_" + art.getArtifactID();
-    }
-
+    /**
+     * Configures this extractors to the settings stored in relevant config instances.
+     * 
+     * This operation is a no-op since currently there are no configurable settings
+     * of the extraction process.
+     *
+     * @param context Instance containing file config settings
+     */
     @Override
     public void setExtractionSettings(ExtractionContext context) {
     }
 
     @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
-    @Override
-    public boolean isSupported(Content file, String detectedFormat) {
+    public boolean isSupported(BlackboardArtifact file, String detectedFormat) {
         return true;
     }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
deleted file mode 100644
index 52713facc8..0000000000
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2011-2018 Basis Technology Corp.
- * Contact: carrier <at> sleuthkit <dot> org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.sleuthkit.autopsy.textextractors;
-
-import com.google.common.collect.ImmutableList;
-import java.io.Reader;
-import java.util.List;
-import org.sleuthkit.datamodel.Content;
-
-/**
- * Common methods for utilities that extract text and content and divide into
- * chunks
- * @param <T>
- */
-public abstract class ContentTextExtractor<T extends Content> implements TextExtractor<T> {
-    
-    //Mimetype groups to aassist extractor implementations in ignoring binary and 
-    //archive files.
-    public static final List<String> BINARY_MIME_TYPES
-            = ImmutableList.of(
-                    //ignore binary blob data, for which string extraction will be used
-                    "application/octet-stream", //NON-NLS
-                    "application/x-msdownload"); //NON-NLS
-
-    /** generally text extractors should ignore archives and let unpacking
-     * modules take care of them */
-    public static final List<String> ARCHIVE_MIME_TYPES
-            = ImmutableList.of(
-                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
-                    "application/x-7z-compressed", //NON-NLS
-                    "application/x-ace-compressed", //NON-NLS
-                    "application/x-alz-compressed", //NON-NLS
-                    "application/x-arj", //NON-NLS
-                    "application/vnd.ms-cab-compressed", //NON-NLS
-                    "application/x-cfs-compressed", //NON-NLS
-                    "application/x-dgc-compressed", //NON-NLS
-                    "application/x-apple-diskimage", //NON-NLS
-                    "application/x-gca-compressed", //NON-NLS
-                    "application/x-dar", //NON-NLS
-                    "application/x-lzx", //NON-NLS
-                    "application/x-lzh", //NON-NLS
-                    "application/x-rar-compressed", //NON-NLS
-                    "application/x-stuffit", //NON-NLS
-                    "application/x-stuffitx", //NON-NLS
-                    "application/x-gtar", //NON-NLS
-                    "application/x-archive", //NON-NLS
-                    "application/x-executable", //NON-NLS
-                    "application/x-gzip", //NON-NLS
-                    "application/zip", //NON-NLS
-                    "application/x-zoo", //NON-NLS
-                    "application/x-cpio", //NON-NLS
-                    "application/x-shar", //NON-NLS
-                    "application/x-tar", //NON-NLS
-                    "application/x-bzip", //NON-NLS
-                    "application/x-bzip2", //NON-NLS
-                    "application/x-lzip", //NON-NLS
-                    "application/x-lzma", //NON-NLS
-                    "application/x-lzop", //NON-NLS
-                    "application/x-z", //NON-NLS
-                    "application/x-compress"); //NON-NLS
-    
-    /**
-     * Determines if the extractor works only for specified types is
-     * supportedTypes() or whether is a generic content extractor (such as
-     * string extractor)
-     *
-     * @return
-     */
-    public abstract boolean isContentTypeSpecific();
-
-    /**
-     * Determines if the file content is supported by the extractor if
-     * isContentTypeSpecific() returns true.
-     *
-     * @param file           to test if its content should be supported
-     * @param detectedFormat mime-type with detected format (such as text/plain)
-     *                       or null if not detected
-     *
-     * @return true if the file content is supported, false otherwise
-     */
-    public abstract boolean isSupported(T file, String detectedFormat);
-
-    /**
-     * Returns a reader that will iterate over the text of the source content.
-     * 
-     * @param source Content source to read
-     * @return A reader that contains all source text
-     * @throws TextExtractorException Error encountered during extraction
-     */
-    @Override
-    public abstract Reader getReader(T source) throws TextExtractorException;
-
-    /**
-     * Get the object id of the content source.
-     * 
-     * @param source source content
-     * @return object id associated with this source content
-     */
-    @Override
-    public long getID(T source) {
-        return source.getId();
-    }
-
-    /**
-     * Returns the human-readable name of the given content source.
-     * 
-     * @param source source content
-     * @return name of source content
-     */
-    @Override
-    public String getName(T source) {
-        return source.getName();
-    }
-}
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
index 3fb1ba2d1d..92db14a837 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
@@ -23,7 +23,6 @@ import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 import java.util.List;
-import java.util.logging.Level;
 import net.htmlparser.jericho.Attributes;
 import net.htmlparser.jericho.Config;
 import net.htmlparser.jericho.LoggerProvider;
@@ -33,13 +32,12 @@ import net.htmlparser.jericho.StartTag;
 import net.htmlparser.jericho.StartTagType;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
  * Extracts text from HTML content.
  */
-final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+final class HtmlTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
 
     static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
     private final int MAX_SIZE;
@@ -67,19 +65,6 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
         MAX_SIZE = 50_000_000;
     }
 
-    /**
-     * Determines if this extractor is responsible for extracting only a
-     * specific type of media.
-     *
-     * In this case, only HTML documents can be read successfully.
-     *
-     * @return true
-     */
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
     /**
      * Determines if this content type is supported by this extractor.
      *
@@ -89,7 +74,7 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
      * @return flag indicating support
      */
     @Override
-    public boolean isSupported(Content content, String detectedFormat) {
+    public boolean isSupported(AbstractFile content, String detectedFormat) {
         return detectedFormat != null
                 && WEB_MIME_TYPES.contains(detectedFormat)
                 && content.getSize() <= MAX_SIZE;
@@ -105,7 +90,7 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
      * @throws TextExtractorException
      */
     @Override
-    public Reader getReader(Content content) throws TextExtractorException {
+    public Reader getReader(AbstractFile content) throws InitReaderException {
         //TODO JIRA-4467, there is only harm in excluding HTML documents greater
         //than 50MB due to our troubled approach of extraction.
         ReadContentInputStream stream = new ReadContentInputStream(content);
@@ -201,25 +186,10 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
             // All done, now make it a reader
             return new StringReader(stringBuilder.toString());
         } catch (IOException ex) {
-            throw new TextExtractorException("Error extracting HTML from content.", ex);
+            throw new InitReaderException("Error extracting HTML from content.", ex);
         }
     }
 
-    /**
-     * Indicates if this extractor can run.
-     *
-     * @return Flag indicating if this extractor can run.
-     */
-    @Override
-    public boolean isDisabled() {
-        return false;
-    }
-
-    @Override
-    public void logWarning(final String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex); //NON-NLS  }
-    }
-
     /**
      * Determines how the extraction process will proceed given the settings 
      * stored in this context instance.
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
index a8f2ccaec0..09e8aa82eb 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
@@ -28,7 +28,6 @@ import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
 import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
 
 /**
  * Extracts text from SQLite database files.
@@ -39,48 +38,10 @@ import org.sleuthkit.datamodel.Content;
  *  2) Tables that contain spaces in their name are not extracted
  *  3) Table names are not included in its output text
  */
-final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+final class SqliteTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
 
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
     private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
-    private static boolean isDisabled;
-    
-    static {
-        try {
-            Class.forName("org.sqlite.JDBC");
-            isDisabled = false;
-        } catch (ClassNotFoundException ex) {
-            logger.log(Level.SEVERE, "Sqlite JDBC class could not be found, "
-                + "SqliteTextExtractor is automatically disabling.", ex); //NON-NLS
-            isDisabled = true;
-        }
-    }
-
-    /**
-     * This extractor only works for sqlite files, so it is indeed content type
-     * specific. 
-     * 
-     * @return true
-     */
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
-    /**
-     * Determines if this extractor is fit to run.
-     * 
-     * @return Flag indicating if it should or shouldn't be run.
-     */
-    @Override
-    public boolean isDisabled() {
-        return isDisabled;
-    }
-
-    @Override
-    public void logWarning(String msg, Exception exception) {
-        logger.log(Level.WARNING, msg, exception); //NON-NLS
-    }
 
     /**
      * Supports only the sqlite mimetypes
@@ -91,7 +52,7 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
      * @return true if x-sqlite3
      */
     @Override
-    public boolean isSupported(Content file, String detectedFormat) {
+    public boolean isSupported(AbstractFile file, String detectedFormat) {
         return SQLITE_MIMETYPE.equals(detectedFormat);
     }
 
@@ -105,12 +66,8 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
      * @throws TextExtractorException
      */
     @Override
-    public Reader getReader(Content source) throws TextExtractorException {
-        if(source instanceof AbstractFile) {
-            return new SQLiteStreamReader((AbstractFile)source);
-        }
-        throw new TextExtractorException(String.format("Source content with name [%s] and id=[%d] was not of type"
-                + " AbstractFile.", source.getName(), source.getId()));
+    public Reader getReader(AbstractFile source) throws InitReaderException {
+        return new SQLiteStreamReader(source);
     }
 
     /**
@@ -125,7 +82,7 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
     @Override
     public void setExtractionSettings(ExtractionContext context) {
     }
-
+    
     /**
      * Produces a continuous stream of characters from a database file. To
      * achieve this, all table names are queues up and a SQLiteTableReader is
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
index ba7b913178..8fc5b3bdeb 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
@@ -25,7 +25,6 @@ import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
-import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
@@ -37,41 +36,12 @@ import org.sleuthkit.datamodel.TskException;
 /**
  * Extracts raw strings from content.
  */
-final class StringsTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+final class StringsTextExtractor {
 
-    static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
     private boolean extractUTF8;
     private boolean extractUTF16;
     private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
 
-    /**
-     * Determines if this extractor may only read particular types of content.
-     *
-     * Since Strings may be run on any content type, it is not content specific.
-     *
-     * @return false
-     */
-    @Override
-    public boolean isContentTypeSpecific() {
-        return false;
-    }
-
-    /**
-     * Determines if this extractor can read the content type.
-     *
-     * Note: Strings can be run on any type of content, so all types will return
-     * true.
-     *
-     * @param file           Content source to read
-     * @param detectedFormat Mimetype of source file.
-     *
-     * @return true
-     */
-    @Override
-    public boolean isSupported(Content file, String detectedFormat) {
-        return true;
-    }
-
     private final List<SCRIPT> extractScripts = new ArrayList<>();
 
     /**
@@ -99,33 +69,6 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
         this.extractScripts.addAll(extractScripts);
     }
 
-    /**
-     * Get the currently used scripts for extraction
-     *
-     * @return scripts currently used or null if not supported
-     */
-    public List<SCRIPT> getScripts() {
-        return new ArrayList<>(extractScripts);
-    }
-
-    @Override
-    public void logWarning(final String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex); //NON-NLS  }
-    }
-
-    /**
-     * Determines if this extractor should be run or not.
-     *
-     * Atleast one of the extraction encodings in DefaultExtractionConfig must
-     * be set for this extractor to run.
-     *
-     * @return Flag indicating if this extractor should be run.
-     */
-    @Override
-    public boolean isDisabled() {
-        return extractUTF8 == false && extractUTF16 == false;
-    }
-
     /**
      * Returns a reader that will iterate over the text of the content source.
      *
@@ -136,8 +79,7 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
      * @throws
      * org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
      */
-    @Override
-    public InputStreamReader getReader(Content content) throws TextExtractorException {
+    public InputStreamReader getReader(Content content) {
         InputStream stringStream = getInputStream(content);
         return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
     }
@@ -160,7 +102,6 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
      *
      * @param context Instance containing config classes
      */
-    @Override
     public void setExtractionSettings(ExtractionContext context) {
         if (context != null && context.contains(DefaultExtractionConfig.class)) {
             DefaultExtractionConfig configInstance = context.get(DefaultExtractionConfig.class);
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
index 34ec0e147b..743be1ce41 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
@@ -19,7 +19,6 @@
 package org.sleuthkit.autopsy.textextractors;
 
 import java.io.Reader;
-import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 
 /**
  * Extracts text out of a SleuthkitVisitableItem, and exposes it is a Reader.
@@ -28,23 +27,19 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
  * @param <T> The subtype of SleuthkitVisitableItem an implementation is able to
  *            process.
  */
-public interface TextExtractor<T extends SleuthkitVisitableItem> {
-
-    /**
-     * Is this extractor configured such that no extraction will/should be done?
+interface TextExtractor<T> {
+    
+     /**
+     * Determines if the file content is supported by the extractor, based on
+     * the file itself and its detected mime-type.
      *
-     * @return True if this extractor will/should not perform any extraction.
-     */
-    boolean isDisabled();
-
-    /**
-     * Log the given message and exception as a warning.
+     * @param file           to test if its content should be supported
+     * @param detectedFormat mime-type with detected format (such as text/plain)
+     *                       or null if not detected
      *
-     * @param msg Log message
-     * @param ex  Exception associated with the incoming message
+     * @return true if the file content is supported, false otherwise
      */
-    void logWarning(String msg, Exception ex);
-
+    public abstract boolean isSupported(T file, String detectedFormat);
     /**
      * Get a reader that will iterate over the text extracted from the given
      * source.
@@ -53,28 +48,8 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
      *
      * @return Reader instance that contains the text of the source
      *
-     * @throws TextExtractorException
      */
-    Reader getReader(T source) throws TextExtractorException;
-       
-    /**
-     * Get the 'object' id of the given source.
-     *
-     * @param source Source content of type T
-     *
-     * @return Object id of the source content
-     */
-    long getID(T source);
-
-    /**
-     * Get a human readable name for the given source.
-     *
-     * @param source Source content of type T
-     *
-     * @return Name of the content source
-     */
-    String getName(T source);
-    
+    Reader getReader(T source) throws InitReaderException;
        
     /**
      * Determines how the extraction process will proceed given the settings 
@@ -85,18 +60,18 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
      * @param context Instance containing file config classes
      */
     void setExtractionSettings(ExtractionContext context);
-
-    /**
-     * System exception for dealing with errors encountered during extraction.
-     */
-    class TextExtractorException extends Exception {
-
-        public TextExtractorException(String message) {
-            super(message);
+    
+    public class InitReaderException extends Exception {
+        public InitReaderException(String msg, Throwable ex) {
+            super(msg, ex);
         }
-
-        public TextExtractorException(String message, Throwable cause) {
-            super(message, cause);
+        
+        public InitReaderException(Throwable ex) {
+            super(ex);
+        }
+        
+        public InitReaderException(String msg) {
+            super(msg);
         }
     }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextReader.java
similarity index 61%
rename from Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/TextReader.java
index c55721b634..f4413d3f74 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextReader.java
@@ -18,6 +18,7 @@
  */
 package org.sleuthkit.autopsy.textextractors;
 
+import java.io.Reader;
 import java.util.Arrays;
 import java.util.List;
 import org.sleuthkit.datamodel.AbstractFile;
@@ -31,8 +32,13 @@ import org.sleuthkit.datamodel.Report;
  * See ContentTextExtractor interface for the generic structure of such
  * extractors.
  */
-public class TextExtractorFactory {
-
+public class TextReader {
+    
+    private final static List<TextExtractor<AbstractFile>> fileExtractors = Arrays.asList(
+                new HtmlTextExtractor<>(),
+                new SqliteTextExtractor<>(),
+                new TikaTextExtractor<>()
+        );
     /**
      * Auto detects the correct text extractor given the file.
      *
@@ -41,40 +47,42 @@ public class TextExtractorFactory {
      * will keep the extractors at default settings. Refer to the
      * extractionconfigs package for available file configurations.
      *
-     * @param <T>     Type of source content
      * @param file    Content source that will be read from
      * @param context Contains extraction configurations for certain file types
      *
      * @return A ContentTextExtractor instance that is properly configured and
      *         can be read from the getReader() method.
      *
-     * @throws NoContentSpecificExtractorException In the event that the
+     * @throws NoReaderFoundException In the event that the
      *                                             inputted file and mimetype
      *                                             have no corresponding
      *                                             extractor
      */
-    public static <T extends Content> ContentTextExtractor<T> getContentSpecificExtractor(T file,
-            ExtractionContext context) throws NoContentSpecificExtractorException {
-        if (file instanceof AbstractFile) {
-            List<ContentTextExtractor<T>> fileExtractors = getAbstractFileExtractors();
-            String mimeType = ((AbstractFile) file).getMIMEType();
-            for (ContentTextExtractor<T> candidate : fileExtractors) {
-                candidate.setExtractionSettings(context);
-                if (candidate.isSupported(file, mimeType)) {
-                    return candidate;
+    public static Reader getContentSpecificReader(Content file,
+            ExtractionContext context) throws NoReaderFoundException {
+        try {
+            if (file instanceof AbstractFile) {
+                String mimeType = ((AbstractFile) file).getMIMEType();
+                for (TextExtractor<AbstractFile> candidate : fileExtractors) {
+                    candidate.setExtractionSettings(context);
+                    if (candidate.isSupported((AbstractFile)file, mimeType)) {
+                        return candidate.getReader((AbstractFile)file);
+                    }
                 }
+            } else if (file instanceof BlackboardArtifact) {
+                TextExtractor<BlackboardArtifact> artifactExtractor = new ArtifactTextExtractor<>();
+                artifactExtractor.setExtractionSettings(context);
+                return artifactExtractor.getReader((BlackboardArtifact)file);
+            } else if (file instanceof Report) {
+                TextExtractor<Report> reportExtractor = new TikaTextExtractor<>();
+                reportExtractor.setExtractionSettings(context);
+                return reportExtractor.getReader((Report)file);
             }
-        } else if (file instanceof BlackboardArtifact) {
-            ContentTextExtractor<T> artifactExtractor = new ArtifactTextExtractor<>();
-            artifactExtractor.setExtractionSettings(context);
-            return artifactExtractor;
-        } else if (file instanceof Report) {
-            ContentTextExtractor<T> reportExtractor = new TikaTextExtractor<>();
-            reportExtractor.setExtractionSettings(context);
-            return reportExtractor;
+        } catch (TextExtractor.InitReaderException ex) {
+            throw new NoReaderFoundException(ex);
         }
-
-        throw new NoContentSpecificExtractorException(
+        
+        throw new NoReaderFoundException(
                 String.format("Could not find a suitable extractor for "
                         + "file with name [%s] and id=[%d]. Try using the default, "
                         + "non content specific extractor as an alternative.",
@@ -82,43 +90,34 @@ public class TextExtractorFactory {
         );
     }
 
-    /**
-     * Instantiates and returns a list of all of the known abstract file
-     * extractors.
-     *
-     * @return A list of specialized ContentTextExtractors
-     */
-    private static <T extends Content> List<ContentTextExtractor<T>> getAbstractFileExtractors() {
-        return Arrays.asList(
-                new HtmlTextExtractor<>(),
-                new SqliteTextExtractor<>(),
-                new TikaTextExtractor<>()
-        );
-    }
-
     /**
      * Returns the default extractor that can be run on any content type. This
      * extractor should be used as a backup in the event that no specialized
      * extractor can be found.
      *
+     * @param source  Content source that will be read from
      * @param context Contains extraction configurations for certain file types
      *
      * @return A DefaultExtractor instance
      */
-    public static ContentTextExtractor<Content> getDefaultExtractor(ExtractionContext context) {
-        ContentTextExtractor<Content> stringsInstance = new StringsTextExtractor<>();
+    public static Reader getDefaultReader(Content source, ExtractionContext context) {
+        StringsTextExtractor stringsInstance = new StringsTextExtractor();
         stringsInstance.setExtractionSettings(context);
-        return stringsInstance;
+        return stringsInstance.getReader(source);
     }
 
     /**
      * System level exception for handling content types that have no specific
      * strategy defined for extracting their text.
      */
-    public static class NoContentSpecificExtractorException extends Exception {
+    public static class NoReaderFoundException extends Exception {
 
-        public NoContentSpecificExtractorException(String msg) {
+        public NoReaderFoundException(String msg) {
             super(msg);
         }
+        
+        public NoReaderFoundException(Throwable ex) {
+            super(ex);
+        }
     }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index 482ed398ab..200a8618fa 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -18,6 +18,7 @@
  */
 package org.sleuthkit.autopsy.textextractors;
 
+import com.google.common.collect.ImmutableList;
 import com.google.common.io.CharSource;
 import java.io.File;
 import java.io.IOException;
@@ -54,7 +55,53 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * Extracts text from Tika supported content. Protects against Tika
  * parser hangs (for unexpected/corrupt content) using a timeout mechanism.
  */
-final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+final class TikaTextExtractor<T extends Content> implements TextExtractor<T> {
+    
+      //Mimetype groups to assist extractor implementations in ignoring binary and 
+    //archive files.
+    private static final List<String> BINARY_MIME_TYPES
+            = ImmutableList.of(
+                    //ignore binary blob data, for which string extraction will be used
+                    "application/octet-stream", //NON-NLS
+                    "application/x-msdownload"); //NON-NLS
+
+    /** generally text extractors should ignore archives and let unpacking
+     * modules take care of them */
+    private static final List<String> ARCHIVE_MIME_TYPES
+            = ImmutableList.of(
+                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
+                    "application/x-7z-compressed", //NON-NLS
+                    "application/x-ace-compressed", //NON-NLS
+                    "application/x-alz-compressed", //NON-NLS
+                    "application/x-arj", //NON-NLS
+                    "application/vnd.ms-cab-compressed", //NON-NLS
+                    "application/x-cfs-compressed", //NON-NLS
+                    "application/x-dgc-compressed", //NON-NLS
+                    "application/x-apple-diskimage", //NON-NLS
+                    "application/x-gca-compressed", //NON-NLS
+                    "application/x-dar", //NON-NLS
+                    "application/x-lzx", //NON-NLS
+                    "application/x-lzh", //NON-NLS
+                    "application/x-rar-compressed", //NON-NLS
+                    "application/x-stuffit", //NON-NLS
+                    "application/x-stuffitx", //NON-NLS
+                    "application/x-gtar", //NON-NLS
+                    "application/x-archive", //NON-NLS
+                    "application/x-executable", //NON-NLS
+                    "application/x-gzip", //NON-NLS
+                    "application/zip", //NON-NLS
+                    "application/x-zoo", //NON-NLS
+                    "application/x-cpio", //NON-NLS
+                    "application/x-shar", //NON-NLS
+                    "application/x-tar", //NON-NLS
+                    "application/x-bzip", //NON-NLS
+                    "application/x-bzip2", //NON-NLS
+                    "application/x-lzip", //NON-NLS
+                    "application/x-lzma", //NON-NLS
+                    "application/x-lzop", //NON-NLS
+                    "application/x-z", //NON-NLS
+                    "application/x-compress"); //NON-NLS
+
     
     private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
     
@@ -74,11 +121,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
                     .map(mt -> mt.getType() + "/" + mt.getSubtype())
                     .collect(Collectors.toList());
 
-    @Override
-    public void logWarning(final String msg, Exception ex) {
-        tikaLogger.log(Level.WARNING, msg, ex);
-    }
-
     /**
      * Returns a reader that will iterate over the text extracted from Apache 
      * Tika. 
@@ -89,7 +131,7 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
      * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException 
      */
     @Override
-    public Reader getReader(Content content) throws TextExtractorException {
+    public Reader getReader(Content content) throws InitReaderException {
         ReadContentInputStream stream = new ReadContentInputStream(content);
 
         Metadata metadata = new Metadata();
@@ -136,7 +178,7 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
             PushbackReader pushbackReader = new PushbackReader(tikaReader);
             int read = pushbackReader.read();
             if (read == -1) {
-                throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + content);
+                throw new InitReaderException("Unable to extract text: Tika returned empty reader for " + content);
             }
             pushbackReader.unread(read);
 
@@ -145,15 +187,13 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
             return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
         } catch (TimeoutException te) {
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
-            logWarning(msg, te);
-            throw new TextExtractorException(msg, te);
-        } catch (TextExtractorException ex) {
+            throw new InitReaderException(msg, te);
+        } catch (InitReaderException ex) {
             throw ex;
         } catch (Exception ex) {
             tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
-            logWarning(msg, ex);
-            throw new TextExtractorException(msg, ex);
+            throw new InitReaderException(msg, ex);
         } finally {
             future.cancel(true);
         }
@@ -199,19 +239,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
                         ));
     }
 
-    /**
-     * Determines if this extractor only understands a specifc type of content.
-     * 
-     * Although Apache Tika is defined for many input types, it is still a content
-     * specific approach to extraction.
-     * 
-     * @return true
-     */
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
     /**
      * Determines if Tika is supported for this content type and mimetype.
      * 
@@ -222,8 +249,8 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
     @Override
     public boolean isSupported(Content content, String detectedFormat) {
         if (detectedFormat == null
-                || ContentTextExtractor.BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
-                || ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
+                || BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
+                || ARCHIVE_MIME_TYPES.contains(detectedFormat)
                 || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
                 || detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
                 ) {
@@ -232,19 +259,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
         return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
     }
 
-    /**
-     * Determines if this extractor can be run.
-     * 
-     * So long as Tika's dependencies are present, this extractor can run 
-     * no matter the circumstance.
-     * 
-     * @return true 
-     */
-    @Override
-    public boolean isDisabled() {
-        return false;
-    }
-
     /**
      * Return timeout that should be used to index the content.
      *
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index 2a669772d9..052dee796a 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -19,6 +19,7 @@
 package org.sleuthkit.autopsy.keywordsearch;
 
 import java.io.BufferedReader;
+import java.io.Reader;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.logging.Level;
@@ -32,7 +33,6 @@ import org.sleuthkit.autopsy.healthmonitor.HealthMonitor;
 import org.sleuthkit.autopsy.healthmonitor.TimingMetric;
 import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
-import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
@@ -106,8 +106,8 @@ class Ingester {
      * @throws IngesterException if there was an error processing a specific
      *                           artifact, but the Solr server is probably fine.
      */
-    void indexMetaDataOnly(BlackboardArtifact artifact, TextExtractor<Content> extractor) throws IngesterException {
-        indexChunk("", extractor.getName(artifact), getContentFields(artifact));
+    void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
+        indexChunk("", sourceName, getContentFields(artifact));
     }
 
     /**
@@ -142,23 +142,12 @@ class Ingester {
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
     // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients 
-    < T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
-        final long sourceID = extractor.getID(source);
-        final String sourceName = extractor.getName(source);
-
+    < T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
         int numChunks = 0; //unknown until chunking is done
-
-        if (extractor.isDisabled()) {
-            /*
-             * some Extractors, notable the strings extractor, have options
-             * which can be configured such that no extraction should be done
-             */
-            return true;
-        }
-
+        
         Map<String, String> fields = getContentFields(source);
         //Get a reader for the content of the given source
-        try (BufferedReader reader = new BufferedReader(extractor.getReader(source));) {
+        try (BufferedReader reader = new BufferedReader(sourceReader)) {
             Chunker chunker = new Chunker(reader);
             for (Chunk chunk : chunker) {
                 if (context != null && context.fileIngestIsCancelled()) {
@@ -173,18 +162,18 @@ class Ingester {
                     indexChunk(chunk.toString(), sourceName, fields);
                     numChunks++;
                 } catch (Ingester.IngesterException ingEx) {
-                    extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
+                    logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
                             + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
 
                     throw ingEx; //need to rethrow to signal error and move on
                 }
             }
             if (chunker.hasException()) {
-                extractor.logWarning("Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
+                logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
                 return false;
             }
         } catch (Exception ex) {
-            extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
+            logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
             return false;
         } finally {
             if (context != null && context.fileIngestIsCancelled()) {
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index 753e672a87..9af9a04648 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -18,12 +18,14 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
-import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
+import com.google.common.collect.ImmutableList;
+import java.io.Reader;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.logging.Level;
+import org.openide.util.Exceptions;
 import org.openide.util.NbBundle;
 import org.openide.util.NbBundle.Messages;
 import org.sleuthkit.autopsy.casemodule.Case;
@@ -37,16 +39,15 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
 import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
 import org.sleuthkit.autopsy.ingest.IngestServices;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
+import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
 import org.sleuthkit.autopsy.textextractors.ExtractionContext;
-import org.sleuthkit.autopsy.textextractors.TextExtractor;
-import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
+import org.sleuthkit.autopsy.textextractors.TextReader;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
 import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskData;
 import org.sleuthkit.datamodel.TskData.FileKnown;
 
@@ -68,6 +69,43 @@ import org.sleuthkit.datamodel.TskData.FileKnown;
 })
 public final class KeywordSearchIngestModule implements FileIngestModule {
     
+    /** MIME types of archive and compressed formats that are not text-extracted
+     * here; generally unpacking modules should take care of them instead. */
+    public static final List<String> ARCHIVE_MIME_TYPES
+            = ImmutableList.of(
+                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
+                    "application/x-7z-compressed", //NON-NLS
+                    "application/x-ace-compressed", //NON-NLS
+                    "application/x-alz-compressed", //NON-NLS
+                    "application/x-arj", //NON-NLS
+                    "application/vnd.ms-cab-compressed", //NON-NLS
+                    "application/x-cfs-compressed", //NON-NLS
+                    "application/x-dgc-compressed", //NON-NLS
+                    "application/x-apple-diskimage", //NON-NLS
+                    "application/x-gca-compressed", //NON-NLS
+                    "application/x-dar", //NON-NLS
+                    "application/x-lzx", //NON-NLS
+                    "application/x-lzh", //NON-NLS
+                    "application/x-rar-compressed", //NON-NLS
+                    "application/x-stuffit", //NON-NLS
+                    "application/x-stuffitx", //NON-NLS
+                    "application/x-gtar", //NON-NLS
+                    "application/x-archive", //NON-NLS
+                    "application/x-executable", //NON-NLS
+                    "application/x-gzip", //NON-NLS
+                    "application/zip", //NON-NLS
+                    "application/x-zoo", //NON-NLS
+                    "application/x-cpio", //NON-NLS
+                    "application/x-shar", //NON-NLS
+                    "application/x-tar", //NON-NLS
+                    "application/x-bzip", //NON-NLS
+                    "application/x-bzip2", //NON-NLS
+                    "application/x-lzip", //NON-NLS
+                    "application/x-lzma", //NON-NLS
+                    "application/x-lzop", //NON-NLS
+                    "application/x-z", //NON-NLS
+                    "application/x-compress"); //NON-NLS
+    
     /**
      * Options for this extractor
      */
@@ -104,7 +142,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     //accessed read-only by searcher thread
 
     private boolean startedSearching = false;
-    private TextExtractor<Content> stringExtractor;
+    private ExtractionContext stringsExtractionContext;
     private final KeywordSearchJobSettings settings;
     private boolean initialized = false;
     private long jobId;
@@ -250,7 +288,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             }
         }
 
-        ExtractionContext extractionContext = new ExtractionContext();
+        stringsExtractionContext = new ExtractionContext();
         
         DefaultExtractionConfig stringsConfig = new DefaultExtractionConfig();
         Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
@@ -258,9 +296,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
         stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts());
         
-        extractionContext.set(DefaultExtractionConfig.class, stringsConfig);
+        stringsExtractionContext.set(DefaultExtractionConfig.class, stringsConfig);
         
-        stringExtractor = TextExtractorFactory.getDefaultExtractor(extractionContext);
         indexer = new Indexer();
         initialized = true;
     }
@@ -352,7 +389,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * Common cleanup code when module stops or final searcher completes
      */
     private void cleanup() {
-        stringExtractor = null;
+        stringsExtractionContext = null;
         initialized = false;
     }
 
@@ -440,7 +477,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
          * @throws IngesterException exception thrown if indexing failed
          */
         private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-            TextExtractor<Content> extractor = null;
             ExtractionContext extractionContext = new ExtractionContext();
             
             ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig();
@@ -448,10 +484,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
             
             try {
-                extractor = TextExtractorFactory.getContentSpecificExtractor(aFile,extractionContext);
+                Reader specializedReader = TextReader.getContentSpecificReader(aFile,extractionContext);
                 //divide into chunks and index
-                return Ingester.getDefault().indexText(extractor, aFile, context);
-            } catch (TextExtractorFactory.NoContentSpecificExtractorException ex) {
+                return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
+            } catch (TextReader.NoReaderFoundException ex) {
                 //No text extractor found... run the default instead
                 return false;
             }
@@ -470,7 +506,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                 if (context.fileIngestIsCancelled()) {
                     return true;
                 }
-                if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) {
+                Reader stringsReader = TextReader.getDefaultReader(aFile, stringsExtractionContext);
+                if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
                     putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
                     return true;
                 } else {
@@ -530,7 +567,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
 
             // we skip archive formats that are opened by the archive module. 
             // @@@ We could have a check here to see if the archive module was enabled though...
-            if (ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
+            if (ARCHIVE_MIME_TYPES.contains(fileType)) {
                 try {
                     if (context.fileIngestIsCancelled()) {
                         return;
@@ -579,11 +616,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                 //should be ignored by the TextFileExtractor because they may contain more than one text encoding
                 try {
                     TextFileExtractor textFileExtractor = new TextFileExtractor();
-                    if (Ingester.getDefault().indexText(textFileExtractor, aFile, context)) {
+                    Reader textReader = textFileExtractor.getReader(aFile);
+                    if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
                         putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                         wasTextAdded = true;
                     }
-                } catch (IngesterException ex) {
+                } catch (IngesterException | TextFileExtractorException ex) {
                     logger.log(Level.WARNING, "Unable to index as unicode", ex);
                 }
             }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
index 5661c6b4e4..dc78e751b8 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
@@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.keywordsearch;
 
 import java.io.File;
 import java.io.IOException;
+import java.io.Reader;
 import java.lang.reflect.InvocationTargetException;
 import java.net.InetAddress;
 import java.util.ArrayList;
@@ -33,7 +34,6 @@ import org.apache.commons.lang.math.NumberUtils;
 import org.apache.commons.io.FileUtils;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.HttpSolrServer;
-import org.openide.util.Exceptions;
 import org.openide.util.NbBundle;
 import org.openide.util.lookup.ServiceProvider;
 import org.openide.util.lookup.ServiceProviders;
@@ -46,8 +46,7 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
 import org.sleuthkit.autopsy.progress.ProgressIndicator;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
-import org.sleuthkit.autopsy.textextractors.TextExtractor;
-import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
+import org.sleuthkit.autopsy.textextractors.TextReader;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskCoreException;
@@ -115,22 +114,23 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
                 return;
             }
             try {
-                TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory
-                        .getContentSpecificExtractor(content, null);
-                ingester.indexMetaDataOnly(artifact, contentSpecificExtractor);
-                ingester.indexText(contentSpecificExtractor, artifact, null);
-            } catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) {
+                Reader blackboardReader = TextReader
+                        .getContentSpecificReader(content, null);
+                String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
+                ingester.indexMetaDataOnly(artifact, sourceName);
+                ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
+            } catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
                 throw new TskCoreException(ex.getCause().getMessage(), ex);
             }
         } else {
             try {
-                TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory
-                        .getContentSpecificExtractor(content, null);
-                ingester.indexText(contentSpecificExtractor, content, null);
-            } catch (TextExtractorFactory.NoContentSpecificExtractorException | Ingester.IngesterException ex) {
+                Reader contentReader = TextReader
+                        .getContentSpecificReader(content, null);
+                ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
+            } catch (TextReader.NoReaderFoundException | Ingester.IngesterException ex) {
                 try {
                     // Try the StringsTextExtractor if Tika extractions fails.
-                    ingester.indexText(TextExtractorFactory.getDefaultExtractor(null), content, null);
+                    ingester.indexText(TextReader.getDefaultReader(content, null),content.getId(),content.getName(), content, null);
                 } catch (Ingester.IngesterException ex1) {
                     throw new TskCoreException(ex.getCause().getMessage(), ex1);
                 }
@@ -444,11 +444,12 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
         final Ingester ingester = Ingester.getDefault();
 
         try {
-            TextExtractor<Content> contentSpecificExtractor = 
-                    TextExtractorFactory.getContentSpecificExtractor((Content) artifact, null);
-            ingester.indexMetaDataOnly(artifact, contentSpecificExtractor);
-            ingester.indexText(contentSpecificExtractor, artifact, null);
-        } catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) {
+            String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
+            Reader contentSpecificReader = 
+                    TextReader.getContentSpecificReader((Content) artifact, null);
+            ingester.indexMetaDataOnly(artifact, sourceName);
+            ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
+        } catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
             throw new TskCoreException(ex.getCause().getMessage(), ex);
         }
     }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
index 5d9eda971d..117c9ad6e9 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
@@ -21,19 +21,15 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.BufferedInputStream;
 import java.io.Reader;
-import java.util.logging.Level;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
-import org.sleuthkit.autopsy.coreutils.Logger;
-import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
-import org.sleuthkit.autopsy.textextractors.ExtractionContext;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
  * Extract text from .txt files
  */
-final class TextFileExtractor extends ContentTextExtractor<AbstractFile> {
+final class TextFileExtractor {
 
     //Set a Minimum confidence value to reject matches that may not have a valid text encoding
     //Values of valid text encodings were generally 100, xml code sometimes had a value around 50, 
@@ -41,47 +37,30 @@ final class TextFileExtractor extends ContentTextExtractor<AbstractFile> {
     //This limited information was used to select the current value as one that would filter out clearly non-text 
     //files while hopefully working on all files with a valid text encoding
     static final private int MIN_MATCH_CONFIDENCE = 20;
-    static final private Logger logger = Logger.getLogger(TextFileExtractor.class.getName());
 
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
-    @Override
-    public boolean isSupported(AbstractFile file, String detectedFormat) {
-        return true;
-    }
-
-    @Override
-    public Reader getReader(AbstractFile source) throws TextExtractorException {
+    public Reader getReader(AbstractFile source) throws TextFileExtractorException {
         CharsetDetector detector = new CharsetDetector();
         //wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
         InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
         try {
             detector.setText(stream);
         } catch (IOException ex) {
-            throw new TextExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
+            throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
         }
         CharsetMatch match = detector.detect();
         if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
-            throw new TextExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
+            throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
         }
 
         return match.getReader();
     }
-
-    @Override
-    public boolean isDisabled() {
-        return false;
-    }
-
-    @Override
-    public void logWarning(String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex);
-    }
-
-    @Override
-    public void setExtractionSettings(ExtractionContext context) {
+    
+    public static class TextFileExtractorException extends Exception { // static: holds no enclosing-instance reference
+        public TextFileExtractorException(String msg, Throwable ex) { // failure that wraps an underlying cause
+            super(msg, ex);
+        }
+        public TextFileExtractorException(String msg) { // failure with no underlying cause
+            super(msg);
+        }
     }
 }

From 76e5fc9f9ee6f00cbb42842b502dac7685caf4a5 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Fri, 7 Dec 2018 15:03:10 -0500
Subject: [PATCH 12/18] Internal changes to structure of extractors in Core,
 these classes are not public API

---
 .../textextractors/ArtifactTextExtractor.java | 27 +++--------------
 .../textextractors/HtmlTextExtractor.java     | 15 ++--------
 .../textextractors/SqliteTextExtractor.java   | 13 ---------
 .../textextractors/StringsTextExtractor.java  | 29 +++++++++++++++----
 .../autopsy/textextractors/TextExtractor.java | 26 ++++++++++++-----
 .../autopsy/textextractors/TextReader.java    |  9 +++---
 6 files changed, 52 insertions(+), 67 deletions(-)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
index 0cca74aef7..070fa84231 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
@@ -18,12 +18,10 @@
  */
 package org.sleuthkit.autopsy.textextractors;
 
-import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 import org.apache.commons.io.IOUtils;
-import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.datamodel.ContentUtils;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.BlackboardAttribute;
@@ -36,9 +34,8 @@ import org.sleuthkit.datamodel.TskCoreException;
  */
 class ArtifactTextExtractor<T extends BlackboardArtifact> implements TextExtractor<T> {
 
-    static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
-
-    private InputStream getInputStream(BlackboardArtifact artifact) throws InitReaderException {
+    @Override
+    public Reader getReader(BlackboardArtifact artifact) throws InitReaderException {
         // Concatenate the string values of all attributes into a single
         // "content" string to be indexed.
         StringBuilder artifactContents = new StringBuilder();
@@ -75,24 +72,8 @@ class ArtifactTextExtractor<T extends BlackboardArtifact> implements TextExtract
             throw new InitReaderException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
         }
 
-        return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
-    }
-
-    @Override
-    public Reader getReader(BlackboardArtifact source) throws InitReaderException {
-        return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
-    }
-
-    /**
-     * Configures this extractors to the settings stored in relevant config instances.
-     * 
-     * This operation is a no-op since currently there are no configurable settings
-     * of the extraction process.
-     *
-     * @param context Instance containing file config settings
-     */
-    @Override
-    public void setExtractionSettings(ExtractionContext context) {
+        return new InputStreamReader(IOUtils.toInputStream(artifactContents,
+                StandardCharsets.UTF_8), StandardCharsets.UTF_8);
     }
 
     @Override
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
index 92db14a837..fd511e2779 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
@@ -23,6 +23,7 @@ import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 import java.util.List;
+import java.util.logging.Level;
 import net.htmlparser.jericho.Attributes;
 import net.htmlparser.jericho.Config;
 import net.htmlparser.jericho.LoggerProvider;
@@ -186,20 +187,8 @@ final class HtmlTextExtractor<T extends AbstractFile> implements TextExtractor<T
             // All done, now make it a reader
             return new StringReader(stringBuilder.toString());
         } catch (IOException ex) {
+            logger.log(Level.WARNING, "Error extracting HTML from content.", ex);
             throw new InitReaderException("Error extracting HTML from content.", ex);
         }
     }
-
-    /**
-     * Determines how the extraction process will proceed given the settings 
-     * stored in this context instance.
-     * 
-     * As of now, there are no configurable settings for the HtmlTextExtractor.
-     * See the extractionconfigs package for available file configurations.
-     * 
-     * @param context Instance containing config classes
-     */
-    @Override
-    public void setExtractionSettings(ExtractionContext context) {
-    }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
index 09e8aa82eb..19bcd7ac87 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
@@ -69,19 +69,6 @@ final class SqliteTextExtractor<T extends AbstractFile> implements TextExtractor
     public Reader getReader(AbstractFile source) throws InitReaderException {
         return new SQLiteStreamReader(source);
     }
-
-    /**
-     * Determines how the extraction process will proceed given the settings 
-     * stored in this context instance.
-     * 
-     * As of now, there are no configurable settings for the SqliteTextExtractor.
-     * See the extractionconfigs package for available file configurations.
-     * 
-     * @param context Instance containing config classes
-     */
-    @Override
-    public void setExtractionSettings(ExtractionContext context) {
-    }
     
     /**
      * Produces a continuous stream of characters from a database file. To
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
index 8fc5b3bdeb..e4a605420b 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
@@ -25,7 +25,6 @@ import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
-import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
@@ -36,7 +35,7 @@ import org.sleuthkit.datamodel.TskException;
 /**
  * Extracts raw strings from content.
  */
-final class StringsTextExtractor {
+final class StringsTextExtractor<T extends Content> implements TextExtractor<T> {
 
     private boolean extractUTF8;
     private boolean extractUTF16;
@@ -79,6 +78,7 @@ final class StringsTextExtractor {
      * @throws
      * org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
      */
+    @Override
     public InputStreamReader getReader(Content content) {
         InputStream stringStream = getInputStream(content);
         return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
@@ -94,7 +94,7 @@ final class StringsTextExtractor {
     }
 
     /**
-     * Determines how the extraction process will proceed given the settings 
+     * Determines how the extraction process will proceed given the settings
      * stored in this context instance.
      *
      * See the DefaultExtractionConfig class in the extractionconfigs package
@@ -102,6 +102,7 @@ final class StringsTextExtractor {
      *
      * @param context Instance containing config classes
      */
+    @Override
     public void setExtractionSettings(ExtractionContext context) {
         if (context != null && context.contains(DefaultExtractionConfig.class)) {
             DefaultExtractionConfig configInstance = context.get(DefaultExtractionConfig.class);
@@ -117,6 +118,26 @@ final class StringsTextExtractor {
         }
     }
 
+    /**
+     * Reports whether this extractor supports the given content (always true).
+     * @param file           content to be tested; not inspected here
+     * @param detectedFormat MIME type detected for the content; not inspected here
+     * @return true for every input
+     */
+    @Override
+    public boolean isSupported(T file, String detectedFormat) {
+        return true;
+    }
+
+    /**
+     * Reports whether this extractor has any string encoding turned on.
+     * @return true if extractUTF8 or extractUTF16 is set, false otherwise
+     */
+    @Override
+    public boolean isEnabled() {
+        return extractUTF8 || extractUTF16;
+    }
+
     /**
      * Content input string stream reader/converter - given Content, extract
      * strings from it and return encoded bytes via read()
@@ -131,7 +152,6 @@ final class StringsTextExtractor {
      */
     private static class EnglishOnlyStream extends InputStream {
 
-        private static final Logger logger = Logger.getLogger(EnglishOnlyStream.class.getName());
         private static final String NLS = Character.toString((char) 10); //new line
         private static final int READ_BUF_SIZE = 65536;
         private static final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
@@ -346,7 +366,6 @@ final class StringsTextExtractor {
      */
     private static class InternationalStream extends InputStream {
 
-        private static final Logger logger = Logger.getLogger(InternationalStream.class.getName());
         private static final int FILE_BUF_SIZE = 1024 * 1024;
         private final Content content;
         private final byte[] oneCharBuf = new byte[1];
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
index 743be1ce41..7cc05d7432 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
@@ -21,17 +21,14 @@ package org.sleuthkit.autopsy.textextractors;
 import java.io.Reader;
 
 /**
- * Extracts text out of a SleuthkitVisitableItem, and exposes it is a Reader.
- * This Reader is given to the Ingester to chunk and index in Solr.
+ * Extracts text out of Objects and exposes it as a Reader.
  *
- * @param <T> The subtype of SleuthkitVisitableItem an implementation is able to
- *            process.
+ * @param <T> Type of the source object that text is extracted from
  */
 interface TextExtractor<T> {
     
      /**
-     * Determines if the file content is supported by the extractor if
-     * isContentTypeSpecific() returns true.
+     * Determines if the file content is supported by the extractor.
      *
      * @param file           to test if its content should be supported
      * @param detectedFormat mime-type with detected format (such as text/plain)
@@ -39,7 +36,15 @@ interface TextExtractor<T> {
      *
      * @return true if the file content is supported, false otherwise
      */
-    public abstract boolean isSupported(T file, String detectedFormat);
+    boolean isSupported(T file, String detectedFormat);
+    
+    /**
+     * Reports whether this extractor is enabled; this default is always true.
+     * @return true unless an implementation overrides this method
+     */
+    default boolean isEnabled() {
+        return true;
+    }
     /**
      * Get a reader that will iterate over the text extracted from the given
      * source.
@@ -59,8 +64,13 @@ interface TextExtractor<T> {
      * 
      * @param context Instance containing file config classes
      */
-    void setExtractionSettings(ExtractionContext context);
+    default void setExtractionSettings(ExtractionContext context) {
+        //no-op by default
+    }
     
+    /**
+     * Thrown when an extractor cannot create a Reader for the given source.
+     */
     public class InitReaderException extends Exception {
         public InitReaderException(String msg, Throwable ex) {
             super(msg, ex);
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextReader.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextReader.java
index f4413d3f74..9820c53a43 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextReader.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextReader.java
@@ -50,8 +50,7 @@ public class TextReader {
      * @param file    Content source that will be read from
      * @param context Contains extraction configurations for certain file types
      *
-     * @return A ContentTextExtractor instance that is properly configured and
-     *         can be read from the getReader() method.
+     * @return A Reader that contains file source text
      *
      * @throws NoReaderFoundException In the event that the
      *                                             inputted file and mimetype
@@ -65,7 +64,7 @@ public class TextReader {
                 String mimeType = ((AbstractFile) file).getMIMEType();
                 for (TextExtractor<AbstractFile> candidate : fileExtractors) {
                     candidate.setExtractionSettings(context);
-                    if (candidate.isSupported((AbstractFile)file, mimeType)) {
+                    if (candidate.isEnabled() && candidate.isSupported((AbstractFile)file, mimeType)) {
                         return candidate.getReader((AbstractFile)file);
                     }
                 }
@@ -76,7 +75,7 @@ public class TextReader {
             } else if (file instanceof Report) {
                 TextExtractor<Report> reportExtractor = new TikaTextExtractor<>();
                 reportExtractor.setExtractionSettings(context);
-                reportExtractor.getReader((Report)file);
+                return reportExtractor.getReader((Report)file);
             }
         } catch (TextExtractor.InitReaderException ex) {
             throw new NoReaderFoundException(ex);
@@ -95,7 +94,7 @@ public class TextReader {
      * extractor should be used as a backup in the event that no specialized
      * extractor can be found.
      *
-     * @param source
+     * @param source Content source to read from
      * @param context Contains extraction configurations for certain file types
      *
      * @return A DefaultExtractor instance

From 79bb71359d09819ea822db03e0fae9c2a525bcb7 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Fri, 7 Dec 2018 15:04:57 -0500
Subject: [PATCH 13/18] Changed factory name

---
 ...TextReader.java => TextExtractorFactory.java} |  2 +-
 .../keywordsearch/KeywordSearchIngestModule.java |  8 ++++----
 .../autopsy/keywordsearch/SolrSearchService.java | 16 ++++++++--------
 3 files changed, 13 insertions(+), 13 deletions(-)
 rename Core/src/org/sleuthkit/autopsy/textextractors/{TextReader.java => TextExtractorFactory.java} (99%)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextReader.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
similarity index 99%
rename from Core/src/org/sleuthkit/autopsy/textextractors/TextReader.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index 9820c53a43..ff90453f02 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextReader.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -32,7 +32,7 @@ import org.sleuthkit.datamodel.Report;
  * See ContentTextExtractor interface for the generic structure of such
  * extractors.
  */
-public class TextReader {
+public class TextExtractorFactory {
     
     private final static List<TextExtractor<AbstractFile>> fileExtractors = Arrays.asList(
                 new HtmlTextExtractor<>(),
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index 9af9a04648..14ab05dff9 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -44,7 +44,7 @@ import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
 import org.sleuthkit.autopsy.textextractors.ExtractionContext;
-import org.sleuthkit.autopsy.textextractors.TextReader;
+import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
 import org.sleuthkit.datamodel.AbstractFile;
@@ -484,10 +484,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
             
             try {
-                Reader specializedReader = TextReader.getContentSpecificReader(aFile,extractionContext);
+                Reader specializedReader = TextExtractorFactory.getContentSpecificReader(aFile,extractionContext);
                 //divide into chunks and index
                 return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
-            } catch (TextReader.NoReaderFoundException ex) {
+            } catch (TextExtractorFactory.NoReaderFoundException ex) {
                 //No text extractor found... run the default instead
                 return false;
             }
@@ -506,7 +506,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                 if (context.fileIngestIsCancelled()) {
                     return true;
                 }
-                Reader stringsReader = TextReader.getDefaultReader(aFile, stringsExtractionContext);
+                Reader stringsReader = TextExtractorFactory.getDefaultReader(aFile, stringsExtractionContext);
                 if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
                     putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
                     return true;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
index dc78e751b8..aaee181204 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
@@ -46,7 +46,7 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
 import org.sleuthkit.autopsy.progress.ProgressIndicator;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
-import org.sleuthkit.autopsy.textextractors.TextReader;
+import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskCoreException;
@@ -114,23 +114,23 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
                 return;
             }
             try {
-                Reader blackboardReader = TextReader
+                Reader blackboardReader = TextExtractorFactory
                         .getContentSpecificReader(content, null);
                 String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
                 ingester.indexMetaDataOnly(artifact, sourceName);
                 ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
-            } catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
+            } catch (Ingester.IngesterException | TextExtractorFactory.NoReaderFoundException ex) {
                 throw new TskCoreException(ex.getCause().getMessage(), ex);
             }
         } else {
             try {
-                Reader contentReader = TextReader
+                Reader contentReader = TextExtractorFactory
                         .getContentSpecificReader(content, null);
                 ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
-            } catch (TextReader.NoReaderFoundException | Ingester.IngesterException ex) {
+            } catch (TextExtractorFactory.NoReaderFoundException | Ingester.IngesterException ex) {
                 try {
                     // Try the StringsTextExtractor if Tika extractions fails.
-                    ingester.indexText(TextReader.getDefaultReader(content, null),content.getId(),content.getName(), content, null);
+                    ingester.indexText(TextExtractorFactory.getDefaultReader(content, null),content.getId(),content.getName(), content, null);
                 } catch (Ingester.IngesterException ex1) {
                     throw new TskCoreException(ex.getCause().getMessage(), ex1);
                 }
@@ -446,10 +446,10 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
         try {
             String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
             Reader contentSpecificReader = 
-                    TextReader.getContentSpecificReader((Content) artifact, null);
+                    TextExtractorFactory.getContentSpecificReader((Content) artifact, null);
             ingester.indexMetaDataOnly(artifact, sourceName);
             ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
-        } catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
+        } catch (Ingester.IngesterException | TextExtractorFactory.NoReaderFoundException ex) {
             throw new TskCoreException(ex.getCause().getMessage(), ex);
         }
     }

From fe3927ea86138ea77f05d1dbd7d590243298a7b3 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Fri, 7 Dec 2018 17:23:14 -0500
Subject: [PATCH 14/18] Fixed public api

---
 .../textextractors/ArtifactTextExtractor.java | 12 +++-
 .../textextractors/HtmlTextExtractor.java     | 14 +++--
 .../textextractors/InitReaderException.java   | 23 +++++++
 .../textextractors/SqliteTextExtractor.java   | 13 ++--
 .../textextractors/StringsTextExtractor.java  | 24 +++----
 .../autopsy/textextractors/TextExtractor.java | 31 +++-------
 .../textextractors/TextExtractorFactory.java  | 62 ++++++++-----------
 .../textextractors/TikaTextExtractor.java     |  8 ++-
 .../KeywordSearchIngestModule.java            | 10 +--
 .../keywordsearch/SolrSearchService.java      | 18 +++---
 10 files changed, 115 insertions(+), 100 deletions(-)
 create mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/InitReaderException.java

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
index 070fa84231..304437de59 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
@@ -32,10 +32,16 @@ import org.sleuthkit.datamodel.TskCoreException;
  * Extracts text from artifacts by concatenating the values of all of the
  * artifact's attributes.
  */
-class ArtifactTextExtractor<T extends BlackboardArtifact> implements TextExtractor<T> {
+class ArtifactTextExtractor extends TextExtractor<Content> {
 
+    private final BlackboardArtifact artifact;
+    
+    public ArtifactTextExtractor(Content artifact) {
+        this.artifact = (BlackboardArtifact) artifact;
+    }
+    
     @Override
-    public Reader getReader(BlackboardArtifact artifact) throws InitReaderException {
+    public Reader getReader() throws InitReaderException {
         // Concatenate the string values of all attributes into a single
         // "content" string to be indexed.
         StringBuilder artifactContents = new StringBuilder();
@@ -77,7 +83,7 @@ class ArtifactTextExtractor<T extends BlackboardArtifact> implements TextExtract
     }
 
     @Override
-    public boolean isSupported(BlackboardArtifact file, String detectedFormat) {
+    public boolean isSupported(Content file, String detectedFormat) {
         return true;
     }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
index fd511e2779..a6f05e7aa3 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
@@ -32,16 +32,17 @@ import net.htmlparser.jericho.Source;
 import net.htmlparser.jericho.StartTag;
 import net.htmlparser.jericho.StartTagType;
 import org.sleuthkit.autopsy.coreutils.Logger;
-import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
  * Extracts text from HTML content.
  */
-final class HtmlTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
+final class HtmlTextExtractor extends TextExtractor<Content> {
 
     static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
     private final int MAX_SIZE;
+    private final Content file;
 
     static final List<String> WEB_MIME_TYPES = Arrays.asList(
             "application/javascript", //NON-NLS
@@ -61,9 +62,10 @@ final class HtmlTextExtractor<T extends AbstractFile> implements TextExtractor<T
      * Creates a default instance of the HtmlTextExtractor. Supported file size
      * is 50MB.
      */
-    public HtmlTextExtractor() {
+    public HtmlTextExtractor(Content file) {
         //Set default to be 50 MB.
         MAX_SIZE = 50_000_000;
+        this.file = file;
     }
 
     /**
@@ -75,7 +77,7 @@ final class HtmlTextExtractor<T extends AbstractFile> implements TextExtractor<T
      * @return flag indicating support
      */
     @Override
-    public boolean isSupported(AbstractFile content, String detectedFormat) {
+    public boolean isSupported(Content content, String detectedFormat) {
         return detectedFormat != null
                 && WEB_MIME_TYPES.contains(detectedFormat)
                 && content.getSize() <= MAX_SIZE;
@@ -91,10 +93,10 @@ final class HtmlTextExtractor<T extends AbstractFile> implements TextExtractor<T
      * @throws TextExtractorException
      */
     @Override
-    public Reader getReader(AbstractFile content) throws InitReaderException {
+    public Reader getReader() throws InitReaderException {
         //TODO JIRA-4467, there is only harm in excluding HTML documents greater
         //than 50MB due to our troubled approach of extraction.
-        ReadContentInputStream stream = new ReadContentInputStream(content);
+        ReadContentInputStream stream = new ReadContentInputStream(file);
 
         //Parse the stream with Jericho and put the results in a Reader
         try {
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/InitReaderException.java b/Core/src/org/sleuthkit/autopsy/textextractors/InitReaderException.java
new file mode 100755
index 0000000000..89f0beb057
--- /dev/null
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/InitReaderException.java
@@ -0,0 +1,23 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.textextractors;
+
+/**
+ * Checked exception declared by TextExtractor.getReader() for failures while initializing the Reader.
+ */
+public class InitReaderException extends Exception {
+    public InitReaderException(String msg, Throwable ex) {
+        super(msg, ex);
+    }
+
+    public InitReaderException(Throwable ex) {
+        super(ex);
+    }
+
+    public InitReaderException(String msg) {
+        super(msg);
+    }
+}
\ No newline at end of file
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
index 19bcd7ac87..e488cbbdba 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
@@ -28,6 +28,7 @@ import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
 import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.Content;
 
 /**
  * Extracts text from SQLite database files.
@@ -38,11 +39,15 @@ import org.sleuthkit.datamodel.AbstractFile;
  *  2) Tables that contain spaces in their name are not extracted
  *  3) Table names are not included in its output text
  */
-final class SqliteTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
+final class SqliteTextExtractor extends TextExtractor<Content> {
 
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
     private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
+    private final AbstractFile file;
 
+    public SqliteTextExtractor(Content file) {
+        this.file = (AbstractFile) file;
+    }
     /**
      * Supports only the sqlite mimetypes
      *
@@ -52,7 +57,7 @@ final class SqliteTextExtractor<T extends AbstractFile> implements TextExtractor
      * @return true if x-sqlite3
      */
     @Override
-    public boolean isSupported(AbstractFile file, String detectedFormat) {
+    public boolean isSupported(Content file, String detectedFormat) {
         return SQLITE_MIMETYPE.equals(detectedFormat);
     }
 
@@ -66,8 +71,8 @@ final class SqliteTextExtractor<T extends AbstractFile> implements TextExtractor
      * @throws TextExtractorException
      */
     @Override
-    public Reader getReader(AbstractFile source) throws InitReaderException {
-        return new SQLiteStreamReader(source);
+    public Reader getReader() throws InitReaderException {
+        return new SQLiteStreamReader(file);
     }
     
     /**
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
index e4a605420b..d078b46f6d 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
@@ -35,10 +35,11 @@ import org.sleuthkit.datamodel.TskException;
 /**
  * Extracts raw strings from content.
  */
-final class StringsTextExtractor<T extends Content> implements TextExtractor<T> {
+final class StringsTextExtractor extends TextExtractor<Content> {
 
     private boolean extractUTF8;
     private boolean extractUTF16;
+    private final Content content;
     private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
 
     private final List<SCRIPT> extractScripts = new ArrayList<>();
@@ -48,10 +49,11 @@ final class StringsTextExtractor<T extends Content> implements TextExtractor<T>
      * configured to run only LATIN_2 as its default extraction script and UTF-8
      * as its default encoding.
      */
-    public StringsTextExtractor() {
+    public StringsTextExtractor(Content content) {
         //LATIN_2 is the default script
         extractScripts.add(SCRIPT.LATIN_2);
         extractUTF8 = true;
+        this.content = content;
     }
 
     /**
@@ -79,7 +81,7 @@ final class StringsTextExtractor<T extends Content> implements TextExtractor<T>
      * org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
      */
     @Override
-    public InputStreamReader getReader(Content content) {
+    public InputStreamReader getReader() {
         InputStream stringStream = getInputStream(content);
         return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
     }
@@ -118,17 +120,6 @@ final class StringsTextExtractor<T extends Content> implements TextExtractor<T>
         }
     }
 
-    /**
-     * 
-     * @param file
-     * @param detectedFormat
-     * @return 
-     */
-    @Override
-    public boolean isSupported(T file, String detectedFormat) {
-        return true;
-    }
-
     /**
      * 
      * @return 
@@ -138,6 +129,11 @@ final class StringsTextExtractor<T extends Content> implements TextExtractor<T>
         return extractUTF8 || extractUTF16;
     }
 
+    @Override
+    boolean isSupported(Content file, String detectedFormat) {
+        return true; // strings extraction is the catch-all fallback, so every content/format is accepted
+    }
+
     /**
      * Content input string stream reader/converter - given Content, extract
      * strings from it and return encoded bytes via read()
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
index 7cc05d7432..7dadedbfe3 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
@@ -25,7 +25,7 @@ import java.io.Reader;
  *
  * @param <T> Generic data type T
  */
-interface TextExtractor<T> {
+public abstract class TextExtractor<T> {
     
      /**
      * Determines if the file content is supported by the extractor.
@@ -36,25 +36,25 @@ interface TextExtractor<T> {
      *
      * @return true if the file content is supported, false otherwise
      */
-    boolean isSupported(T file, String detectedFormat);
+    abstract boolean isSupported(T file, String detectedFormat);
     
     /**
      * 
      * @return 
      */
-    default boolean isEnabled() {
+    boolean isEnabled() {
         return true;
     }
+    
     /**
      * Get a reader that will iterate over the text extracted from the given
      * source.
      *
-     * @param source source content of type T
-     *
      * @return Reader instance that contains the text of the source
+     * @throws org.sleuthkit.autopsy.textextractors.InitReaderException
      *
      */
-    Reader getReader(T source) throws InitReaderException;
+    public abstract Reader getReader() throws InitReaderException;
        
     /**
      * Determines how the extraction process will proceed given the settings 
@@ -64,24 +64,7 @@ interface TextExtractor<T> {
      * 
      * @param context Instance containing file config classes
      */
-    default void setExtractionSettings(ExtractionContext context) {
+    void setExtractionSettings(ExtractionContext context) {
         //no-op by default
     }
-    
-    /**
-     * 
-     */
-    public class InitReaderException extends Exception {
-        public InitReaderException(String msg, Throwable ex) {
-            super(msg, ex);
-        }
-        
-        public InitReaderException(Throwable ex) {
-            super(ex);
-        }
-        
-        public InitReaderException(String msg) {
-            super(msg);
-        }
-    }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index ff90453f02..d88c9f6f72 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -18,7 +18,6 @@
  */
 package org.sleuthkit.autopsy.textextractors;
 
-import java.io.Reader;
 import java.util.Arrays;
 import java.util.List;
 import org.sleuthkit.datamodel.AbstractFile;
@@ -33,12 +32,6 @@ import org.sleuthkit.datamodel.Report;
  * extractors.
  */
 public class TextExtractorFactory {
-    
-    private final static List<TextExtractor<AbstractFile>> fileExtractors = Arrays.asList(
-                new HtmlTextExtractor<>(),
-                new SqliteTextExtractor<>(),
-                new TikaTextExtractor<>()
-        );
     /**
      * Auto detects the correct text extractor given the file.
      *
@@ -52,36 +45,35 @@ public class TextExtractorFactory {
      *
      * @return A Reader that contains file source text
      *
-     * @throws NoReaderFoundException In the event that the
+     * @throws NoTextExtractorFound In the event that the
      *                                             inputted file and mimetype
      *                                             have no corresponding
      *                                             extractor
      */
-    public static Reader getContentSpecificReader(Content file,
-            ExtractionContext context) throws NoReaderFoundException {
-        try {
-            if (file instanceof AbstractFile) {
-                String mimeType = ((AbstractFile) file).getMIMEType();
-                for (TextExtractor<AbstractFile> candidate : fileExtractors) {
-                    candidate.setExtractionSettings(context);
-                    if (candidate.isEnabled() && candidate.isSupported((AbstractFile)file, mimeType)) {
-                        return candidate.getReader((AbstractFile)file);
-                    }
+    public static TextExtractor<Content> getReader(Content file,
+            ExtractionContext context) throws NoTextExtractorFound {
+        if (file instanceof AbstractFile) {
+            String mimeType = ((AbstractFile) file).getMIMEType();
+            // Candidates in priority order; the first enabled extractor that supports the file wins.
+            List<TextExtractor<Content>> extractors = Arrays.asList(
+                    new HtmlTextExtractor(file), new SqliteTextExtractor(file), new TikaTextExtractor(file));
+            for(TextExtractor<Content> extractor : extractors) {
+                extractor.setExtractionSettings(context); // apply caller settings before testing/returning it
+                if(extractor.isEnabled() && extractor.isSupported(file, mimeType)) {
+                    return extractor;
                 }
-            } else if (file instanceof BlackboardArtifact) {
-                TextExtractor<BlackboardArtifact> artifactExtractor = new ArtifactTextExtractor<>();
-                artifactExtractor.setExtractionSettings(context);
-                return artifactExtractor.getReader((BlackboardArtifact)file);
-            } else if (file instanceof Report) {
-                TextExtractor<Report> reportExtractor = new TikaTextExtractor<>();
-                reportExtractor.setExtractionSettings(context);
-                return reportExtractor.getReader((Report)file);
             }
-        } catch (TextExtractor.InitReaderException ex) {
-            throw new NoReaderFoundException(ex);
+        } else if (file instanceof BlackboardArtifact) {
+            TextExtractor<Content> artifactExtractor = new ArtifactTextExtractor((BlackboardArtifact)file);
+            artifactExtractor.setExtractionSettings(context);
+            return artifactExtractor;
+        } else if (file instanceof Report) {
+            TextExtractor<Content> reportExtractor = new TikaTextExtractor(file);
+            reportExtractor.setExtractionSettings(context);
+            return reportExtractor;
         }
         
-        throw new NoReaderFoundException(
+        throw new NoTextExtractorFound(
                 String.format("Could not find a suitable extractor for "
                         + "file with name [%s] and id=[%d]. Try using the default, "
                         + "non content specific extractor as an alternative.",
@@ -99,23 +91,23 @@ public class TextExtractorFactory {
      *
      * @return A DefaultExtractor instance
      */
-    public static Reader getDefaultReader(Content source, ExtractionContext context) {
-        StringsTextExtractor stringsInstance = new StringsTextExtractor();
+    public static TextExtractor<Content> getDefaultReader(Content source, ExtractionContext context) {
+        StringsTextExtractor stringsInstance = new StringsTextExtractor(source);
         stringsInstance.setExtractionSettings(context);
-        return stringsInstance.getReader(source);
+        return stringsInstance;
     }
 
     /**
      * System level exception for handling content types that have no specific
      * strategy defined for extracting their text.
      */
-    public static class NoReaderFoundException extends Exception {
+    public static class NoTextExtractorFound extends Exception {
 
-        public NoReaderFoundException(String msg) {
+        public NoTextExtractorFound(String msg) {
             super(msg);
         }
         
-        public NoReaderFoundException(Throwable ex) {
+        public NoTextExtractorFound(Throwable ex) {
             super(ex);
         }
     }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index 200a8618fa..f1df671715 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -55,7 +55,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * Extracts text from Tika supported content. Protects against Tika
  * parser hangs (for unexpected/corrupt content) using a timeout mechanism.
  */
-final class TikaTextExtractor<T extends Content> implements TextExtractor<T> {
+final class TikaTextExtractor extends TextExtractor<Content> {
     
       //Mimetype groups to aassist extractor implementations in ignoring binary and 
     //archive files.
@@ -109,6 +109,7 @@ final class TikaTextExtractor<T extends Content> implements TextExtractor<T> {
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
 
     private final AutoDetectParser parser = new AutoDetectParser();
+    private final Content content;
     
     private boolean tesseractOCREnabled;
     private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
@@ -121,6 +122,9 @@ final class TikaTextExtractor<T extends Content> implements TextExtractor<T> {
                     .map(mt -> mt.getType() + "/" + mt.getSubtype())
                     .collect(Collectors.toList());
 
+    public TikaTextExtractor(Content content) {
+        this.content = content;
+    }
     /**
      * Returns a reader that will iterate over the text extracted from Apache 
      * Tika. 
@@ -131,7 +135,7 @@ final class TikaTextExtractor<T extends Content> implements TextExtractor<T> {
      * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException 
      */
     @Override
-    public Reader getReader(Content content) throws InitReaderException {
+    public Reader getReader() throws InitReaderException {
         ReadContentInputStream stream = new ReadContentInputStream(content);
 
         Metadata metadata = new Metadata();
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index 14ab05dff9..340e533eb0 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -44,6 +44,8 @@ import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
 import org.sleuthkit.autopsy.textextractors.ExtractionContext;
+import org.sleuthkit.autopsy.textextractors.InitReaderException;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
@@ -484,10 +486,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
             
             try {
-                Reader specializedReader = TextExtractorFactory.getContentSpecificReader(aFile,extractionContext);
+                Reader specializedReader = TextExtractorFactory.getReader(aFile,extractionContext).getReader();
                 //divide into chunks and index
                 return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
-            } catch (TextExtractorFactory.NoReaderFoundException ex) {
+            } catch (TextExtractorFactory.NoTextExtractorFound | InitReaderException ex) {
                 //No text extractor found... run the default instead
                 return false;
             }
@@ -506,7 +508,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                 if (context.fileIngestIsCancelled()) {
                     return true;
                 }
-                Reader stringsReader = TextExtractorFactory.getDefaultReader(aFile, stringsExtractionContext);
+                Reader stringsReader = TextExtractorFactory.getDefaultReader(aFile, stringsExtractionContext).getReader();
                 if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
                     putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
                     return true;
@@ -515,7 +517,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                     putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                     return false;
                 }
-            } catch (IngesterException ex) {
+            } catch (IngesterException | InitReaderException ex) {
                 logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex);  //NON-NLS
                 putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                 return false;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
index aaee181204..4d1694a424 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
@@ -46,6 +46,8 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
 import org.sleuthkit.autopsy.progress.ProgressIndicator;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
+import org.sleuthkit.autopsy.textextractors.InitReaderException;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
@@ -115,23 +117,23 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
             }
             try {
                 Reader blackboardReader = TextExtractorFactory
-                        .getContentSpecificReader(content, null);
+                        .getReader(content, null).getReader();
                 String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
                 ingester.indexMetaDataOnly(artifact, sourceName);
                 ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
-            } catch (Ingester.IngesterException | TextExtractorFactory.NoReaderFoundException ex) {
+            } catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | InitReaderException ex) {
                 throw new TskCoreException(ex.getCause().getMessage(), ex);
             }
         } else {
             try {
                 Reader contentReader = TextExtractorFactory
-                        .getContentSpecificReader(content, null);
+                        .getReader(content, null).getReader();
                 ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
-            } catch (TextExtractorFactory.NoReaderFoundException | Ingester.IngesterException ex) {
+            } catch (TextExtractorFactory.NoTextExtractorFound | InitReaderException | Ingester.IngesterException ex) {
                 try {
                     // Try the StringsTextExtractor if Tika extractions fails.
-                    ingester.indexText(TextExtractorFactory.getDefaultReader(content, null),content.getId(),content.getName(), content, null);
-                } catch (Ingester.IngesterException ex1) {
+                    ingester.indexText(TextExtractorFactory.getDefaultReader(content, null).getReader(),content.getId(),content.getName(), content, null);
+                } catch (Ingester.IngesterException | InitReaderException ex1) {
                     throw new TskCoreException(ex.getCause().getMessage(), ex1);
                 }
             }
@@ -446,10 +448,10 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
         try {
             String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
             Reader contentSpecificReader = 
-                    TextExtractorFactory.getContentSpecificReader((Content) artifact, null);
+                    TextExtractorFactory.getReader((Content) artifact, null).getReader();
             ingester.indexMetaDataOnly(artifact, sourceName);
             ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
-        } catch (Ingester.IngesterException | TextExtractorFactory.NoReaderFoundException ex) {
+        } catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | InitReaderException ex) {
             throw new TskCoreException(ex.getCause().getMessage(), ex);
         }
     }

From a7e78131c47f8ca033d90c18881132a28eae6aaf Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Fri, 7 Dec 2018 17:24:49 -0500
Subject: [PATCH 15/18] Rename TextExtractorFactory getReader methods to getExtractor

---
 .../autopsy/textextractors/TextExtractorFactory.java      | 4 ++--
 .../autopsy/keywordsearch/KeywordSearchIngestModule.java  | 4 ++--
 .../autopsy/keywordsearch/SolrSearchService.java          | 8 ++++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index d88c9f6f72..4b1b46b997 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -50,7 +50,7 @@ public class TextExtractorFactory {
      *                                             have no corresponding
      *                                             extractor
      */
-    public static TextExtractor<Content> getReader(Content file,
+    public static TextExtractor<Content> getExtractor(Content file,
             ExtractionContext context) throws NoTextExtractorFound {
         if (file instanceof AbstractFile) {
             String mimeType = ((AbstractFile) file).getMIMEType();   
@@ -91,7 +91,7 @@ public class TextExtractorFactory {
      *
      * @return A DefaultExtractor instance
      */
-    public static TextExtractor<Content> getDefaultReader(Content source, ExtractionContext context) {
+    public static TextExtractor<Content> getDefaultExtractor(Content source, ExtractionContext context) {
         StringsTextExtractor stringsInstance = new StringsTextExtractor(source);
         stringsInstance.setExtractionSettings(context);
         return stringsInstance;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index 340e533eb0..edb9e12b86 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -486,7 +486,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
             
             try {
-                Reader specializedReader = TextExtractorFactory.getReader(aFile,extractionContext).getReader();
+                Reader specializedReader = TextExtractorFactory.getExtractor(aFile,extractionContext).getReader();
                 //divide into chunks and index
                 return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
             } catch (TextExtractorFactory.NoTextExtractorFound | InitReaderException ex) {
@@ -508,7 +508,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                 if (context.fileIngestIsCancelled()) {
                     return true;
                 }
-                Reader stringsReader = TextExtractorFactory.getDefaultReader(aFile, stringsExtractionContext).getReader();
+                Reader stringsReader = TextExtractorFactory.getDefaultExtractor(aFile, stringsExtractionContext).getReader();
                 if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
                     putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
                     return true;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
index 4d1694a424..ea178cd203 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
@@ -117,7 +117,7 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
             }
             try {
                 Reader blackboardReader = TextExtractorFactory
-                        .getReader(content, null).getReader();
+                        .getExtractor(content, null).getReader();
                 String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
                 ingester.indexMetaDataOnly(artifact, sourceName);
                 ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
@@ -127,12 +127,12 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
         } else {
             try {
                 Reader contentReader = TextExtractorFactory
-                        .getReader(content, null).getReader();
+                        .getExtractor(content, null).getReader();
                 ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
             } catch (TextExtractorFactory.NoTextExtractorFound | InitReaderException | Ingester.IngesterException ex) {
                 try {
                     // Try the StringsTextExtractor if Tika extractions fails.
-                    ingester.indexText(TextExtractorFactory.getDefaultReader(content, null).getReader(),content.getId(),content.getName(), content, null);
+                    ingester.indexText(TextExtractorFactory.getDefaultExtractor(content, null).getReader(),content.getId(),content.getName(), content, null);
                 } catch (Ingester.IngesterException | InitReaderException ex1) {
                     throw new TskCoreException(ex.getCause().getMessage(), ex1);
                 }
@@ -448,7 +448,7 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
         try {
             String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
             Reader contentSpecificReader = 
-                    TextExtractorFactory.getReader((Content) artifact, null).getReader();
+                    TextExtractorFactory.getExtractor((Content) artifact, null).getReader();
             ingester.indexMetaDataOnly(artifact, sourceName);
             ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
         } catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | InitReaderException ex) {

From 41a0435d0f011c495323da0b1883d2381f1a0229 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Mon, 10 Dec 2018 10:33:22 -0500
Subject: [PATCH 16/18] Improved documentation and naming conventions

---
 .../textextractors/ArtifactTextExtractor.java |  10 +-
 .../textextractors/ExtractionContext.java     |  72 -----------
 .../textextractors/HtmlTextExtractor.java     |   6 +-
 .../textextractors/InitReaderException.java   |  23 ----
 .../textextractors/SqliteTextExtractor.java   |   4 +-
 .../textextractors/StringsTextExtractor.java  |  14 +-
 .../autopsy/textextractors/TextExtractor.java |  73 +++++++----
 .../textextractors/TextExtractorFactory.java  | 120 ++++++++++++------
 .../textextractors/TikaTextExtractor.java     |  22 ++--
 .../DefaultExtractionConfig.java              |  38 +++---
 .../ImageFileExtractionConfig.java            |  15 ++-
 .../KeywordSearchIngestModule.java            |  13 +-
 12 files changed, 203 insertions(+), 207 deletions(-)
 delete mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java
 delete mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/InitReaderException.java

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
index 304437de59..78ad29bef7 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
@@ -32,7 +32,7 @@ import org.sleuthkit.datamodel.TskCoreException;
  * Extracts text from artifacts by concatenating the values of all of the
  * artifact's attributes.
  */
-class ArtifactTextExtractor extends TextExtractor<Content> {
+class ArtifactTextExtractor extends TextExtractor {
 
     private final BlackboardArtifact artifact;
     
@@ -41,7 +41,7 @@ class ArtifactTextExtractor extends TextExtractor<Content> {
     }
     
     @Override
-    public Reader getReader() throws InitReaderException {
+    public Reader getReader() throws ExtractionException {
         // Concatenate the string values of all attributes into a single
         // "content" string to be indexed.
         StringBuilder artifactContents = new StringBuilder();
@@ -50,10 +50,10 @@ class ArtifactTextExtractor extends TextExtractor<Content> {
         try {
             dataSource = artifact.getDataSource();
         } catch (TskCoreException tskCoreException) {
-            throw new InitReaderException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
+            throw new ExtractionException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
         }
         if (dataSource == null) {
-            throw new InitReaderException("Datasource was null for artifact: " + artifact.toString());
+            throw new ExtractionException("Datasource was null for artifact: " + artifact.toString());
         }
 
         try {
@@ -75,7 +75,7 @@ class ArtifactTextExtractor extends TextExtractor<Content> {
                 artifactContents.append(System.lineSeparator());
             }
         } catch (TskCoreException tskCoreException) {
-            throw new InitReaderException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
+            throw new ExtractionException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
         }
 
         return new InputStreamReader(IOUtils.toInputStream(artifactContents,
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java b/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java
deleted file mode 100755
index d1e267ea6c..0000000000
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2018-2018 Basis Technology Corp.
- * Contact: carrier <at> sleuthkit <dot> org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.sleuthkit.autopsy.textextractors;
-
-import com.google.common.collect.ClassToInstanceMap;
-import com.google.common.collect.MutableClassToInstanceMap;
-
-/**
- * Stores extraction config instances for media types.
- *
- * TextExtractors parse this class when configuring their own extraction
- * settings.
- */
-public class ExtractionContext {
-
-    ClassToInstanceMap<Object> extractionConfigs;
-
-    public ExtractionContext() {
-        extractionConfigs = MutableClassToInstanceMap.create();
-    }
-
-    /**
-     * Internally stores a class-instance pair.
-     *
-     * @param <T>            Class type that will be stored
-     * @param configClass    The class object of the instance
-     * @param configInstance Config instance of type T
-     */
-    public <T> void set(Class<T> configClass, T configInstance) {
-        extractionConfigs.put(configClass, configInstance);
-    }
-
-    /**
-     * Retrieves the config instance associated with this key.
-     *
-     * @param <T>         Type of the stored instance
-     * @param configClass The class object of the instance
-     *
-     * @return The config instance of type T
-     */
-    public <T> T get(Class<T> configClass) {
-        return configClass.cast(extractionConfigs.get(configClass));
-    }
-
-    /**
-     * Indicates if this class key has been stored.
-     *
-     * @param <T>         Type of the stored instance
-     * @param configClass The class object of the instance
-     *
-     * @return flag indicating the presense of this instance
-     */
-    public <T> boolean contains(Class<T> configClass) {
-        return get(configClass) != null;
-    }
-}
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
index a6f05e7aa3..86dbd15c1b 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
@@ -38,7 +38,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 /**
  * Extracts text from HTML content.
  */
-final class HtmlTextExtractor extends TextExtractor<Content> {
+final class HtmlTextExtractor extends TextExtractor {
 
     static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
     private final int MAX_SIZE;
@@ -93,7 +93,7 @@ final class HtmlTextExtractor extends TextExtractor<Content> {
      * @throws TextExtractorException
      */
     @Override
-    public Reader getReader() throws InitReaderException {
+    public Reader getReader() throws ExtractionException {
         //TODO JIRA-4467, there is only harm in excluding HTML documents greater
         //than 50MB due to our troubled approach of extraction.
         ReadContentInputStream stream = new ReadContentInputStream(file);
@@ -190,7 +190,7 @@ final class HtmlTextExtractor extends TextExtractor<Content> {
             return new StringReader(stringBuilder.toString());
         } catch (IOException ex) {
             logger.log(Level.WARNING, "Error extracting HTML from content.", ex);
-            throw new InitReaderException("Error extracting HTML from content.", ex);
+            throw new ExtractionException("Error extracting HTML from content.", ex);
         }
     }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/InitReaderException.java b/Core/src/org/sleuthkit/autopsy/textextractors/InitReaderException.java
deleted file mode 100755
index 89f0beb057..0000000000
--- a/Core/src/org/sleuthkit/autopsy/textextractors/InitReaderException.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
- */
-package org.sleuthkit.autopsy.textextractors;
-
-/**
- * 
- */
-public class InitReaderException extends Exception {
-    public InitReaderException(String msg, Throwable ex) {
-        super(msg, ex);
-    }
-
-    public InitReaderException(Throwable ex) {
-        super(ex);
-    }
-
-    public InitReaderException(String msg) {
-        super(msg);
-    }
-}
\ No newline at end of file
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
index e488cbbdba..ea204d5e30 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
@@ -39,7 +39,7 @@ import org.sleuthkit.datamodel.Content;
  *  2) Tables that contain spaces in their name are not extracted
  *  3) Table names are not included in its output text
  */
-final class SqliteTextExtractor extends TextExtractor<Content> {
+final class SqliteTextExtractor extends TextExtractor {
 
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
     private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
@@ -71,7 +71,7 @@ final class SqliteTextExtractor extends TextExtractor<Content> {
      * @throws TextExtractorException
      */
     @Override
-    public Reader getReader() throws InitReaderException {
+    public Reader getReader() throws ExtractionException {
         return new SQLiteStreamReader(file);
     }
     
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
index d078b46f6d..7ed6683c08 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
@@ -25,6 +25,7 @@ import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
+import org.openide.util.Lookup;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
@@ -35,7 +36,7 @@ import org.sleuthkit.datamodel.TskException;
 /**
  * Extracts raw strings from content.
  */
-final class StringsTextExtractor extends TextExtractor<Content> {
+final class StringsTextExtractor extends TextExtractor {
 
     private boolean extractUTF8;
     private boolean extractUTF16;
@@ -102,12 +103,15 @@ final class StringsTextExtractor extends TextExtractor<Content> {
      * See the DefaultExtractionConfig class in the extractionconfigs package
      * for available settings.
      *
-     * @param context Instance containing config classes
+     * @param context Lookup instance containing config classes
      */
     @Override
-    public void setExtractionSettings(ExtractionContext context) {
-        if (context != null && context.contains(DefaultExtractionConfig.class)) {
-            DefaultExtractionConfig configInstance = context.get(DefaultExtractionConfig.class);
+    public void setExtractionSettings(Lookup context) {
+        if(context != null) {
+            DefaultExtractionConfig configInstance = context.lookup(DefaultExtractionConfig.class);
+            if(configInstance == null) {
+                return;
+            }
             if (Objects.nonNull(configInstance.getExtractUTF8())) {
                 extractUTF8 = configInstance.getExtractUTF8();
             }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
index 7dadedbfe3..d548ea39ec 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
@@ -19,15 +19,22 @@
 package org.sleuthkit.autopsy.textextractors;
 
 import java.io.Reader;
+import org.openide.util.Lookup;
+import org.sleuthkit.datamodel.Content;
 
 /**
- * Extracts text out of Objects and exposes it as a Reader.
+ * Extracts the text out of {@link org.sleuthkit.datamodel.Content} instances
+ * and exposes them as a {@link java.io.Reader}. Concrete implementations can be
+ * obtained from
+ * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content)}
+ * or
+ * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content, org.openide.util.Lookup)}.
  *
- * @param <T> Generic data type T
+ * @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
  */
-public abstract class TextExtractor<T> {
-    
-     /**
+public abstract class TextExtractor {
+
+    /**
      * Determines if the file content is supported by the extractor.
      *
      * @param file           to test if its content should be supported
@@ -36,35 +43,57 @@ public abstract class TextExtractor<T> {
      *
      * @return true if the file content is supported, false otherwise
      */
-    abstract boolean isSupported(T file, String detectedFormat);
-    
+    abstract boolean isSupported(Content file, String detectedFormat);
+
     /**
-     * 
-     * @return 
+     * Determines if the TextExtractor instance is enabled to read content.
+     *
+     * @return true if this extractor is enabled, false otherwise
      */
     boolean isEnabled() {
         return true;
     }
-    
+
     /**
-     * Get a reader that will iterate over the text extracted from the given
-     * source.
+     * Get a {@link java.io.Reader} that will iterate over the text extracted
+     * from the {@link org.sleuthkit.datamodel.Content} used to create this
+     * TextExtractor instance.
      *
-     * @return Reader instance that contains the text of the source
-     * @throws org.sleuthkit.autopsy.textextractors.InitReaderException
+     * @return {@link java.io.Reader} that contains the text of the underlying
+     *         {@link org.sleuthkit.datamodel.Content}
+     * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException
+     *
+     * @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
      *
      */
-    public abstract Reader getReader() throws InitReaderException;
-       
+    public abstract Reader getReader() throws ExtractionException;
+
     /**
-     * Determines how the extraction process will proceed given the settings 
-     * stored in this context instance.
-     * 
-     * See the extractionconfigs package for available file configurations.
-     * 
+     * Determines how the extraction process will proceed given the settings
+     * stored in the context instance.
+     *
      * @param context Instance containing file config classes
      */
-    void setExtractionSettings(ExtractionContext context) {
+    void setExtractionSettings(Lookup context) {
         //no-op by default
     }
+    
+    /**
+     * Thrown by {@link org.sleuthkit.autopsy.textextractors.TextExtractor#getReader()}
+     * on an internal parsing error. Static so no enclosing extractor is required.
+     */
+    public static class ExtractionException extends Exception {
+        /** Creates an exception with a detail message and the underlying cause. */
+        public ExtractionException(String msg, Throwable ex) {
+            super(msg, ex);
+        }
+        /** Creates an exception that wraps the underlying cause. */
+        public ExtractionException(Throwable ex) {
+            super(ex);
+        }
+        /** Creates an exception with a detail message only. */
+        public ExtractionException(String msg) {
+            super(msg);
+        }
+    }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index 4b1b46b997..13ded777d6 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -20,79 +20,115 @@ package org.sleuthkit.autopsy.textextractors;
 
 import java.util.Arrays;
 import java.util.List;
+import org.openide.util.Lookup;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.Report;
 
 /**
- * Factory for creating text extractors given a source file
+ * Factory for creating
+ * {@link org.sleuthkit.autopsy.textextractors.TextExtractor}'s given a
+ * {@link org.sleuthkit.datamodel.Content} instance
  *
- * See ContentTextExtractor interface for the generic structure of such
- * extractors.
+ * See {@link org.sleuthkit.autopsy.textextractors.extractionconfigs} for
+ * available {@link org.sleuthkit.autopsy.textextractors.TextExtractor}
+ * configuration options.
+ *
+ * @see org.openide.util.Lookup
  */
 public class TextExtractorFactory {
+
     /**
-     * Auto detects the correct text extractor given the file.
+     * Auto detects the correct
+     * {@link org.sleuthkit.autopsy.textextractors.TextExtractor} given the
+     * {@link org.sleuthkit.datamodel.Content}.
      *
-     * ContentTextExtractor can be configured using the ExtractionContext
-     * object. Passing in null or a new unmodified instance of ExtractionContext
-     * will keep the extractors at default settings. Refer to the
-     * extractionconfigs package for available file configurations.
+     * See {@link org.sleuthkit.autopsy.textextractors.extractionconfigs} for
+     * available {@link org.sleuthkit.autopsy.textextractors.TextExtractor}
+     * configuration options.
      *
-     * @param file    Content source that will be read from
+     * @param content Content source that will be read from
      * @param context Contains extraction configurations for certain file types
      *
-     * @return A Reader that contains file source text
+     * @return A TextExtractor that supports the given content. File text can be
+     *         obtained from {@link TextExtractor#getReader()}.
      *
-     * @throws NoTextExtractorFound In the event that the
-     *                                             inputted file and mimetype
-     *                                             have no corresponding
-     *                                             extractor
+     * @throws NoTextExtractorFound Encountered when there is no TextExtractor
+     *                              was found for the given content type. Use {@link
+     *                              TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content,
+     *                              org.openide.util.Lookup)}
+     *
+     * @see org.openide.util.Lookup
      */
-    public static TextExtractor<Content> getExtractor(Content file,
-            ExtractionContext context) throws NoTextExtractorFound {
-        if (file instanceof AbstractFile) {
-            String mimeType = ((AbstractFile) file).getMIMEType();   
-            List<TextExtractor<Content>> extractors = Arrays.asList(
-                    new HtmlTextExtractor(file),
-                    new SqliteTextExtractor(file), 
-                    new TikaTextExtractor(file));
-            for(TextExtractor<Content> extractor : extractors) {
-                if(extractor.isEnabled() && extractor.isSupported(file, mimeType)) {
+    public static TextExtractor getExtractor(Content content,
+            Lookup context) throws NoTextExtractorFound {
+        if (content instanceof AbstractFile) {
+            String mimeType = ((AbstractFile) content).getMIMEType();
+            List<TextExtractor> extractors = Arrays.asList(
+                    new HtmlTextExtractor(content),
+                    new SqliteTextExtractor(content),
+                    new TikaTextExtractor(content));
+            for (TextExtractor extractor : extractors) {
+                extractor.setExtractionSettings(context);
+                if (extractor.isEnabled() && extractor.isSupported(content, mimeType)) {
                     return extractor;
                 }
             }
-        } else if (file instanceof BlackboardArtifact) {
-            TextExtractor<Content> artifactExtractor = new ArtifactTextExtractor((BlackboardArtifact)file);
+        } else if (content instanceof BlackboardArtifact) {
+            TextExtractor artifactExtractor = new ArtifactTextExtractor((BlackboardArtifact) content);
             artifactExtractor.setExtractionSettings(context);
             return artifactExtractor;
-        } else if (file instanceof Report) {
-            TextExtractor<Content> reportExtractor = new TikaTextExtractor(file);
+        } else if (content instanceof Report) {
+            TextExtractor reportExtractor = new TikaTextExtractor(content);
             reportExtractor.setExtractionSettings(context);
             return reportExtractor;
         }
-        
+
         throw new NoTextExtractorFound(
                 String.format("Could not find a suitable extractor for "
-                        + "file with name [%s] and id=[%d]. Try using the default, "
+                        + "content with name [%s] and id=[%d]. Try using the default, "
                         + "non content specific extractor as an alternative.",
-                        file.getName(), file.getId())
+                        content.getName(), content.getId())
         );
     }
 
     /**
-     * Returns the default extractor that can be run on any content type. This
-     * extractor should be used as a backup in the event that no specialized
-     * extractor can be found.
+     * Auto detects the correct
+     * {@link org.sleuthkit.autopsy.textextractors.TextExtractor} given the
+     * {@link org.sleuthkit.datamodel.Content}.
      *
-     * @param source Content source to read from
+     * @param content Content instance that will be read from
+     *
+     * @return A TextExtractor that supports the given content. File text can be
+     *         obtained from {@link TextExtractor#getReader()}.
+     *
+     * @throws NoTextExtractorFound Encountered when no TextExtractor
+     *                              was found for the given content type. Use {@link
+     *                              TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content,
+     *                              org.openide.util.Lookup)}
+     */
+    public static TextExtractor getExtractor(Content content)
+            throws NoTextExtractorFound {
+        return getExtractor(content, null);
+    }
+
+    /**
+     * Returns the default extractor that can be run on any content type. This
+     * extractor should be used as a backup in the event that no extractor was
+     * found using {@link TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content, org.openide.util.Lookup)}
+     * or {@link TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content)}.
+     *
+     * @param content Content source to read from
      * @param context Contains extraction configurations for certain file types
      *
-     * @return A DefaultExtractor instance
+     * @return A DefaultExtractor instance. File text can be obtained from
+     *         {@link TextExtractor#getReader()}.
+     *
+     * @see org.openide.util.Lookup
      */
-    public static TextExtractor<Content> getDefaultExtractor(Content source, ExtractionContext context) {
-        StringsTextExtractor stringsInstance = new StringsTextExtractor(source);
+    public static TextExtractor getDefaultExtractor(Content content, Lookup context) {
+        TextExtractor stringsInstance = new StringsTextExtractor(content);
         stringsInstance.setExtractionSettings(context);
         return stringsInstance;
     }
@@ -100,13 +136,19 @@ public class TextExtractorFactory {
     /**
      * System level exception for handling content types that have no specific
      * strategy defined for extracting their text.
+     *
+     * @see
+     * org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content)
+     * @see
+     * org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content,
+     * org.openide.util.Lookup)
      */
     public static class NoTextExtractorFound extends Exception {
 
         public NoTextExtractorFound(String msg) {
             super(msg);
         }
-        
+
         public NoTextExtractorFound(Throwable ex) {
             super(ex);
         }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index f1df671715..b80bbe8b74 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -46,6 +46,7 @@ import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.openide.util.NbBundle;
 import org.openide.modules.InstalledFileLocator;
+import org.openide.util.Lookup;
 import org.sleuthkit.autopsy.coreutils.PlatformUtil;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
 import org.sleuthkit.datamodel.Content;
@@ -55,7 +56,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * Extracts text from Tika supported content. Protects against Tika
  * parser hangs (for unexpected/corrupt content) using a timeout mechanism.
  */
-final class TikaTextExtractor extends TextExtractor<Content> {
+final class TikaTextExtractor extends TextExtractor {
     
       //Mimetype groups to aassist extractor implementations in ignoring binary and 
     //archive files.
@@ -135,7 +136,7 @@ final class TikaTextExtractor extends TextExtractor<Content> {
      * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException 
      */
     @Override
-    public Reader getReader() throws InitReaderException {
+    public Reader getReader() throws ExtractionException {
         ReadContentInputStream stream = new ReadContentInputStream(content);
 
         Metadata metadata = new Metadata();
@@ -182,7 +183,7 @@ final class TikaTextExtractor extends TextExtractor<Content> {
             PushbackReader pushbackReader = new PushbackReader(tikaReader);
             int read = pushbackReader.read();
             if (read == -1) {
-                throw new InitReaderException("Unable to extract text: Tika returned empty reader for " + content);
+                throw new ExtractionException("Unable to extract text: Tika returned empty reader for " + content);
             }
             pushbackReader.unread(read);
 
@@ -191,13 +192,13 @@ final class TikaTextExtractor extends TextExtractor<Content> {
             return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
         } catch (TimeoutException te) {
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
-            throw new InitReaderException(msg, te);
-        } catch (InitReaderException ex) {
+            throw new ExtractionException(msg, te);
+        } catch (ExtractionException ex) {
             throw ex;
         } catch (Exception ex) {
             tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
-            throw new InitReaderException(msg, ex);
+            throw new ExtractionException(msg, ex);
         } finally {
             future.cancel(true);
         }
@@ -296,9 +297,12 @@ final class TikaTextExtractor extends TextExtractor<Content> {
      * @param context Instance containing config classes
      */
     @Override
-    public void setExtractionSettings(ExtractionContext context) {
-        if(context != null && context.contains(ImageFileExtractionConfig.class)) {
-            ImageFileExtractionConfig configInstance = context.get(ImageFileExtractionConfig.class);
+    public void setExtractionSettings(Lookup context) {
+        if(context != null) {
+            ImageFileExtractionConfig configInstance = context.lookup(ImageFileExtractionConfig.class);
+            if(configInstance == null) {
+                return;
+            }
             if(Objects.nonNull(configInstance.getOCREnabled())) {
                 this.tesseractOCREnabled = configInstance.getOCREnabled();
             }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/DefaultExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/DefaultExtractionConfig.java
index c852d7baa1..865058659f 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/DefaultExtractionConfig.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/DefaultExtractionConfig.java
@@ -22,17 +22,21 @@ import java.util.List;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
 
 /**
- * Allows for configuration of the StringsTextExtractor (the default extractor
- * for all content types).
+ * Allows for configuration of the {@link TextExtractor} obtained from
+ * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content, org.openide.util.Lookup)}.
+ *
+ * @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
+ * @see org.openide.util.Lookup
  */
 public class DefaultExtractionConfig {
+
     private Boolean extractUTF8;
     private Boolean extractUTF16;
     private List<SCRIPT> extractScripts;
 
     /**
-     * Enables the UTF-8 encoding to be used during strings extraction.
-     * 
+     * Enables UTF-8 encoding to be used during extraction.
+     *
      * @param enabled Flag indicating if UTF-8 should be turned on
      */
     public void setExtractUTF8(boolean enabled) {
@@ -40,8 +44,8 @@ public class DefaultExtractionConfig {
     }
 
     /**
-     * Enables the UTF-16 encoding to be used during strings extraction.
-     * 
+     * Enables UTF-16 encoding to be used during extraction.
+     *
      * @param enabled Flag indicating if UTF-16 should be turned on
      */
     public void setExtractUTF16(boolean enabled) {
@@ -50,7 +54,7 @@ public class DefaultExtractionConfig {
 
     /**
      * Returns whether extracting with UTF-8 encoding should be done.
-     * 
+     *
      * @return Flag indicating if UTF-8 has been turned on/off
      */
     public Boolean getExtractUTF8() {
@@ -59,29 +63,31 @@ public class DefaultExtractionConfig {
 
     /**
      * Return whether extracting with UTF-16 encoding should be done.
-     * 
+     *
      * @return Flag indicating if UTF-16 has been turned on/off
      */
-    public Boolean getExtractUTF16() { 
+    public Boolean getExtractUTF16() {
         return extractUTF16;
     }
-    
+
     /**
-     * Sets the type of extraction scripts that will be used during this 
-     * extraction.
-     * 
+     * Sets the type of extraction scripts that will be used during this
+     * extraction. See
+     * {@link org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT}
+     * for more information about available scripts.
+     *
      * @param scripts Desired set of scripts to be used during extraction
      */
     public void setExtractScripts(List<SCRIPT> scripts) {
         this.extractScripts = scripts;
     }
-    
+
     /**
      * Gets the desired set of scripts to be used during extraction.
-     * 
+     *
      * @return Set of extraction scripts to be used
      */
     public List<SCRIPT> getExtractScripts() {
         return this.extractScripts;
     }
-}
\ No newline at end of file
+}
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
index 8051c8c8c2..db6550becc 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
@@ -20,14 +20,23 @@ package org.sleuthkit.autopsy.textextractors.extractionconfigs;
 
 /**
  * Allows for configuration of image file extraction.
+ * {@link org.sleuthkit.autopsy.textextractors.TextExtractor}'s that use
+ * ImageFileExtractionConfig can be obtained through
+ * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content)}
+ * or
+ * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content, org.openide.util.Lookup)}.
+ *
+ * @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
+ * @see org.openide.util.Lookup
  */
 public class ImageFileExtractionConfig {
+
     private Boolean OCREnabled;
-    
+
     /**
      * Enables OCR to be run on the text extractor responsible for handling
      * image files.
-     * 
+     *
      * @param enabled Flag indicating if OCR is enabled.
      */
     public void setOCREnabled(boolean enabled) {
@@ -36,7 +45,7 @@ public class ImageFileExtractionConfig {
 
     /**
      * Gets the OCR flag that has been set. By default this flag is turned off.
-     * 
+     *
      * @return Flag indicating if OCR is enabled.
      */
     public boolean getOCREnabled() {
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index edb9e12b86..d55fe0ae52 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -26,8 +26,10 @@ import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.logging.Level;
 import org.openide.util.Exceptions;
+import org.openide.util.Lookup;
 import org.openide.util.NbBundle;
 import org.openide.util.NbBundle.Messages;
+import org.openide.util.lookup.Lookups;
 import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
 import org.sleuthkit.autopsy.coreutils.Logger;
@@ -43,7 +45,6 @@ import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorEx
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
-import org.sleuthkit.autopsy.textextractors.ExtractionContext;
 import org.sleuthkit.autopsy.textextractors.InitReaderException;
 import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
@@ -144,7 +145,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     //accessed read-only by searcher thread
 
     private boolean startedSearching = false;
-    private ExtractionContext stringsExtractionContext;
+    private Lookup stringsExtractionContext;
     private final KeywordSearchJobSettings settings;
     private boolean initialized = false;
     private long jobId;
@@ -289,8 +290,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                 }
             }
         }
-
-        stringsExtractionContext = new ExtractionContext();
         
         DefaultExtractionConfig stringsConfig = new DefaultExtractionConfig();
         Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
@@ -298,7 +297,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
         stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts());
         
-        stringsExtractionContext.set(DefaultExtractionConfig.class, stringsConfig);
+        stringsExtractionContext = Lookups.fixed(stringsConfig);
         
         indexer = new Indexer();
         initialized = true;
@@ -479,11 +478,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
          * @throws IngesterException exception thrown if indexing failed
          */
         private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-            ExtractionContext extractionContext = new ExtractionContext();
-            
             ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig();
             imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
-            extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
+            Lookup extractionContext = Lookups.fixed(imageConfig);
             
             try {
                 Reader specializedReader = TextExtractorFactory.getExtractor(aFile,extractionContext).getReader();

From ab86d881262e20b6b8dbf5292c9366b15a77d52b Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Mon, 10 Dec 2018 16:21:12 -0500
Subject: [PATCH 17/18] Added tons of documentation and made tesseract look for
 language packs programically

---
 .../textextractors/ArtifactTextExtractor.java |  4 +-
 .../textextractors/StringsTextExtractor.java  |  8 +-
 .../autopsy/textextractors/TextExtractor.java | 40 ++++----
 .../textextractors/TextExtractorFactory.java  |  3 +-
 .../textextractors/TikaTextExtractor.java     | 95 +++++++++++++------
 .../DefaultExtractionConfig.java              |  9 +-
 .../ImageFileExtractionConfig.java            |  2 +-
 .../KeywordSearchIngestModule.java            |  6 +-
 .../keywordsearch/SolrSearchService.java      | 10 +-
 9 files changed, 114 insertions(+), 63 deletions(-)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
index 78ad29bef7..ba91a6cc3a 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
@@ -35,11 +35,11 @@ import org.sleuthkit.datamodel.TskCoreException;
 class ArtifactTextExtractor extends TextExtractor {
 
     private final BlackboardArtifact artifact;
-    
+
     public ArtifactTextExtractor(Content artifact) {
         this.artifact = (BlackboardArtifact) artifact;
     }
-    
+
     @Override
     public Reader getReader() throws ExtractionException {
         // Concatenate the string values of all attributes into a single
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
index 7ed6683c08..899cec9ef2 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
@@ -107,9 +107,9 @@ final class StringsTextExtractor extends TextExtractor {
      */
     @Override
     public void setExtractionSettings(Lookup context) {
-        if(context != null) {
+        if (context != null) {
             DefaultExtractionConfig configInstance = context.lookup(DefaultExtractionConfig.class);
-            if(configInstance == null) {
+            if (configInstance == null) {
                 return;
             }
             if (Objects.nonNull(configInstance.getExtractUTF8())) {
@@ -125,8 +125,8 @@ final class StringsTextExtractor extends TextExtractor {
     }
 
     /**
-     * 
-     * @return 
+     *
+     * @return
      */
     @Override
     public boolean isEnabled() {
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
index d548ea39ec..d75d4bb1a6 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
@@ -56,12 +56,14 @@ public abstract class TextExtractor {
 
     /**
      * Get a {@link java.io.Reader} that will iterate over the text extracted
-     * from the {@link org.sleuthkit.datamodel.Content} used to create this
-     * TextExtractor instance.
+     * from the {@link org.sleuthkit.datamodel.Content} passed into
+     * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory}.
      *
      * @return {@link java.io.Reader} that contains the text of the underlying
      *         {@link org.sleuthkit.datamodel.Content}
-     * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException
+     *
+     * @throws
+     * org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException
      *
      * @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
      *
@@ -77,23 +79,25 @@ public abstract class TextExtractor {
     void setExtractionSettings(Lookup context) {
         //no-op by default
     }
-    
+
     /**
-    * Exception encountered during {@link org.sleuthkit.autopsy.textextractors.TextExtractor#getReader()}.
-    * This indicates that there was an internal parsing error that occurred during the 
-    */
-   public class ExtractionException extends Exception {
+     * Exception encountered during
+     * {@link org.sleuthkit.autopsy.textextractors.TextExtractor#getReader()}.
+     * This indicates that there was an internal parsing error that occurred
+     * during the reading of Content text.
+     */
+    public class ExtractionException extends Exception {
 
-       public ExtractionException(String msg, Throwable ex) {
-           super(msg, ex);
-       }
+        public ExtractionException(String msg, Throwable ex) {
+            super(msg, ex);
+        }
 
-       public ExtractionException(Throwable ex) {
-           super(ex);
-       }
+        public ExtractionException(Throwable ex) {
+            super(ex);
+        }
 
-       public ExtractionException(String msg) {
-           super(msg);
-       }
-   }
+        public ExtractionException(String msg) {
+            super(msg);
+        }
+    }
 }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index 13ded777d6..10f2e84ba0 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -52,7 +52,8 @@ public class TextExtractorFactory {
      * @param context Contains extraction configurations for certain file types
      *
      * @return A TextExtractor that supports the given content. File text can be
-     *         obtained from {@link TextExtractor#getReader()}.
+     *         obtained from
+     *         {@link org.sleuthkit.autopsy.textextractors.TextExtractor#getReader()}.
      *
      * @throws NoTextExtractorFound Encountered when there is no TextExtractor
      *                              was found for the given content type. Use {@link
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index b80bbe8b74..8ba64e4f0d 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -25,6 +25,8 @@ import java.io.IOException;
 import java.io.PushbackReader;
 import java.io.Reader;
 import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Objects;
 import java.util.concurrent.ExecutorService;
@@ -35,6 +37,7 @@ import java.util.concurrent.TimeoutException;
 import java.util.logging.Level;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
+import org.apache.commons.io.FilenameUtils;
 import org.apache.tika.Tika;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
@@ -53,12 +56,12 @@ import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
- * Extracts text from Tika supported content. Protects against Tika
- * parser hangs (for unexpected/corrupt content) using a timeout mechanism.
+ * Extracts text from Tika supported content. Protects against Tika parser hangs
+ * (for unexpected/corrupt content) using a timeout mechanism.
  */
 final class TikaTextExtractor extends TextExtractor {
-    
-      //Mimetype groups to aassist extractor implementations in ignoring binary and 
+
+    //Mimetype groups to assist extractor implementations in ignoring binary and 
     //archive files.
     private static final List<String> BINARY_MIME_TYPES
             = ImmutableList.of(
@@ -66,8 +69,10 @@ final class TikaTextExtractor extends TextExtractor {
                     "application/octet-stream", //NON-NLS
                     "application/x-msdownload"); //NON-NLS
 
-    /** generally text extractors should ignore archives and let unpacking
-     * modules take care of them */
+    /**
+     * generally text extractors should ignore archives and let unpacking
+     * modules take care of them
+     */
     private static final List<String> ARCHIVE_MIME_TYPES
             = ImmutableList.of(
                     //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
@@ -103,15 +108,14 @@ final class TikaTextExtractor extends TextExtractor {
                     "application/x-z", //NON-NLS
                     "application/x-compress"); //NON-NLS
 
-    
     private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
-    
+
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
 
     private final AutoDetectParser parser = new AutoDetectParser();
     private final Content content;
-    
+
     private boolean tesseractOCREnabled;
     private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
     private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
@@ -126,14 +130,17 @@ final class TikaTextExtractor extends TextExtractor {
     public TikaTextExtractor(Content content) {
         this.content = content;
     }
+
     /**
-     * Returns a reader that will iterate over the text extracted from Apache 
-     * Tika. 
-     * 
+     * Returns a reader that will iterate over the text extracted from Apache
+     * Tika.
+     *
      * @param content Supported source content to extract
+     *
      * @return Reader that contains Apache Tika extracted text
-     * 
-     * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException 
+     *
+     * @throws
+     * org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException
      */
     @Override
     public Reader getReader() throws ExtractionException {
@@ -149,28 +156,29 @@ final class TikaTextExtractor extends TextExtractor {
         officeParserConfig.setUseSAXPptxExtractor(true);
         officeParserConfig.setUseSAXDocxExtractor(true);
         parseContext.set(OfficeParserConfig.class, officeParserConfig);
-        
+
         // configure OCR if it is enabled in KWS settings and installed on the machine
         if (TESSERACT_PATH != null && tesseractOCREnabled && PlatformUtil.isWindowsOS() == true) {
-            
+
             // configure PDFParser. 
             PDFParserConfig pdfConfig = new PDFParserConfig();
-            
+
             // Extracting the inline images and letting Tesseract run on each inline image.
             // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
             // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
-            pdfConfig.setExtractInlineImages(true); 
+            pdfConfig.setExtractInlineImages(true);
             // Multiple pages within a PDF file might refer to the same underlying image.
-            pdfConfig.setExtractUniqueInlineImagesOnly(true);            
+            pdfConfig.setExtractUniqueInlineImagesOnly(true);
             parseContext.set(PDFParserConfig.class, pdfConfig);
-            
+
             // Configure Tesseract parser to perform OCR
             TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
             String tesseractFolder = TESSERACT_PATH.getParent();
             ocrConfig.setTesseractPath(tesseractFolder);
             // Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata".
             // If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them
-            ocrConfig.setLanguage("eng");
+            System.out.println(getLanguagePacks());
+            ocrConfig.setLanguage(getLanguagePacks());
             parseContext.set(TesseractOCRConfig.class, ocrConfig);
         }
 
@@ -246,9 +254,10 @@ final class TikaTextExtractor extends TextExtractor {
 
     /**
      * Determines if Tika is supported for this content type and mimetype.
-     * 
-     * @param content Source content to read
+     *
+     * @param content        Source content to read
      * @param detectedFormat Mimetype of content
+     *
      * @return Flag indicating support for reading content type
      */
     @Override
@@ -264,6 +273,36 @@ final class TikaTextExtractor extends TextExtractor {
         return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
     }
 
+    /**
+     * Retrieves the names of all language packs installed in the tessdata
+     * directory and joins them with '+' to configure Tesseract OCR.
+     *
+     * @return String of all language packs available for Tesseract to use
+     */
+    private String getLanguagePacks() {
+        File languagePackRootDir = new File(TESSERACT_PATH.getParent(), "tessdata");
+        //Acceptable extensions for Tesseract-OCR version 3.05 language packs.
+        //All extensions other than traineddata are associated with cube files that
+        //have been made obsolete since version 4.0.
+        List<String> acceptableExtensions = Arrays.asList("traineddata", "params",
+                "lm", "fold", "bigrams", "nn", "word-freq", "size",
+                "user-patterns", "user-words");
+        //Pull out only unique languagePacks
+        HashSet<String> languagePacks = new HashSet<>();
+        //listFiles() returns null when tessdata is missing or is not a readable directory
+        File[] packFiles = languagePackRootDir.listFiles();
+        for (File languagePack : (packFiles == null) ? new File[0] : packFiles) {
+            String fileName = languagePack.getName();
+            //Pack name is the text before the first '.', e.g. "chi_sim", not a 3 letter prefix
+            int firstDot = fileName.indexOf('.');
+            if (firstDot > 0 && !languagePack.isDirectory()
+                    && acceptableExtensions.contains(FilenameUtils.getExtension(fileName))) {
+                languagePacks.add(fileName.substring(0, firstDot));
+            }
+        }
+        return String.join("+", languagePacks);
+    }
+
     /**
      * Return timeout that should be used to index the content.
      *
@@ -288,7 +327,7 @@ final class TikaTextExtractor extends TextExtractor {
     }
 
     /**
-     * Determines how the extraction process will proceed given the settings 
+     * Determines how the extraction process will proceed given the settings
      * stored in this context instance.
      *
      * See the ImageFileExtractionConfig class in the extractionconfigs package
@@ -298,15 +337,15 @@ final class TikaTextExtractor extends TextExtractor {
      */
     @Override
     public void setExtractionSettings(Lookup context) {
-        if(context != null) {
+        if (context != null) {
             ImageFileExtractionConfig configInstance = context.lookup(ImageFileExtractionConfig.class);
-            if(configInstance == null) {
+            if (configInstance == null) {
                 return;
             }
-            if(Objects.nonNull(configInstance.getOCREnabled())) {
+            if (Objects.nonNull(configInstance.getOCREnabled())) {
                 this.tesseractOCREnabled = configInstance.getOCREnabled();
             }
-        }    
+        }
     }
 
     /**
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/DefaultExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/DefaultExtractionConfig.java
index 865058659f..4e5a08a8ba 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/DefaultExtractionConfig.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/DefaultExtractionConfig.java
@@ -22,10 +22,17 @@ import java.util.List;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
 
 /**
- * Allows for configuration of the {@link TextExtractor} obtained from
+ * Allows for configuration of the
+ * {@link org.sleuthkit.autopsy.textextractors.TextExtractor} obtained from
  * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content, org.openide.util.Lookup)}.
  *
+ * The default extractor will read strings from the Content instance. This class
+ * allows for the configuration of the encoding language script to use during
+ * extraction.
+ *
  * @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
+ * @see
+ * org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT
  * @see org.openide.util.Lookup
  */
 public class DefaultExtractionConfig {
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
index db6550becc..944e2f39fb 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java
@@ -19,7 +19,7 @@
 package org.sleuthkit.autopsy.textextractors.extractionconfigs;
 
 /**
- * Allows for configuration of image file extraction.
+ * Allows for configuration of OCR on image files.
  * {@link org.sleuthkit.autopsy.textextractors.TextExtractor}'s that use
  * ImageFileExtractionConfig can be obtained through
  * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content)}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index d55fe0ae52..09b274cbd3 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -45,8 +45,8 @@ import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorEx
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
-import org.sleuthkit.autopsy.textextractors.InitReaderException;
 import org.sleuthkit.autopsy.textextractors.TextExtractor;
+import org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException;
 import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
@@ -486,7 +486,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                 Reader specializedReader = TextExtractorFactory.getExtractor(aFile,extractionContext).getReader();
                 //divide into chunks and index
                 return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
-            } catch (TextExtractorFactory.NoTextExtractorFound | InitReaderException ex) {
+            } catch (TextExtractorFactory.NoTextExtractorFound | ExtractionException ex) {
                 //No text extractor found... run the default instead
                 return false;
             }
@@ -514,7 +514,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                     putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                     return false;
                 }
-            } catch (IngesterException | InitReaderException ex) {
+            } catch (IngesterException | ExtractionException ex) {
                 logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex);  //NON-NLS
                 putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                 return false;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
index ea178cd203..9a26e97924 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
@@ -46,7 +46,7 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
 import org.sleuthkit.autopsy.progress.ProgressIndicator;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
-import org.sleuthkit.autopsy.textextractors.InitReaderException;
+import org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException;
 import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
 import org.sleuthkit.datamodel.BlackboardArtifact;
@@ -121,7 +121,7 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
                 String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
                 ingester.indexMetaDataOnly(artifact, sourceName);
                 ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
-            } catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | InitReaderException ex) {
+            } catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | ExtractionException ex) {
                 throw new TskCoreException(ex.getCause().getMessage(), ex);
             }
         } else {
@@ -129,11 +129,11 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
                 Reader contentReader = TextExtractorFactory
                         .getExtractor(content, null).getReader();
                 ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
-            } catch (TextExtractorFactory.NoTextExtractorFound | InitReaderException | Ingester.IngesterException ex) {
+            } catch (TextExtractorFactory.NoTextExtractorFound | ExtractionException | Ingester.IngesterException ex) {
                 try {
                     // Try the StringsTextExtractor if Tika extractions fails.
                     ingester.indexText(TextExtractorFactory.getDefaultExtractor(content, null).getReader(),content.getId(),content.getName(), content, null);
-                } catch (Ingester.IngesterException | InitReaderException ex1) {
+                } catch (Ingester.IngesterException | ExtractionException ex1) {
                     throw new TskCoreException(ex.getCause().getMessage(), ex1);
                 }
             }
@@ -451,7 +451,7 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
                     TextExtractorFactory.getExtractor((Content) artifact, null).getReader();
             ingester.indexMetaDataOnly(artifact, sourceName);
             ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
-        } catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | InitReaderException ex) {
+        } catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | ExtractionException ex) {
             throw new TskCoreException(ex.getCause().getMessage(), ex);
         }
     }

From 9d15b5f5719a2d75d744b4ea2d828f17957f6711 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Mon, 10 Dec 2018 16:37:55 -0500
Subject: [PATCH 18/18] Fixed language pack look up to happen once at class
 loading

---
 .../sleuthkit/autopsy/textextractors/TikaTextExtractor.java | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index 8ba64e4f0d..9b766a9e9e 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -120,6 +120,7 @@ final class TikaTextExtractor extends TextExtractor {
     private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
     private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
     private static final File TESSERACT_PATH = locateTesseractExecutable();
+    private static final String LANGUAGE_PACKS = (TESSERACT_PATH == null) ? "" : getLanguagePacks(); //TESSERACT_PATH is null when Tesseract was not located
 
     private static final List<String> TIKA_SUPPORTED_TYPES
             = new Tika().getParser().getSupportedTypes(new ParseContext())
@@ -177,8 +178,7 @@ final class TikaTextExtractor extends TextExtractor {
             ocrConfig.setTesseractPath(tesseractFolder);
             // Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata".
             // If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them
-            System.out.println(getLanguagePacks());
-            ocrConfig.setLanguage(getLanguagePacks());
+            ocrConfig.setLanguage(LANGUAGE_PACKS);
             parseContext.set(TesseractOCRConfig.class, ocrConfig);
         }
 
@@ -279,7 +279,7 @@ final class TikaTextExtractor extends TextExtractor {
      *
      * @return String of all language packs available for Tesseract to use
      */
-    private String getLanguagePacks() {
+    private static String getLanguagePacks() {
         File languagePackRootDir = new File(TESSERACT_PATH.getParent(), "tessdata");
         //Acceptable extensions for Tesseract-OCR version 3.05 language packs.
         //All extensions other than traineddata are associated with cube files that