From 684aeaffa4b1bea21b1c8bbbb182922639c44961 Mon Sep 17 00:00:00 2001
From: Brian Carrier <carrier@sleuthkit.org>
Date: Tue, 17 Dec 2019 17:03:40 -0500
Subject: [PATCH 1/4] Added comment

---
 .../autopsy/textextractors/TextExtractorFactory.java          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index ff0ba51dd1..9bc5af74bd 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -85,14 +85,14 @@ public class TextExtractorFactory {
      * @param content AbstractFile content
      * @param context Lookup containing extractor configurations
      *
-     * @return
+     * @return List of all extractors in priority order. Not all will support the passed-in content. TODO: Consider returning only the extractors that support the content.
      */
     private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
         List<TextExtractor> fileExtractors = Arrays.asList(
                 new TextFileExtractor(content),
                 new HtmlTextExtractor(content),
                 new SqliteTextExtractor(content),
-                new TikaTextExtractor(content));
+                new TikaTextExtractor(content));   // Tika should go last to ensure the more specific extractors are picked first.
 
         fileExtractors.forEach((fileExtractor) -> {
             fileExtractor.setExtractionSettings(context);

From 69e8783869e41f90abd9cfe74a9a29feb5b93494 Mon Sep 17 00:00:00 2001
From: Brian Carrier <carrier@sleuthkit.org>
Date: Thu, 19 Dec 2019 17:15:21 -0500
Subject: [PATCH 2/4] change encoding threshold to better deal with
 binary/ASCII cache files - JIRA 5894.  Refactor

---
 .../modules/filetypeid/FileTypeDetector.java  |  4 +-
 .../textextractors/TextFileExtractor.java     | 74 ++++++++++++++-----
 2 files changed, 56 insertions(+), 22 deletions(-)

diff --git a/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java b/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java
index 46b60d9b1e..477788df21 100644
--- a/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java
@@ -254,10 +254,10 @@ public class FileTypeDetector {
                 } else {
                     /*
                      * If the file was marked as an octet stream and the extension is .txt, try to detect a text
-                     * encoding with Decodetect.
+                     * encoding.
                      */
                     if (file.getNameExtension().equals("txt")) {
-                        Charset detectedCharset = TextFileExtractor.getEncoding(file);
+                        Charset detectedCharset = new TextFileExtractor(file).getEncoding();
                         if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) {
                             mimeType = MimeTypes.PLAIN_TEXT;
                         }
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
index 3efb6b1aed..1656112c68 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
@@ -31,11 +31,14 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.StandardCharsets;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.List;
+import java.util.logging.Level;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
+import org.sleuthkit.datamodel.TskCoreException;
 
 /**
  * Extract text from text files
@@ -59,10 +62,12 @@ public final class TextFileExtractor implements TextExtractor {
     };
 
     // This value will be used as a threshold for determining which encoding
-    // detection library to use. If Tika's own confidence is at least
-    // MIN_MATCH_CONFIDENCE, Tika's result will be used for decoding.
+    // detection library to use. If CharsetDetector's own confidence is at least
+    // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding.
     // Otherwise, Decodetect will be used.
-    static final private int MIN_TIKA_MATCH_CONFIDENCE = 35;
+    // - We had 35, but it was causing some Chrome Cache files to get flagged as UTF-16 with confidence 40. 
+    //    They had a small amount of binary data and then ASCII. 
+    static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
 
     // This value determines whether we will consider Decodetect's top-scoring
     // result a legitimate match or if we will disregard its findings
@@ -70,7 +75,10 @@ public final class TextFileExtractor implements TextExtractor {
     // Possible values are 0 to 1, inclusive
     static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;
 
+    private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
     private final AbstractFile file;
+    
+    private Charset encoding = null;
 
     public TextFileExtractor(AbstractFile file) {
         this.file = file;
@@ -78,14 +86,14 @@ public final class TextFileExtractor implements TextExtractor {
 
     @Override
     public Reader getReader() {
-        Charset encoding = getEncoding(file);
-        if (encoding.equals(UNKNOWN_CHARSET)) {
-            encoding = StandardCharsets.UTF_8;
+        Charset enc = getEncoding();
+        if (enc.equals(UNKNOWN_CHARSET)) {
+            enc = StandardCharsets.UTF_8;
         }
-        return getReader(encoding);
+        return getReader(enc);
     }
 
-    public Reader getReader(Charset encoding) {
+    private Reader getReader(Charset encoding) {
         return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
     }
 
@@ -103,33 +111,59 @@ public final class TextFileExtractor implements TextExtractor {
         }
     }
 
-    public static Charset getEncoding(Content content) {
-        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) {
-            // Tika first
+    /**
+     * Return the encoding of the file
+     * @return Detected encoding or UNKNOWN_CHARSET 
+     */
+    public Charset getEncoding() {
+        if (encoding != null)
+            return encoding;
+        
+        // Encoding detection is hard. We use several libraries since the data passed in is often messy.
+        
+        // First try CharsetDetector (from Tika / ICU4J)
+        // It is a rule-baesd detection approach
+        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
             CharsetDetector detector = new CharsetDetector();
             detector.setText(stream);
             CharsetMatch tikaResult = detector.detect();
-            if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) {
+            if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
                 try {
-                    return Charset.forName(tikaResult.getName());
-                } catch (UnsupportedCharsetException ignored) {
+                    encoding = Charset.forName(tikaResult.getName());
+                    return encoding;
+                } catch (UnsupportedCharsetException ex) {
+                    logger.log(Level.WARNING, "Error converting CharsetDetector Result", ex);
                 }
             }
+        } catch (IOException ignored) {
+            // IGNORE READ ERRORS HERE - Assume they were logged elsewhere
+        }
 
-            // Decodetect if Tika fails or falls below confidence threshold
+        // If that did not work, then use DecoDetect, which is stastical 
+        // We needed this for some Japanese text files that were incorrectly detected by CharsetDetector (with low confidence)
+        // This will not always work with messy data that combines some binary and some ASCII.
+        try {
             int maxBytes = 100000;
-            int numBytes = Math.min(stream.available(), maxBytes);
+            int numBytes = maxBytes;
+            if (file.getSize() < maxBytes) {
+                numBytes = (int) file.getSize();
+            }
+            
             byte[] targetArray = new byte[numBytes];
-            stream.read(targetArray);
+            file.read(targetArray, 0, numBytes);
             List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
             if (!results.isEmpty()) {
                 DecodetectResult topResult = results.get(0);
                 if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
-                    return topResult.getEncoding();
+                    encoding = topResult.getEncoding();
+                    return encoding;
                 }
             }
-        } catch (IOException ignored) {
+        } catch (TskCoreException ex) {
+            // IGNORE READ ERRORS HERE - Assume they were logged elsewhere
         }
-        return UNKNOWN_CHARSET;
+        
+        encoding = UNKNOWN_CHARSET;
+        return encoding;
     }
 }

From f9445ff7d871eee4f5593f0587e05a09df400db6 Mon Sep 17 00:00:00 2001
From: Richard Cordovano <rcordovano@basistech.com>
Date: Mon, 23 Dec 2019 12:08:18 -0500
Subject: [PATCH 3/4] Clean up in TextFileExtractor

---
 .../textextractors/TextFileExtractor.java     | 53 +++++++++++--------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
index 9ad5098524..aa99672b1d 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
@@ -36,15 +36,19 @@ import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 import org.sleuthkit.datamodel.TskCoreException;
 
 /**
- * Extract text from text files
+ * A TextExtractor that is used to extract text from a text file.
  */
 public final class TextFileExtractor implements TextExtractor {
-    public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
+
+    /**
+     * The charset returned if a text file extractor fails to detect the
+     * encoding of the file from which it is extracting text.
+     */
+    public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
         @Override
         public boolean contains(Charset cs) {
             return false;
@@ -65,21 +69,27 @@ public final class TextFileExtractor implements TextExtractor {
     // detection library to use. If CharsetDetector's own confidence is at least
     // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding.
     // Otherwise, Decodetect will be used.
-    // - We had 35, but it was causing some Chrome Cache files to get flagged as UTF-16 with confidence 40. 
-    //    They had a small amount of binary data and then ASCII. 
+    // NOte: We initially used a confidence of 35, but it was causing some 
+    // Chrome Cache files to get flagged as UTF-16 with confidence 40. 
+    // These files had a small amount of binary data and then ASCII. 
     static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
 
     // This value determines whether we will consider Decodetect's top-scoring
-    // result a legitimate match or if we will disregard its findings
+    // result a legitimate match or if we will disregard its findings.
     //
-    // Possible values are 0 to 1, inclusive
+    // Possible values are 0 to 1, inclusive.
     static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;
 
     private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
     private final AbstractFile file;
-    
+
     private Charset encoding = null;
 
+    /**
+     * Constructs a TextExtractor that is used to extract text from a text file.
+     *
+     * @param file The file.
+     */
     public TextFileExtractor(AbstractFile file) {
         this.file = file;
     }
@@ -103,17 +113,18 @@ public final class TextFileExtractor implements TextExtractor {
     }
 
     /**
-     * Return the encoding of the file
-     * @return Detected encoding or UNKNOWN_CHARSET 
+     * Returns the encoding of the file.
+     *
+     * @return Detected encoding or UNKNOWN_CHARSET.
      */
     public Charset getEncoding() {
-        if (encoding != null)
+        if (encoding != null) {
             return encoding;
-        
+        }
+
         // Encoding detection is hard. We use several libraries since the data passed in is often messy.
-        
-        // First try CharsetDetector (from Tika / ICU4J)
-        // It is a rule-baesd detection approach
+        // First try CharsetDetector (from Tika / ICU4J).
+        // It is a rule-baesd detection approach.
         try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
             CharsetDetector detector = new CharsetDetector();
             detector.setText(stream);
@@ -123,11 +134,11 @@ public final class TextFileExtractor implements TextExtractor {
                     encoding = Charset.forName(tikaResult.getName());
                     return encoding;
                 } catch (UnsupportedCharsetException ex) {
-                    logger.log(Level.WARNING, "Error converting CharsetDetector Result", ex);
+                    logger.log(Level.WARNING, String.format("Error converting CharsetDetector result for %s (objID=%d)", file.getName(), file.getId()), ex);
                 }
             }
-        } catch (IOException ignored) {
-            // IGNORE READ ERRORS HERE - Assume they were logged elsewhere
+        } catch (IOException ex) {
+            logger.log(Level.WARNING, String.format("Error setting CharsetDetector stream for %s (objID=%d)", file.getName(), file.getId()), ex);
         }
 
         // If that did not work, then use DecoDetect, which is stastical 
@@ -139,7 +150,7 @@ public final class TextFileExtractor implements TextExtractor {
             if (file.getSize() < maxBytes) {
                 numBytes = (int) file.getSize();
             }
-            
+
             byte[] targetArray = new byte[numBytes];
             file.read(targetArray, 0, numBytes);
             List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
@@ -151,9 +162,9 @@ public final class TextFileExtractor implements TextExtractor {
                 }
             }
         } catch (TskCoreException ex) {
-            // IGNORE READ ERRORS HERE - Assume they were logged elsewhere
+            logger.log(Level.WARNING, String.format("Error reading content from %s (objID=%d)", file.getName(), file.getId()), ex);
         }
-        
+
         encoding = UNKNOWN_CHARSET;
         return encoding;
     }

From ad7e0ceaff0d6c796876dfb2ce314a7d5f37a3fa Mon Sep 17 00:00:00 2001
From: Richard Cordovano <rcordovano@basistech.com>
Date: Mon, 23 Dec 2019 12:15:45 -0500
Subject: [PATCH 4/4] Clean up in TextFileExtractor

---
 .../sleuthkit/autopsy/textextractors/TextFileExtractor.java  | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
index aa99672b1d..e6c52fb19c 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
@@ -69,7 +69,8 @@ public final class TextFileExtractor implements TextExtractor {
     // detection library to use. If CharsetDetector's own confidence is at least
     // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding.
     // Otherwise, Decodetect will be used.
-    // NOte: We initially used a confidence of 35, but it was causing some 
+    // 
+    // Note: We initially used a confidence of 35, but it was causing some 
     // Chrome Cache files to get flagged as UTF-16 with confidence 40. 
     // These files had a small amount of binary data and then ASCII. 
     static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
@@ -124,7 +125,7 @@ public final class TextFileExtractor implements TextExtractor {
 
         // Encoding detection is hard. We use several libraries since the data passed in is often messy.
         // First try CharsetDetector (from Tika / ICU4J).
-        // It is a rule-baesd detection approach.
+        // It is a rule-based detection approach.
         try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
             CharsetDetector detector = new CharsetDetector();
             detector.setText(stream);