Merge pull request #5536 from sleuthkit/release-4.14.0

Merge release-4.14.0 branch onto develop branch
Richard Cordovano 2019-12-23 12:19:07 -05:00 committed by GitHub
commit 335b0791cd
3 changed files with 75 additions and 29 deletions


@@ -254,10 +254,10 @@ public class FileTypeDetector {
         } else {
             /*
              * If the file was marked as an octet stream and the extension is .txt, try to detect a text
-             * encoding with Decodetect.
+             * encoding
              */
             if (file.getNameExtension().equals("txt")) {
-                Charset detectedCharset = TextFileExtractor.getEncoding(file);
+                Charset detectedCharset = new TextFileExtractor(file).getEncoding();
                 if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) {
                     mimeType = MimeTypes.PLAIN_TEXT;
                 }
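The FileTypeDetector change above uses encoding detection to decide whether a file typed as an octet stream but named *.txt is really plain text. Below is a minimal standalone sketch of that idea using Tika's CharsetDetector directly; the refineMimeType helper, the hard-coded MIME strings, and the threshold of 41 (borrowed from the constant this commit introduces in TextFileExtractor) are illustrative, not Autopsy code.

    import java.nio.file.Files;
    import java.nio.file.Paths;
    import org.apache.tika.parser.txt.CharsetDetector;
    import org.apache.tika.parser.txt.CharsetMatch;

    public class OctetStreamRefiner {

        // Hypothetical helper: upgrade an octet-stream ".txt" file to text/plain
        // when a charset can be detected with reasonable confidence.
        static String refineMimeType(byte[] data, String mimeType, String extension) {
            if (!"application/octet-stream".equals(mimeType) || !"txt".equals(extension)) {
                return mimeType; // only second-guess the ambiguous case
            }
            CharsetDetector detector = new CharsetDetector();
            detector.setText(data);
            CharsetMatch match = detector.detect(); // best guess, may be null
            if (match != null && match.getConfidence() >= 41) {
                return "text/plain";
            }
            return mimeType;
        }

        public static void main(String[] args) throws Exception {
            byte[] data = Files.readAllBytes(Paths.get(args[0]));
            System.out.println(refineMimeType(data, "application/octet-stream", "txt"));
        }
    }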


@@ -85,14 +85,14 @@ public class TextExtractorFactory {
      * @param content AbstractFile content
      * @param context Lookup containing extractor configurations
      *
-     * @return
+     * @return List of all extractors in priority order. Not all will support the passed in content. @@@ PERHAPS ONLY SUPPORTED SHOULD BE RETURNED
      */
     private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
         List<TextExtractor> fileExtractors = Arrays.asList(
                 new TextFileExtractor(content),
                 new HtmlTextExtractor(content),
                 new SqliteTextExtractor(content),
-                new TikaTextExtractor(content));
+                new TikaTextExtractor(content)); /// This should go last to ensure the more specific ones are picked first.
         fileExtractors.forEach((fileExtractor) -> {
             fileExtractor.setExtractionSettings(context);
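The "/// This should go last" note above matters because callers walk this list in order and use the first extractor that reports it supports the content, so the specific extractors must come before the catch-all Tika extractor. A minimal sketch of that first-match selection pattern; the one-method TextExtractor interface here is a hypothetical stand-in for Autopsy's real interface, which also carries settings and a reader.

    import java.util.Arrays;
    import java.util.List;

    public class ExtractorSelection {

        // Hypothetical stand-in for Autopsy's TextExtractor interface.
        interface TextExtractor {
            boolean isSupported();
        }

        // First-match selection: list order encodes priority, so a catch-all
        // extractor placed last only wins when nothing specific matched.
        static TextExtractor pickExtractor(List<TextExtractor> prioritized) {
            return prioritized.stream()
                    .filter(TextExtractor::isSupported)
                    .findFirst()
                    .orElseThrow(() -> new IllegalStateException("no supported extractor"));
        }

        public static void main(String[] args) {
            TextExtractor specific = () -> false; // e.g. the SQLite extractor declining a text file
            TextExtractor catchAll = () -> true;  // e.g. Tika, which accepts almost anything
            System.out.println(pickExtractor(Arrays.asList(specific, catchAll)) == catchAll); // true
        }
    }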


@@ -31,17 +31,24 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.StandardCharsets;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.List;
+import java.util.logging.Level;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
+import org.sleuthkit.datamodel.TskCoreException;
 
 /**
- * Extract text from text files
+ * A TextExtractor that is used to extract text from a text file.
  */
 public final class TextFileExtractor implements TextExtractor {
 
-    public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
+    /*
+     * The char set returned if a text file extractor fails to detect the
+     * encoding of the file from which it is extracting text.
+     */
+    public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
         @Override
         public boolean contains(Charset cs) {
             return false;
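This hunk cuts the anonymous class off after contains(); java.nio.charset.Charset has exactly three abstract methods, so a complete sentinel looks like the sketch below. The newDecoder() and newEncoder() bodies are assumptions (they fall outside the diff); throwing from them is safe because callers are expected to compare against the sentinel before decoding anything.

    import java.nio.charset.Charset;
    import java.nio.charset.CharsetDecoder;
    import java.nio.charset.CharsetEncoder;

    public class SentinelCharset {

        // Sentinel charset: a singleton that can be compared by reference and
        // never participates in real encoding or decoding.
        public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
            @Override
            public boolean contains(Charset cs) {
                return false;
            }

            @Override
            public CharsetDecoder newDecoder() {
                throw new UnsupportedOperationException("unknown charset cannot decode");
            }

            @Override
            public CharsetEncoder newEncoder() {
                throw new UnsupportedOperationException("unknown charset cannot encode");
            }
        };

        public static void main(String[] args) {
            Charset detected = UNKNOWN_CHARSET; // pretend detection failed
            System.out.println(detected == UNKNOWN_CHARSET); // true; reference comparison suffices
        }
    }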
@@ -59,33 +66,45 @@ public final class TextFileExtractor implements TextExtractor {
     };
 
     // This value will be used as a threshold for determining which encoding
-    // detection library to use. If Tika's own confidence is at least
-    // MIN_MATCH_CONFIDENCE, Tika's result will be used for decoding.
+    // detection library to use. If CharsetDetector's own confidence is at least
+    // MIN_CHARSETDETECT_MATCH_CONFIDENCE, CharsetDetector's result will be used
+    // for decoding.
     // Otherwise, Decodetect will be used.
-    static final private int MIN_TIKA_MATCH_CONFIDENCE = 35;
+    //
+    // Note: We initially used a confidence of 35, but it was causing some
+    // Chrome Cache files to get flagged as UTF-16 with confidence 40.
+    // These files had a small amount of binary data and then ASCII.
+    static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
 
     // This value determines whether we will consider Decodetect's top-scoring
-    // result a legitimate match or if we will disregard its findings
+    // result a legitimate match or if we will disregard its findings.
     //
-    // Possible values are 0 to 1, inclusive
+    // Possible values are 0 to 1, inclusive.
     static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;
 
+    private static final Logger logger = Logger.getLogger(TextFileExtractor.class.getName());
     private final AbstractFile file;
+    private Charset encoding = null;
 
+    /**
+     * Constructs a TextExtractor that is used to extract text from a text
+     * file.
+     *
+     * @param file The file.
+     */
     public TextFileExtractor(AbstractFile file) {
         this.file = file;
     }
 
     @Override
     public Reader getReader() {
-        Charset encoding = getEncoding(file);
-        if (encoding.equals(UNKNOWN_CHARSET)) {
-            encoding = StandardCharsets.UTF_8;
+        Charset enc = getEncoding();
+        if (enc.equals(UNKNOWN_CHARSET)) {
+            enc = StandardCharsets.UTF_8;
         }
-        return getReader(encoding);
+        return getReader(enc);
     }
 
-    public Reader getReader(Charset encoding) {
+    private Reader getReader(Charset encoding) {
         return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
     }
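The Note in this hunk explains the jump from 35 to 41: buffers holding a little binary data followed by ASCII were matching UTF-16 at confidence 40. CharsetDetector's detectAll() makes that kind of tuning visible by listing every candidate with its confidence. A small diagnostic sketch; the sample bytes imitating the Chrome Cache case are made up.

    import java.io.ByteArrayOutputStream;
    import java.nio.charset.StandardCharsets;
    import org.apache.tika.parser.txt.CharsetDetector;
    import org.apache.tika.parser.txt.CharsetMatch;

    public class ConfidenceProbe {
        public static void main(String[] args) throws Exception {
            // A little binary data, then plain ASCII, as described in the Note.
            ByteArrayOutputStream buf = new ByteArrayOutputStream();
            buf.write(new byte[]{0x00, 0x01, (byte) 0xFF, (byte) 0xFE});
            buf.write("GET /index.html HTTP/1.1 plain ASCII payload".getBytes(StandardCharsets.US_ASCII));

            CharsetDetector detector = new CharsetDetector();
            detector.setText(buf.toByteArray());
            // detectAll() returns candidates in descending confidence order
            // (confidence is an int from 0 to 100).
            for (CharsetMatch match : detector.detectAll()) {
                System.out.printf("%-12s confidence=%d%n", match.getName(), match.getConfidence());
            }
        }
    }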
@@ -94,33 +113,60 @@ public final class TextFileExtractor implements TextExtractor {
         return file.getMIMEType().equals("text/plain");
     }
 
-    public static Charset getEncoding(Content content) {
-        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) {
-            // Tika first
+    /**
+     * Returns the encoding of the file.
+     *
+     * @return Detected encoding or UNKNOWN_CHARSET.
+     */
+    public Charset getEncoding() {
+        if (encoding != null) {
+            return encoding;
+        }
+
+        // Encoding detection is hard. We use several libraries since the data passed in is often messy.
+        // First try CharsetDetector (from Tika / ICU4J).
+        // It is a rule-based detection approach.
+        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
             CharsetDetector detector = new CharsetDetector();
             detector.setText(stream);
             CharsetMatch tikaResult = detector.detect();
-            if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) {
+            if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
                 try {
-                    return Charset.forName(tikaResult.getName());
-                } catch (UnsupportedCharsetException ignored) {
+                    encoding = Charset.forName(tikaResult.getName());
+                    return encoding;
+                } catch (UnsupportedCharsetException ex) {
+                    logger.log(Level.WARNING, String.format("Error converting CharsetDetector result for %s (objID=%d)", file.getName(), file.getId()), ex);
                 }
             }
+        } catch (IOException ex) {
+            logger.log(Level.WARNING, String.format("Error setting CharsetDetector stream for %s (objID=%d)", file.getName(), file.getId()), ex);
+        }
 
-            // Decodetect if Tika fails or falls below confidence threshold
-            try {
-                int maxBytes = 100000;
-                int numBytes = Math.min(stream.available(), maxBytes);
-                byte[] targetArray = new byte[numBytes];
-                stream.read(targetArray);
-                List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
-                if (!results.isEmpty()) {
-                    DecodetectResult topResult = results.get(0);
-                    if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
-                        return topResult.getEncoding();
-                    }
+        // If that did not work, then use Decodetect, which is statistical.
+        // We needed this for some Japanese text files that were incorrectly detected by CharsetDetector (with low confidence).
+        // This will not always work with messy data that combines some binary and some ASCII.
+        try {
+            int maxBytes = 100000;
+            int numBytes = maxBytes;
+            if (file.getSize() < maxBytes) {
+                numBytes = (int) file.getSize();
+            }
+
+            byte[] targetArray = new byte[numBytes];
+            file.read(targetArray, 0, numBytes);
+            List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
+            if (!results.isEmpty()) {
+                DecodetectResult topResult = results.get(0);
+                if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
+                    encoding = topResult.getEncoding();
+                    return encoding;
                 }
-            } catch (IOException ignored) {
             }
+        } catch (TskCoreException ex) {
+            logger.log(Level.WARNING, String.format("Error reading content from %s (objID=%d)", file.getName(), file.getId()), ex);
         }
-        return UNKNOWN_CHARSET;
+
+        encoding = UNKNOWN_CHARSET;
+        return encoding;
     }
 }
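Taken together, getEncoding() is a two-stage strategy: rule-based CharsetDetector first, statistical Decodetect as the fallback, sentinel on failure. A condensed sketch of the same strategy over a plain byte array; the com.ethteck.decodetect import path is an assumption based on the library Autopsy bundles, the thresholds are copied from the constants above, and null stands in for the UNKNOWN_CHARSET sentinel.

    import java.nio.charset.Charset;
    import java.nio.charset.UnsupportedCharsetException;
    import java.util.List;
    import com.ethteck.decodetect.core.Decodetect;
    import com.ethteck.decodetect.core.DecodetectResult;
    import org.apache.tika.parser.txt.CharsetDetector;
    import org.apache.tika.parser.txt.CharsetMatch;

    public class TwoStageDetector {

        static final int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
        static final double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;

        // Returns the detected charset, or null when neither detector is
        // confident enough (Autopsy returns a sentinel Charset instead).
        static Charset detect(byte[] data) {
            // Stage 1: rule-based detection.
            CharsetDetector detector = new CharsetDetector();
            detector.setText(data);
            CharsetMatch match = detector.detect();
            if (match != null && match.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
                try {
                    return Charset.forName(match.getName());
                } catch (UnsupportedCharsetException ex) {
                    // Fall through to the statistical detector.
                }
            }
            // Stage 2: statistical detection.
            List<DecodetectResult> results = Decodetect.DECODETECT.getResults(data);
            if (!results.isEmpty() && results.get(0).getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
                return results.get(0).getEncoding();
            }
            return null;
        }
    }

The caching done through the encoding field in the diff is orthogonal to the detection logic and is omitted here.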