Merge pull request #5125 from ethteck/better-encoding-detection

Added decodetect library for better encoding detection on text files
This commit is contained in:
Richard Cordovano 2019-12-06 18:42:56 -05:00 committed by GitHub
commit 754fd8c9d0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 191 additions and 85 deletions

View File

@ -43,6 +43,9 @@
<dependency conf="core->default" org="com.google.cloud" name="google-cloud-translate" rev="1.70.0"/>
<dependency conf="core->default" org="org.apache.opennlp" name="opennlp-tools" rev="1.9.1"/>
<dependency conf="core->default" org="com.ethteck.decodetect" name="decodetect-core" rev="0.3"/>
<dependency conf="core->default" org="com.beetstra.jutf7" name="jutf7" rev="1.0.0"/>
<dependency org="org.sejda.webp-imageio" name="webp-imageio-sejda" rev="0.1.0"/>
<dependency org="com.googlecode.libphonenumber" name="libphonenumber" rev="3.5" />
<dependency conf="core->default" org="commons-validator" name="commons-validator" rev="1.6"/>

View File

@ -18,6 +18,7 @@ file.reference.commons-lang3-3.8.1.jar=release\\modules\\ext\\commons-lang3-3.8.
file.reference.commons-pool2-2.4.2.jar=release/modules/ext/commons-pool2-2.4.2.jar
file.reference.cxf-rt-rs-client-3.3.0.jar=release\\modules\\ext\\cxf-rt-rs-client-3.3.0.jar
file.reference.dec-0.1.2.jar=release\\modules\\ext\\dec-0.1.2.jar
file.reference.decodetect-core-0.3.jar=release\\modules\\ext\\decodetect-core-0.3.jar
file.reference.fontbox-2.0.13.jar=release\\modules\\ext\\fontbox-2.0.13.jar
file.reference.geoapi-3.0.1.jar=release\\modules\\ext\\geoapi-3.0.1.jar
file.reference.grib-4.5.5.jar=release\\modules\\ext\\grib-4.5.5.jar
@ -50,6 +51,7 @@ file.reference.jsoup-1.11.3.jar=release\\modules\\ext\\jsoup-1.11.3.jar
file.reference.jul-to-slf4j-1.7.25.jar=release\\modules\\ext\\jul-to-slf4j-1.7.25.jar
file.reference.juniversalchardet-1.0.3.jar=release\\modules\\ext\\juniversalchardet-1.0.3.jar
file.reference.junrar-2.0.0.jar=release\\modules\\ext\\junrar-2.0.0.jar
file.reference.jutf7-1.0.0.jar=release\\modules\\ext\\jutf7-1.0.0.jar
file.reference.jxmapviewer2-2.4.jar=release/modules/ext/jxmapviewer2-2.4.jar
file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
file.reference.libphonenumber-3.5.jar=release/modules/ext/libphonenumber-3.5.jar

View File

@ -794,6 +794,14 @@
<runtime-relative-path>ext/vorbis-java-tika-0.8.jar</runtime-relative-path>
<binary-origin>release\modules\ext\vorbis-java-tika-0.8.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/decodetect-core-0.3.jar</runtime-relative-path>
<binary-origin>release/modules/ext/decodetect-core-0.3.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jutf7-1.0.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jutf7-1.0.0.jar</binary-origin>
</class-path-extension>
</data>
</configuration>
</project>

View File

@ -18,6 +18,7 @@
*/
package org.sleuthkit.autopsy.modules.filetypeid;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
@ -29,6 +30,7 @@ import org.apache.tika.Tika;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MimeTypes;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.TskCoreException;
@ -249,6 +251,17 @@ public class FileTypeDetector {
mimeType = tikaType.replace("tika-", ""); //NON-NLS
mimeType = removeOptionalParameter(mimeType);
}
} else {
/*
* If the file was marked as an octet stream and the extension is .txt, try to detect a text
* encoding with Decodetect.
*/
if (file.getNameExtension().equals("txt")) {
Charset detectedCharset = TextFileExtractor.getEncoding(file);
if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) {
mimeType = MimeTypes.PLAIN_TEXT;
}
}
}
/**

View File

@ -89,6 +89,7 @@ public class TextExtractorFactory {
*/
private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
List<TextExtractor> fileExtractors = Arrays.asList(
new TextFileExtractor(content),
new HtmlTextExtractor(content),
new SqliteTextExtractor(content),
new TikaTextExtractor(content));

View File

@ -0,0 +1,135 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2018-2019 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.textextractors;
import com.ethteck.decodetect.core.Decodetect;
import com.ethteck.decodetect.core.DecodetectResult;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.List;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ReadContentInputStream;
/**
 * Extracts text from text files, detecting the character encoding with Tika
 * first and falling back to Decodetect when Tika's confidence is too low.
 */
public final class TextFileExtractor implements TextExtractor {

    // Sentinel returned by getEncoding() when no encoding could be detected
    // with sufficient confidence. Declared final so the sentinel identity that
    // callers compare against cannot be reassigned.
    public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
        @Override
        public boolean contains(Charset cs) {
            return false;
        }

        @Override
        public CharsetDecoder newDecoder() {
            return null;
        }

        @Override
        public CharsetEncoder newEncoder() {
            return null;
        }
    };

    // Threshold for determining which encoding detection library to use. If
    // Tika's own confidence is at least MIN_TIKA_MATCH_CONFIDENCE, Tika's
    // result will be used for decoding. Otherwise, Decodetect will be used.
    private static final int MIN_TIKA_MATCH_CONFIDENCE = 35;

    // This value determines whether we will consider Decodetect's top-scoring
    // result a legitimate match or if we will disregard its findings.
    //
    // Possible values are 0 to 1, inclusive.
    private static final double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;

    // Maximum number of bytes sampled from the content for Decodetect.
    private static final int MAX_DECODETECT_SAMPLE_BYTES = 100000;

    private final AbstractFile file;

    /**
     * Constructs an extractor for the given text file.
     *
     * @param file the file to extract text from
     */
    public TextFileExtractor(AbstractFile file) {
        this.file = file;
    }

    /**
     * Returns a reader over the file's text, using the detected encoding or
     * falling back to UTF-8 when detection fails.
     *
     * @return a reader decoding the file's contents
     */
    @Override
    public Reader getReader() {
        Charset encoding = getEncoding(file);
        if (encoding.equals(UNKNOWN_CHARSET)) {
            encoding = StandardCharsets.UTF_8;
        }
        return getReader(encoding);
    }

    /**
     * Returns a reader over the file's text using the given encoding.
     *
     * @param encoding the charset to decode the file with
     *
     * @return a reader decoding the file's contents
     */
    public Reader getReader(Charset encoding) {
        return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
    }

    @Override
    public boolean isSupported() {
        return file.getMIMEType().equals("text/plain");
    }

    /**
     * Exception thrown when text file extraction fails.
     */
    public class TextFileExtractorException extends Exception {

        public TextFileExtractorException(String msg, Throwable ex) {
            super(msg, ex);
        }

        public TextFileExtractorException(String msg) {
            super(msg);
        }
    }

    /**
     * Attempts to detect the character encoding of the given content, trying
     * Tika first and falling back to Decodetect.
     *
     * @param content the content to examine
     *
     * @return the detected charset, or UNKNOWN_CHARSET if no encoding could be
     *         detected with sufficient confidence or the content could not be
     *         read
     */
    public static Charset getEncoding(Content content) {
        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) {
            // Tika first
            CharsetDetector detector = new CharsetDetector();
            detector.setText(stream);
            CharsetMatch tikaResult = detector.detect();
            if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) {
                try {
                    return Charset.forName(tikaResult.getName());
                } catch (UnsupportedCharsetException ignored) {
                    // The charset Tika named is not supported by this JVM;
                    // fall through to Decodetect.
                }
            }

            // Decodetect if Tika fails or falls below the confidence
            // threshold. Read up to MAX_DECODETECT_SAMPLE_BYTES in a loop,
            // because a single read() call may return fewer bytes than
            // requested (and available() does not report the remaining size).
            byte[] buffer = new byte[MAX_DECODETECT_SAMPLE_BYTES];
            int bytesRead = 0;
            while (bytesRead < buffer.length) {
                int count = stream.read(buffer, bytesRead, buffer.length - bytesRead);
                if (count < 0) {
                    break; // end of stream
                }
                bytesRead += count;
            }
            byte[] sample;
            if (bytesRead == buffer.length) {
                sample = buffer;
            } else {
                sample = new byte[bytesRead];
                System.arraycopy(buffer, 0, sample, 0, bytesRead);
            }
            List<DecodetectResult> results = Decodetect.DECODETECT.getResults(sample);
            if (!results.isEmpty()) {
                DecodetectResult topResult = results.get(0);
                if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
                    return topResult.getEncoding();
                }
            }
        } catch (IOException ignored) {
            // The content could not be read; report the encoding as unknown.
        }
        return UNKNOWN_CHARSET;
    }
}

View File

@ -28,6 +28,7 @@ import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.stream.Collectors;
import org.apache.tika.mime.MimeTypes;
import org.openide.util.Lookup;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
@ -44,12 +45,12 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
import org.sleuthkit.datamodel.AbstractFile;
@ -632,7 +633,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if (context.fileIngestIsCancelled()) {
return;
}
if (fileType.equals("application/octet-stream")) {
if (fileType.equals(MimeTypes.OCTET_STREAM)) {
extractStringsAndIndex(aFile);
return;
}
@ -657,20 +658,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
//Carved Files should be the only type of unallocated files capable of a txt extension and
//should be ignored by the TextFileExtractor because they may contain more than one text encoding
try {
TextFileExtractor textFileExtractor = new TextFileExtractor();
Reader textReader = textFileExtractor.getReader(aFile);
if (textReader == null) {
logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
} else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
wasTextAdded = true;
}
} catch (IngesterException ex) {
logger.log(Level.WARNING, "Unable to index as unicode", ex);
} catch (TextFileExtractorException ex) {
logger.log(Level.INFO, "Could not extract text with TextFileExtractor", ex);
}
wasTextAdded = indexTextFile(aFile);
}
// if it wasn't supported or had an error, default to strings
@ -678,5 +666,29 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
extractStringsAndIndex(aFile);
}
}
/**
 * Extracts text from the given text file and adds it to the keyword search
 * index.
 *
 * @param aFile Text file to analyze
 *
 * @return True if the text was successfully extracted and indexed, false
 *         otherwise.
 */
private boolean indexTextFile(AbstractFile aFile) {
    TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
    // try-with-resources guarantees the reader is closed on every path,
    // including when indexing fails or throws; a null resource is skipped
    // at close time per the language spec.
    try (Reader textReader = textFileExtractor.getReader()) {
        if (textReader == null) {
            logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
        } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
            putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
            return true;
        }
    } catch (IngesterException | IOException ex) {
        logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
    }
    return false;
}
}
}

View File

@ -1,68 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2018-2019 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedInputStream;
import java.io.Reader;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
/**
 * Extracts text from .txt files by detecting their character encoding.
 */
final class TextFileExtractor {

    // Minimum confidence required to accept a charset match, rejecting
    // matches that may not have a valid text encoding. In limited testing,
    // valid text encodings generally scored around 100, XML sometimes scored
    // around 50, and pictures or other files with a .txt extension scored 5
    // or less. This limited information was used to select a value that
    // filters out clearly non-text files while hopefully working on all
    // files with a valid text encoding.
    static final private int MIN_MATCH_CONFIDENCE = 20;

    /**
     * Returns a reader over the file's text using the best-matching charset.
     *
     * @param source the file to read
     *
     * @return a reader decoding the file with the detected charset
     *
     * @throws TextFileExtractorException if detection fails or no match meets
     *                                    the confidence threshold
     */
    public Reader getReader(AbstractFile source) throws TextFileExtractorException {
        // BufferedInputStream supplies the mark/reset support that
        // CharsetDetector requires.
        InputStream bufferedStream = new BufferedInputStream(new ReadContentInputStream(source));
        CharsetDetector charsetDetector = new CharsetDetector();
        try {
            charsetDetector.setText(bufferedStream);
        } catch (IOException ex) {
            throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
        }
        CharsetMatch bestMatch = charsetDetector.detect();
        if (bestMatch == null) {
            throw new TextFileExtractorException("Unable to detect any matches using TextFileExtractor");
        }
        if (bestMatch.getConfidence() < MIN_MATCH_CONFIDENCE) {
            throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
        }
        return bestMatch.getReader();
    }

    /**
     * Exception thrown when text file extraction fails.
     */
    public class TextFileExtractorException extends Exception {

        public TextFileExtractorException(String msg, Throwable ex) {
            super(msg, ex);
        }

        public TextFileExtractorException(String msg) {
            super(msg);
        }
    }
}