Merge pull request #5125 from ethteck/better-encoding-detection

Added decodetect library for better encoding detection on text files
2025-07-17 18:17:43 +00:00 · 2019-12-06 18:42:56 -05:00 · 2019-12-06 18:42:56 -05:00 · 754fd8c9d0
commit 754fd8c9d0
parent 96b8e29a02 3d84c47d91
8 changed files with 191 additions and 85 deletions
--- a/Core/ivy.xml
+++ b/Core/ivy.xml
@ -42,6 +42,9 @@
        <dependency conf="core->default" org="com.google.cloud" name="google-cloud-translate" rev="1.70.0"/>
        <dependency conf="core->default" org="org.apache.opennlp" name="opennlp-tools" rev="1.9.1"/>
        <dependency conf="core->default" org="com.ethteck.decodetect" name="decodetect-core" rev="0.3"/>
        <dependency conf="core->default" org="com.beetstra.jutf7" name="jutf7" rev="1.0.0"/>
        <dependency org="org.sejda.webp-imageio" name="webp-imageio-sejda" rev="0.1.0"/>
        <dependency org="com.googlecode.libphonenumber" name="libphonenumber" rev="3.5" />
@ -54,6 +57,6 @@
        <!-- https://mvnrepository.com/artifact/javax.ws.rs/javax.ws.rs-api -->
        <dependency org="javax.ws.rs" name="javax.ws.rs-api" rev="2.0"/>
-        <override org="jakarta.ws.rs" module="jakarta.ws.rs-api" rev="2.1.5"/>     
+        <override org="jakarta.ws.rs" module="jakarta.ws.rs-api" rev="2.1.5"/>
    </dependencies>
 </ivy-module>
--- a/Core/nbproject/project.properties
+++ b/Core/nbproject/project.properties
@ -18,6 +18,7 @@ file.reference.commons-lang3-3.8.1.jar=release\\modules\\ext\\commons-lang3-3.8.
 file.reference.commons-pool2-2.4.2.jar=release/modules/ext/commons-pool2-2.4.2.jar
 file.reference.cxf-rt-rs-client-3.3.0.jar=release\\modules\\ext\\cxf-rt-rs-client-3.3.0.jar
 file.reference.dec-0.1.2.jar=release\\modules\\ext\\dec-0.1.2.jar
 file.reference.decodetect-core-0.3.jar=release\\modules\\ext\\decodetect-core-0.3.jar
 file.reference.fontbox-2.0.13.jar=release\\modules\\ext\\fontbox-2.0.13.jar
 file.reference.geoapi-3.0.1.jar=release\\modules\\ext\\geoapi-3.0.1.jar
 file.reference.grib-4.5.5.jar=release\\modules\\ext\\grib-4.5.5.jar
@ -50,6 +51,7 @@ file.reference.jsoup-1.11.3.jar=release\\modules\\ext\\jsoup-1.11.3.jar
 file.reference.jul-to-slf4j-1.7.25.jar=release\\modules\\ext\\jul-to-slf4j-1.7.25.jar
 file.reference.juniversalchardet-1.0.3.jar=release\\modules\\ext\\juniversalchardet-1.0.3.jar
 file.reference.junrar-2.0.0.jar=release\\modules\\ext\\junrar-2.0.0.jar
 file.reference.jutf7-1.0.0.jar=release\\modules\\ext\\jutf7-1.0.0.jar
 file.reference.jxmapviewer2-2.4.jar=release/modules/ext/jxmapviewer2-2.4.jar
 file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
 file.reference.libphonenumber-3.5.jar=release/modules/ext/libphonenumber-3.5.jar
--- a/Core/nbproject/project.xml
+++ b/Core/nbproject/project.xml
@ -794,6 +794,14 @@
                <runtime-relative-path>ext/vorbis-java-tika-0.8.jar</runtime-relative-path>
                <binary-origin>release\modules\ext\vorbis-java-tika-0.8.jar</binary-origin>
            </class-path-extension>
            <class-path-extension>
                <runtime-relative-path>ext/decodetect-core-0.3.jar</runtime-relative-path>
                <binary-origin>release/modules/ext/decodetect-core-0.3.jar</binary-origin>
            </class-path-extension>
            <class-path-extension>
                <runtime-relative-path>ext/jutf7-1.0.0.jar</runtime-relative-path>
                <binary-origin>release/modules/ext/jutf7-1.0.0.jar</binary-origin>
            </class-path-extension>
        </data>
    </configuration>
 </project>
--- a/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java
@ -18,6 +18,7 @@
 */
 package org.sleuthkit.autopsy.modules.filetypeid;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@ -29,6 +30,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.mime.MimeTypes;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 import org.sleuthkit.datamodel.TskCoreException;
@ -249,6 +251,17 @@ public class FileTypeDetector {
                        mimeType = tikaType.replace("tika-", ""); //NON-NLS
                        mimeType = removeOptionalParameter(mimeType);
                    }
                } else {
                    /*
                     * If the file was marked as an octet stream and the extension is .txt, try to detect a text
                     * encoding with Decodetect.
                     */
                    if (file.getNameExtension().equals("txt")) {
                        Charset detectedCharset = TextFileExtractor.getEncoding(file);
                        if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) {
                            mimeType = MimeTypes.PLAIN_TEXT;
                        }
                    }
                }
                /**
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@ -89,6 +89,7 @@ public class TextExtractorFactory {
     */
    private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
        List<TextExtractor> fileExtractors = Arrays.asList(
                new TextFileExtractor(content),
                new HtmlTextExtractor(content),
                new SqliteTextExtractor(content),
                new TikaTextExtractor(content));
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
@ -0,0 +1,135 @@
 /*
 * Autopsy Forensic Browser
 *
 * Copyright 2018-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.sleuthkit.autopsy.textextractors;
 import com.ethteck.decodetect.core.Decodetect;
 import com.ethteck.decodetect.core.DecodetectResult;
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.StandardCharsets;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.List;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 /**
 * Extracts text from plain-text files. The character encoding is detected with
 * Tika first; when Tika's result is missing, unsupported, or below its
 * confidence threshold, Decodetect is consulted instead.
 */
 public final class TextFileExtractor implements TextExtractor {
    /**
     * Sentinel returned by {@link #getEncoding(Content)} when no encoding could
     * be detected with sufficient confidence. It cannot encode or decode
     * anything (its newDecoder()/newEncoder() return null), so callers must
     * compare against it and substitute a real charset before reading.
     */
    public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
        @Override
        public boolean contains(Charset cs) {
            return false;
        }
        @Override
        public CharsetDecoder newDecoder() {
            return null;
        }
        @Override
        public CharsetEncoder newEncoder() {
            return null;
        }
    };
    // This value will be used as a threshold for determining which encoding
    // detection library to use. If Tika's own confidence is at least
    // MIN_TIKA_MATCH_CONFIDENCE, Tika's result will be used for decoding.
    // Otherwise, Decodetect will be used.
    static final private int MIN_TIKA_MATCH_CONFIDENCE = 35;
    // This value determines whether we will consider Decodetect's top-scoring
    // result a legitimate match or if we will disregard its findings
    //
    // Possible values are 0 to 1, inclusive
    static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;
    // Upper bound on the number of bytes handed to Decodetect for analysis.
    static final private int MAX_DECODETECT_BYTES = 100000;
    private final AbstractFile file;
    /**
     * Creates an extractor for the given file.
     *
     * @param file The file whose text will be read.
     */
    public TextFileExtractor(AbstractFile file) {
        this.file = file;
    }
    /**
     * Gets a reader over the file's content, decoded with the detected
     * encoding, or with UTF-8 when no encoding could be detected.
     *
     * @return A reader over the decoded file content.
     */
    @Override
    public Reader getReader() {
        Charset encoding = getEncoding(file);
        if (encoding.equals(UNKNOWN_CHARSET)) {
            encoding = StandardCharsets.UTF_8;
        }
        return getReader(encoding);
    }
    /**
     * Gets a reader over the file's content, decoded with the given encoding.
     *
     * @param encoding The charset used to decode the file content.
     *
     * @return A reader over the decoded file content.
     */
    public Reader getReader(Charset encoding) {
        return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
    }
    /**
     * Reports whether this extractor handles the file, i.e. whether the file's
     * MIME type is text/plain.
     *
     * @return True if the file's MIME type is "text/plain", false otherwise
     *         (including when the file has no MIME type yet).
     */
    @Override
    public boolean isSupported() {
        // Literal-first comparison so a file without a MIME type is simply
        // unsupported instead of raising a NullPointerException.
        return "text/plain".equals(file.getMIMEType());
    }
    /**
     * Exception type for failures while extracting text from a text file.
     */
    public class TextFileExtractorException extends Exception {
        public TextFileExtractorException(String msg, Throwable ex) {
            super(msg, ex);
        }
        public TextFileExtractorException(String msg) {
            super(msg);
        }
    }
    /**
     * Detects the character encoding of the given content.
     *
     * @param content The content to analyze.
     *
     * @return The detected charset, or {@link #UNKNOWN_CHARSET} if neither
     *         detector produced a usable, sufficiently confident result or the
     *         content could not be read.
     */
    public static Charset getEncoding(Content content) {
        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) {
            // Tika first
            CharsetDetector detector = new CharsetDetector();
            detector.setText(stream);
            CharsetMatch tikaResult = detector.detect();
            if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) {
                try {
                    return Charset.forName(tikaResult.getName());
                } catch (UnsupportedCharsetException | java.nio.charset.IllegalCharsetNameException ignored) {
                    // The JVM cannot use the charset Tika named; fall through
                    // and let Decodetect try.
                }
            }
            // Decodetect if Tika fails or falls below confidence threshold.
            // NOTE(review): this relies on CharsetDetector.setText() leaving the
            // stream positioned at the start of the content - confirm.
            byte[] targetArray = readUpTo(stream, MAX_DECODETECT_BYTES);
            List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
            if (!results.isEmpty()) {
                DecodetectResult topResult = results.get(0);
                if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
                    return topResult.getEncoding();
                }
            }
        } catch (IOException ignored) {
            // Best effort: content that cannot be read is reported as having
            // an unknown encoding rather than failing the caller.
        }
        return UNKNOWN_CHARSET;
    }
    /**
     * Reads from the stream until maxBytes bytes have been read or the end of
     * the stream is reached. A single read() call may legally return fewer
     * bytes than requested, and available() is only an estimate, so neither
     * is relied upon here.
     *
     * @param stream   The stream to read from.
     * @param maxBytes The maximum number of bytes to read.
     *
     * @return An array holding exactly the bytes that were read.
     *
     * @throws IOException If reading from the stream fails.
     */
    private static byte[] readUpTo(InputStream stream, int maxBytes) throws IOException {
        byte[] buffer = new byte[maxBytes];
        int totalRead = 0;
        while (totalRead < maxBytes) {
            int bytesRead = stream.read(buffer, totalRead, maxBytes - totalRead);
            if (bytesRead < 0) {
                break;
            }
            totalRead += bytesRead;
        }
        return totalRead == maxBytes ? buffer : java.util.Arrays.copyOf(buffer, totalRead);
    }
 }
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@ -28,6 +28,7 @@ import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.logging.Level;
 import java.util.stream.Collectors;
 import org.apache.tika.mime.MimeTypes;
 import org.openide.util.Lookup;
 import org.openide.util.NbBundle;
 import org.openide.util.NbBundle.Messages;
@ -44,12 +45,12 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
 import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
 import org.sleuthkit.autopsy.ingest.IngestServices;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
 import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
 import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
 import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
 import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
 import org.sleuthkit.datamodel.AbstractFile;
@ -632,7 +633,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                if (context.fileIngestIsCancelled()) {
                    return;
                }
-                if (fileType.equals("application/octet-stream")) {
+                if (fileType.equals(MimeTypes.OCTET_STREAM)) {
                    extractStringsAndIndex(aFile);
                    return;
                }
@ -657,20 +658,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
            if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
                //Carved Files should be the only type of unallocated files capable of a txt extension and 
                //should be ignored by the TextFileExtractor because they may contain more than one text encoding
-                try {
+                wasTextAdded = indexTextFile(aFile);
                    TextFileExtractor textFileExtractor = new TextFileExtractor();
                    Reader textReader = textFileExtractor.getReader(aFile);
                    if (textReader == null) {
                        logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
                    } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
                        putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                        wasTextAdded = true;
                    }
                } catch (IngesterException ex) {
                    logger.log(Level.WARNING, "Unable to index as unicode", ex);
                } catch (TextFileExtractorException ex) {
                    logger.log(Level.INFO, "Could not extract text with TextFileExtractor", ex);
                }
            }
            // if it wasn't supported or had an error, default to strings
@ -678,5 +666,29 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                extractStringsAndIndex(aFile);
            }
        }
        /**
         * Extracts the text of the given file with a TextFileExtractor and adds
         * it to the index. On success the file's ingest status is set to
         * TEXT_INGESTED.
         *
         * @param aFile Text file to analyze
         *
         * @return True if the text was indexed successfully, false otherwise.
         */
        private boolean indexTextFile(AbstractFile aFile) {
            boolean indexed = false;
            TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
            // try-with-resources closes the reader on every path, including when
            // indexing reports failure or throws (a null resource is skipped).
            try (Reader textReader = textFileExtractor.getReader()) {
                if (textReader == null) {
                    logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
                } else {
                    indexed = Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context);
                }
            } catch (IngesterException | IOException ex) {
                // A failure while closing the reader is also treated as a failed
                // indexing attempt, so the ingest status is not updated.
                indexed = false;
                logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
            }
            if (indexed) {
                putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
            }
            return indexed;
        }
    }
 }
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
@ -1,68 +0,0 @@
 /*
 * Autopsy Forensic Browser
 *
 * Copyright 2018-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.sleuthkit.autopsy.keywordsearch;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.BufferedInputStream;
 import java.io.Reader;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 /**
 * Produces a Reader for .txt files by detecting their character set with Tika.
 */
 final class TextFileExtractor {
    //Lowest detector confidence at which a match is accepted as a real text encoding.
    //In limited testing, genuine text encodings generally scored 100, xml sometimes scored
    //around 50, and pictures or other non-text files carrying a .txt extension scored 5 or less.
    //The value below was picked from that limited data to reject clearly non-text files
    //while hopefully still accepting every file with a valid text encoding.
    static final private int MIN_MATCH_CONFIDENCE = 20;
    /**
     * Detects the character set of the given file and returns a reader for it.
     *
     * @param source The file to read.
     *
     * @return The reader supplied by the best character set match.
     *
     * @throws TextFileExtractorException If the content cannot be read, no
     *                                    match is found, or the best match is
     *                                    below the minimum confidence.
     */
    public Reader getReader(AbstractFile source) throws TextFileExtractorException {
        //the CharsetDetector needs mark/reset support, which the BufferedInputStream wrapper provides
        InputStream bufferedContent = new BufferedInputStream(new ReadContentInputStream(source));
        CharsetDetector charsetDetector = new CharsetDetector();
        try {
            charsetDetector.setText(bufferedContent);
        } catch (IOException ex) {
            throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
        }
        CharsetMatch bestMatch = charsetDetector.detect();
        if (bestMatch == null) {
            throw new TextFileExtractorException("Unable to detect any matches using TextFileExtractor");
        }
        if (bestMatch.getConfidence() < MIN_MATCH_CONFIDENCE) {
            throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
        }
        return bestMatch.getReader();
    }
    /**
     * Exception thrown when a reader cannot be produced for a text file.
     */
    public class TextFileExtractorException extends Exception {
        public TextFileExtractorException(String msg, Throwable ex) {
            super(msg, ex);
        }
        public TextFileExtractorException(String msg) {
            super(msg);
        }
    }
 }