Merge pull request #5125 from ethteck/better-encoding-detection

Added decodetect library for better encoding detection on text files
This commit is contained in:
Richard Cordovano 2019-12-06 18:42:56 -05:00 committed by GitHub
commit 754fd8c9d0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 191 additions and 85 deletions

View File

@ -43,6 +43,9 @@
<dependency conf="core->default" org="com.google.cloud" name="google-cloud-translate" rev="1.70.0"/>
<dependency conf="core->default" org="org.apache.opennlp" name="opennlp-tools" rev="1.9.1"/>
<dependency conf="core->default" org="com.ethteck.decodetect" name="decodetect-core" rev="0.3"/>
<dependency conf="core->default" org="com.beetstra.jutf7" name="jutf7" rev="1.0.0"/>
<dependency org="org.sejda.webp-imageio" name="webp-imageio-sejda" rev="0.1.0"/>
<dependency org="com.googlecode.libphonenumber" name="libphonenumber" rev="3.5" />
<dependency conf="core->default" org="commons-validator" name="commons-validator" rev="1.6"/>

View File

@ -18,6 +18,7 @@ file.reference.commons-lang3-3.8.1.jar=release\\modules\\ext\\commons-lang3-3.8.
file.reference.commons-pool2-2.4.2.jar=release/modules/ext/commons-pool2-2.4.2.jar
file.reference.cxf-rt-rs-client-3.3.0.jar=release\\modules\\ext\\cxf-rt-rs-client-3.3.0.jar
file.reference.dec-0.1.2.jar=release\\modules\\ext\\dec-0.1.2.jar
file.reference.decodetect-core-0.3.jar=release\\modules\\ext\\decodetect-core-0.3.jar
file.reference.fontbox-2.0.13.jar=release\\modules\\ext\\fontbox-2.0.13.jar
file.reference.geoapi-3.0.1.jar=release\\modules\\ext\\geoapi-3.0.1.jar
file.reference.grib-4.5.5.jar=release\\modules\\ext\\grib-4.5.5.jar
@ -50,6 +51,7 @@ file.reference.jsoup-1.11.3.jar=release\\modules\\ext\\jsoup-1.11.3.jar
file.reference.jul-to-slf4j-1.7.25.jar=release\\modules\\ext\\jul-to-slf4j-1.7.25.jar
file.reference.juniversalchardet-1.0.3.jar=release\\modules\\ext\\juniversalchardet-1.0.3.jar
file.reference.junrar-2.0.0.jar=release\\modules\\ext\\junrar-2.0.0.jar
file.reference.jutf7-1.0.0.jar=release\\modules\\ext\\jutf7-1.0.0.jar
file.reference.jxmapviewer2-2.4.jar=release/modules/ext/jxmapviewer2-2.4.jar
file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
file.reference.libphonenumber-3.5.jar=release/modules/ext/libphonenumber-3.5.jar

View File

@ -794,6 +794,14 @@
<runtime-relative-path>ext/vorbis-java-tika-0.8.jar</runtime-relative-path>
<binary-origin>release\modules\ext\vorbis-java-tika-0.8.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/decodetect-core-0.3.jar</runtime-relative-path>
<binary-origin>release/modules/ext/decodetect-core-0.3.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jutf7-1.0.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jutf7-1.0.0.jar</binary-origin>
</class-path-extension>
</data>
</configuration>
</project>

View File

@ -18,6 +18,7 @@
*/
package org.sleuthkit.autopsy.modules.filetypeid;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
@ -29,6 +30,7 @@ import org.apache.tika.Tika;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MimeTypes;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.TskCoreException;
@ -249,6 +251,17 @@ public class FileTypeDetector {
mimeType = tikaType.replace("tika-", ""); //NON-NLS
mimeType = removeOptionalParameter(mimeType);
}
} else {
/*
* If the file was marked as an octet stream and the extension is .txt, try to detect a text
* encoding with Decodetect.
*/
if (file.getNameExtension().equals("txt")) {
Charset detectedCharset = TextFileExtractor.getEncoding(file);
if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) {
mimeType = MimeTypes.PLAIN_TEXT;
}
}
}
/**

View File

@ -89,6 +89,7 @@ public class TextExtractorFactory {
*/
private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
List<TextExtractor> fileExtractors = Arrays.asList(
new TextFileExtractor(content),
new HtmlTextExtractor(content),
new SqliteTextExtractor(content),
new TikaTextExtractor(content));

View File

@ -0,0 +1,135 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2018-2019 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.textextractors;
import com.ethteck.decodetect.core.Decodetect;
import com.ethteck.decodetect.core.DecodetectResult;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.List;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ReadContentInputStream;
/**
 * Extracts text from text files, detecting the character encoding with Tika
 * first and falling back to Decodetect when Tika's confidence is too low.
 */
public final class TextFileExtractor implements TextExtractor {

    // Sentinel returned by getEncoding() when no encoding could be detected
    // with sufficient confidence. Declared final so the sentinel identity that
    // callers compare against cannot be reassigned.
    public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
        @Override
        public boolean contains(Charset cs) {
            return false;
        }

        @Override
        public CharsetDecoder newDecoder() {
            return null;
        }

        @Override
        public CharsetEncoder newEncoder() {
            return null;
        }
    };

    // Threshold for determining which encoding detection library to use. If
    // Tika's own confidence is at least MIN_TIKA_MATCH_CONFIDENCE, Tika's
    // result will be used for decoding. Otherwise, Decodetect will be used.
    private static final int MIN_TIKA_MATCH_CONFIDENCE = 35;

    // This value determines whether we will consider Decodetect's top-scoring
    // result a legitimate match or if we will disregard its findings.
    //
    // Possible values are 0 to 1, inclusive.
    private static final double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;

    // Maximum number of bytes sampled from the content for Decodetect.
    private static final int MAX_DECODETECT_SAMPLE_BYTES = 100000;

    private final AbstractFile file;

    /**
     * Constructs an extractor for the given text file.
     *
     * @param file the file to extract text from
     */
    public TextFileExtractor(AbstractFile file) {
        this.file = file;
    }

    /**
     * Returns a reader over the file's text, using the detected encoding or
     * falling back to UTF-8 when detection fails.
     *
     * @return a reader decoding the file's contents
     */
    @Override
    public Reader getReader() {
        Charset encoding = getEncoding(file);
        if (encoding.equals(UNKNOWN_CHARSET)) {
            encoding = StandardCharsets.UTF_8;
        }
        return getReader(encoding);
    }

    /**
     * Returns a reader over the file's text using the given encoding.
     *
     * @param encoding the charset to decode the file with
     *
     * @return a reader decoding the file's contents
     */
    public Reader getReader(Charset encoding) {
        return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
    }

    @Override
    public boolean isSupported() {
        return file.getMIMEType().equals("text/plain");
    }

    /**
     * Exception thrown when text file extraction fails.
     */
    public class TextFileExtractorException extends Exception {

        public TextFileExtractorException(String msg, Throwable ex) {
            super(msg, ex);
        }

        public TextFileExtractorException(String msg) {
            super(msg);
        }
    }

    /**
     * Attempts to detect the character encoding of the given content, trying
     * Tika first and falling back to Decodetect.
     *
     * @param content the content to examine
     *
     * @return the detected charset, or UNKNOWN_CHARSET if no encoding could be
     *         detected with sufficient confidence or the content could not be
     *         read
     */
    public static Charset getEncoding(Content content) {
        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) {
            // Tika first
            CharsetDetector detector = new CharsetDetector();
            detector.setText(stream);
            CharsetMatch tikaResult = detector.detect();
            if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) {
                try {
                    return Charset.forName(tikaResult.getName());
                } catch (UnsupportedCharsetException ignored) {
                    // The charset Tika named is not supported by this JVM;
                    // fall through to Decodetect.
                }
            }

            // Decodetect if Tika fails or falls below the confidence
            // threshold. Read up to MAX_DECODETECT_SAMPLE_BYTES in a loop,
            // because a single read() call may return fewer bytes than
            // requested (and available() does not report the remaining size).
            byte[] buffer = new byte[MAX_DECODETECT_SAMPLE_BYTES];
            int bytesRead = 0;
            while (bytesRead < buffer.length) {
                int count = stream.read(buffer, bytesRead, buffer.length - bytesRead);
                if (count < 0) {
                    break; // end of stream
                }
                bytesRead += count;
            }
            byte[] sample;
            if (bytesRead == buffer.length) {
                sample = buffer;
            } else {
                sample = new byte[bytesRead];
                System.arraycopy(buffer, 0, sample, 0, bytesRead);
            }
            List<DecodetectResult> results = Decodetect.DECODETECT.getResults(sample);
            if (!results.isEmpty()) {
                DecodetectResult topResult = results.get(0);
                if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
                    return topResult.getEncoding();
                }
            }
        } catch (IOException ignored) {
            // The content could not be read; report the encoding as unknown.
        }
        return UNKNOWN_CHARSET;
    }
}

View File

@ -28,6 +28,7 @@ import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.stream.Collectors;
import org.apache.tika.mime.MimeTypes;
import org.openide.util.Lookup;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
@ -44,12 +45,12 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
import org.sleuthkit.datamodel.AbstractFile;
@ -632,7 +633,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if (context.fileIngestIsCancelled()) {
return;
}
if (fileType.equals("application/octet-stream")) {
if (fileType.equals(MimeTypes.OCTET_STREAM)) {
extractStringsAndIndex(aFile);
return;
}
@ -657,20 +658,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
//Carved Files should be the only type of unallocated files capable of a txt extension and
//should be ignored by the TextFileExtractor because they may contain more than one text encoding
try {
TextFileExtractor textFileExtractor = new TextFileExtractor();
Reader textReader = textFileExtractor.getReader(aFile);
if (textReader == null) {
logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
} else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
wasTextAdded = true;
}
} catch (IngesterException ex) {
logger.log(Level.WARNING, "Unable to index as unicode", ex);
} catch (TextFileExtractorException ex) {
logger.log(Level.INFO, "Could not extract text with TextFileExtractor", ex);
}
wasTextAdded = indexTextFile(aFile);
}
// if it wasn't supported or had an error, default to strings
@ -678,5 +666,29 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
extractStringsAndIndex(aFile);
}
}
/**
 * Extracts text from the given text file and adds it to the keyword search
 * index.
 *
 * @param aFile Text file to analyze
 *
 * @return True if the text was successfully extracted and indexed, false
 *         otherwise.
 */
private boolean indexTextFile(AbstractFile aFile) {
    TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
    // try-with-resources guarantees the reader is closed on every path,
    // including when indexing fails or throws; a null resource is skipped
    // at close time per the language spec.
    try (Reader textReader = textFileExtractor.getReader()) {
        if (textReader == null) {
            logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
        } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
            putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
            return true;
        }
    } catch (IngesterException | IOException ex) {
        logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
    }
    return false;
}
}
}

View File

@ -1,68 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2018-2019 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedInputStream;
import java.io.Reader;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
/**
 * Extracts text from .txt files by detecting their character encoding.
 */
final class TextFileExtractor {

    // Minimum confidence required to accept a charset match, rejecting
    // matches that may not have a valid text encoding. In limited testing,
    // valid text encodings generally scored around 100, XML sometimes scored
    // around 50, and pictures or other files with a .txt extension scored 5
    // or less. This limited information was used to select a value that
    // filters out clearly non-text files while hopefully working on all
    // files with a valid text encoding.
    static final private int MIN_MATCH_CONFIDENCE = 20;

    /**
     * Returns a reader over the file's text using the best-matching charset.
     *
     * @param source the file to read
     *
     * @return a reader decoding the file with the detected charset
     *
     * @throws TextFileExtractorException if detection fails or no match meets
     *                                    the confidence threshold
     */
    public Reader getReader(AbstractFile source) throws TextFileExtractorException {
        // BufferedInputStream supplies the mark/reset support that
        // CharsetDetector requires.
        InputStream bufferedStream = new BufferedInputStream(new ReadContentInputStream(source));
        CharsetDetector charsetDetector = new CharsetDetector();
        try {
            charsetDetector.setText(bufferedStream);
        } catch (IOException ex) {
            throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
        }
        CharsetMatch bestMatch = charsetDetector.detect();
        if (bestMatch == null) {
            throw new TextFileExtractorException("Unable to detect any matches using TextFileExtractor");
        }
        if (bestMatch.getConfidence() < MIN_MATCH_CONFIDENCE) {
            throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
        }
        return bestMatch.getReader();
    }

    /**
     * Exception thrown when text file extraction fails.
     */
    public class TextFileExtractorException extends Exception {

        public TextFileExtractorException(String msg, Throwable ex) {
            super(msg, ex);
        }

        public TextFileExtractorException(String msg) {
            super(msg);
        }
    }
}