Mirror of https://github.com/overcuriousity/autopsy-flatpak.git, synced 2025-07-17 18:17:43 +00:00.
Merge pull request #5125 from ethteck/better-encoding-detection
Added decodetect library for better encoding detection on text files
This change is contained in commit 754fd8c9d0.
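The heart of the change is the Decodetect fallback inside the new TextFileExtractor shown below. As a quick orientation, here is a minimal, hedged sketch of the library's API as this PR exercises it. Only Decodetect.DECODETECT.getResults(byte[]), DecodetectResult.getEncoding(), and DecodetectResult.getConfidence() are taken from the diff; the class name DecodetectSketch and the sample input are illustrative.

import com.ethteck.decodetect.core.Decodetect;
import com.ethteck.decodetect.core.DecodetectResult;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;

public class DecodetectSketch {
    public static void main(String[] args) {
        // Stand-in input; in Autopsy the bytes are read from the file under analysis.
        byte[] data = "пример текста в кодировке UTF-8".getBytes(StandardCharsets.UTF_8);

        // getResults returns candidates ordered best-first; the new
        // TextFileExtractor below relies on results.get(0) being the top match.
        List<DecodetectResult> results = Decodetect.DECODETECT.getResults(data);
        if (!results.isEmpty()) {
            DecodetectResult top = results.get(0);
            Charset encoding = top.getEncoding();
            double confidence = top.getConfidence(); // 0 to 1, per the threshold comment in the new class
            System.out.println(encoding + " (confidence " + confidence + ")");
        }
    }
}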
ivy.xml:

@@ -42,6 +42,9 @@
     <dependency conf="core->default" org="com.google.cloud" name="google-cloud-translate" rev="1.70.0"/>
     <dependency conf="core->default" org="org.apache.opennlp" name="opennlp-tools" rev="1.9.1"/>
+    <dependency conf="core->default" org="com.ethteck.decodetect" name="decodetect-core" rev="0.3"/>
+    <dependency conf="core->default" org="com.beetstra.jutf7" name="jutf7" rev="1.0.0"/>
     <dependency org="org.sejda.webp-imageio" name="webp-imageio-sejda" rev="0.1.0"/>
     <dependency org="com.googlecode.libphonenumber" name="libphonenumber" rev="3.5" />
@@ -54,6 +57,6 @@
     <!-- https://mvnrepository.com/artifact/javax.ws.rs/javax.ws.rs-api -->
     <dependency org="javax.ws.rs" name="javax.ws.rs-api" rev="2.0"/>
-    <override org="jakarta.ws.rs" module="jakarta.ws.rs-api" rev="2.1.5"/>
+    <override org="jakarta.ws.rs" module="jakarta.ws.rs-api" rev="2.1.5"/>
 </dependencies>
 </ivy-module>
project.properties:

@@ -18,6 +18,7 @@ file.reference.commons-lang3-3.8.1.jar=release\\modules\\ext\\commons-lang3-3.8.
 file.reference.commons-pool2-2.4.2.jar=release/modules/ext/commons-pool2-2.4.2.jar
 file.reference.cxf-rt-rs-client-3.3.0.jar=release\\modules\\ext\\cxf-rt-rs-client-3.3.0.jar
 file.reference.dec-0.1.2.jar=release\\modules\\ext\\dec-0.1.2.jar
+file.reference.decodetect-core-0.3.jar=release\\modules\\ext\\decodetect-core-0.3.jar
 file.reference.fontbox-2.0.13.jar=release\\modules\\ext\\fontbox-2.0.13.jar
 file.reference.geoapi-3.0.1.jar=release\\modules\\ext\\geoapi-3.0.1.jar
 file.reference.grib-4.5.5.jar=release\\modules\\ext\\grib-4.5.5.jar
@@ -50,6 +51,7 @@ file.reference.jsoup-1.11.3.jar=release\\modules\\ext\\jsoup-1.11.3.jar
 file.reference.jul-to-slf4j-1.7.25.jar=release\\modules\\ext\\jul-to-slf4j-1.7.25.jar
 file.reference.juniversalchardet-1.0.3.jar=release\\modules\\ext\\juniversalchardet-1.0.3.jar
 file.reference.junrar-2.0.0.jar=release\\modules\\ext\\junrar-2.0.0.jar
+file.reference.jutf7-1.0.0.jar=release\\modules\\ext\\jutf7-1.0.0.jar
 file.reference.jxmapviewer2-2.4.jar=release/modules/ext/jxmapviewer2-2.4.jar
 file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
 file.reference.libphonenumber-3.5.jar=release/modules/ext/libphonenumber-3.5.jar
project.xml:

@@ -794,6 +794,14 @@
                 <runtime-relative-path>ext/vorbis-java-tika-0.8.jar</runtime-relative-path>
                 <binary-origin>release\modules\ext\vorbis-java-tika-0.8.jar</binary-origin>
             </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/decodetect-core-0.3.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/decodetect-core-0.3.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jutf7-1.0.0.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jutf7-1.0.0.jar</binary-origin>
+            </class-path-extension>
         </data>
     </configuration>
 </project>
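For readers who resolve dependencies outside Ivy, the same two artifacts expressed as Maven coordinates (a sketch derived from the Ivy lines above; it assumes the artifacts are published under identical coordinates):

<!-- Equivalent Maven declarations for the two new dependencies -->
<dependency>
    <groupId>com.ethteck.decodetect</groupId>
    <artifactId>decodetect-core</artifactId>
    <version>0.3</version>
</dependency>
<dependency>
    <groupId>com.beetstra.jutf7</groupId>
    <artifactId>jutf7</artifactId>
    <version>1.0.0</version>
</dependency>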
FileTypeDetector.java:

@@ -18,6 +18,7 @@
  */
 package org.sleuthkit.autopsy.modules.filetypeid;
 
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -29,6 +30,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.mime.MimeTypes;
 import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 import org.sleuthkit.datamodel.TskCoreException;
@@ -249,6 +251,17 @@ public class FileTypeDetector {
                 mimeType = tikaType.replace("tika-", ""); //NON-NLS
                 mimeType = removeOptionalParameter(mimeType);
             }
-        }
+        } else {
+            /*
+             * If the file was marked as an octet stream and the extension is
+             * .txt, try to detect a text encoding with Decodetect.
+             */
+            if (file.getNameExtension().equals("txt")) {
+                Charset detectedCharset = TextFileExtractor.getEncoding(file);
+                if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) {
+                    mimeType = MimeTypes.PLAIN_TEXT;
+                }
+            }
+        }
 
     /**
TextExtractorFactory.java:

@@ -89,6 +89,7 @@ public class TextExtractorFactory {
      */
     private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
         List<TextExtractor> fileExtractors = Arrays.asList(
+                new TextFileExtractor(content),
                 new HtmlTextExtractor(content),
                 new SqliteTextExtractor(content),
                 new TikaTextExtractor(content));
TextFileExtractor.java (new file in org.sleuthkit.autopsy.textextractors):

@@ -0,0 +1,135 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2018-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.textextractors;

import com.ethteck.decodetect.core.Decodetect;
import com.ethteck.decodetect.core.DecodetectResult;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.List;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ReadContentInputStream;

/**
 * Extract text from text files
 */
public final class TextFileExtractor implements TextExtractor {

    // Sentinel charset returned when neither detector produces a confident match.
    public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
        @Override
        public boolean contains(Charset cs) {
            return false;
        }

        @Override
        public CharsetDecoder newDecoder() {
            return null;
        }

        @Override
        public CharsetEncoder newEncoder() {
            return null;
        }
    };

    // This value will be used as a threshold for determining which encoding
    // detection library to use. If Tika's own confidence is at least
    // MIN_TIKA_MATCH_CONFIDENCE, Tika's result will be used for decoding.
    // Otherwise, Decodetect will be used.
    static final private int MIN_TIKA_MATCH_CONFIDENCE = 35;

    // This value determines whether we will consider Decodetect's top-scoring
    // result a legitimate match or if we will disregard its findings.
    //
    // Possible values are 0 to 1, inclusive.
    static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;

    private final AbstractFile file;

    public TextFileExtractor(AbstractFile file) {
        this.file = file;
    }

    @Override
    public Reader getReader() {
        Charset encoding = getEncoding(file);
        if (encoding.equals(UNKNOWN_CHARSET)) {
            encoding = StandardCharsets.UTF_8;
        }
        return getReader(encoding);
    }

    public Reader getReader(Charset encoding) {
        return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
    }

    @Override
    public boolean isSupported() {
        return file.getMIMEType().equals("text/plain");
    }

    public class TextFileExtractorException extends Exception {
        public TextFileExtractorException(String msg, Throwable ex) {
            super(msg, ex);
        }

        public TextFileExtractorException(String msg) {
            super(msg);
        }
    }

    public static Charset getEncoding(Content content) {
        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) {
            // Tika first
            CharsetDetector detector = new CharsetDetector();
            detector.setText(stream);
            CharsetMatch tikaResult = detector.detect();
            if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) {
                try {
                    return Charset.forName(tikaResult.getName());
                } catch (UnsupportedCharsetException ignored) {
                }
            }

            // Decodetect if Tika fails or falls below the confidence threshold
            int maxBytes = 100000;
            int numBytes = Math.min(stream.available(), maxBytes);
            byte[] targetArray = new byte[numBytes];
            stream.read(targetArray);
            List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
            if (!results.isEmpty()) {
                DecodetectResult topResult = results.get(0);
                if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
                    return topResult.getEncoding();
                }
            }
        } catch (IOException ignored) {
        }
        return UNKNOWN_CHARSET;
    }
}
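For orientation, a hedged usage sketch of the new class, assuming an AbstractFile obtained from an open Autopsy case (obtaining one is out of scope here); it uses only the public members shown in the listing above, while the class name TextDumpSketch and the dumpText helper are illustrative:

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.Charset;
import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
import org.sleuthkit.datamodel.AbstractFile;

public class TextDumpSketch {
    // aFile is assumed to be a file from an open case.
    static void dumpText(AbstractFile aFile) throws IOException {
        TextFileExtractor extractor = new TextFileExtractor(aFile);
        if (!extractor.isSupported()) { // true only for MIME type text/plain
            return;
        }
        // Two-stage detection: Tika's CharsetDetector first, Decodetect as the
        // fallback; getReader() substitutes UTF-8 when both are inconclusive.
        Charset encoding = TextFileExtractor.getEncoding(aFile);
        System.out.println("Detected encoding: " + encoding.name());
        try (BufferedReader reader = new BufferedReader(extractor.getReader())) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}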
KeywordSearchIngestModule.java:

@@ -28,6 +28,7 @@ import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.logging.Level;
 import java.util.stream.Collectors;
+import org.apache.tika.mime.MimeTypes;
 import org.openide.util.Lookup;
 import org.openide.util.NbBundle;
 import org.openide.util.NbBundle.Messages;
@@ -44,12 +45,12 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
 import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
 import org.sleuthkit.autopsy.ingest.IngestServices;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
-import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
 import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
+import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
 import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
 import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
 import org.sleuthkit.datamodel.AbstractFile;
@@ -632,7 +633,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         if (context.fileIngestIsCancelled()) {
             return;
         }
-        if (fileType.equals("application/octet-stream")) {
+        if (fileType.equals(MimeTypes.OCTET_STREAM)) {
             extractStringsAndIndex(aFile);
             return;
         }
@@ -657,20 +658,7 @@
         if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
             //Carved Files should be the only type of unallocated files capable of a txt extension and
             //should be ignored by the TextFileExtractor because they may contain more than one text encoding
-            try {
-                TextFileExtractor textFileExtractor = new TextFileExtractor();
-                Reader textReader = textFileExtractor.getReader(aFile);
-                if (textReader == null) {
-                    logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
-                } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
-                    putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
-                    wasTextAdded = true;
-                }
-            } catch (IngesterException ex) {
-                logger.log(Level.WARNING, "Unable to index as unicode", ex);
-            } catch (TextFileExtractorException ex) {
-                logger.log(Level.INFO, "Could not extract text with TextFileExtractor", ex);
-            }
+            wasTextAdded = indexTextFile(aFile);
         }
 
         // if it wasn't supported or had an error, default to strings
@@ -678,5 +666,29 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             extractStringsAndIndex(aFile);
         }
     }
 
+    /**
+     * Adds the text file to the index. Returns true if indexing was
+     * successful and false otherwise.
+     *
+     * @param aFile Text file to analyze
+     */
+    private boolean indexTextFile(AbstractFile aFile) {
+        try {
+            TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
+            Reader textReader = textFileExtractor.getReader();
+            if (textReader == null) {
+                logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
+            } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
+                textReader.close();
+                putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
+                return true;
+            }
+        } catch (IngesterException | IOException ex) {
+            logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
+        }
+        return false;
+    }
 }
TextFileExtractor.java (old version in org.sleuthkit.autopsy.keywordsearch, deleted by this commit):

@@ -1,68 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2018-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedInputStream;
import java.io.Reader;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;

/**
 * Extract text from .txt files
 */
final class TextFileExtractor {

    //Set a minimum confidence value to reject matches that may not have a valid text encoding.
    //Values of valid text encodings were generally 100, XML code sometimes had a value around 50,
    //and pictures and other files with a .txt extension were showing up with a value of 5 or less in limited testing.
    //This limited information was used to select the current value as one that would filter out clearly non-text
    //files while hopefully working on all files with a valid text encoding.
    static final private int MIN_MATCH_CONFIDENCE = 20;

    public Reader getReader(AbstractFile source) throws TextFileExtractorException {
        CharsetDetector detector = new CharsetDetector();
        //wrap the stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
        InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
        try {
            detector.setText(stream);
        } catch (IOException ex) {
            throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
        }
        CharsetMatch match = detector.detect();
        if (match == null) {
            throw new TextFileExtractorException("Unable to detect any matches using TextFileExtractor");
        } else if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
            throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
        }

        return match.getReader();
    }

    public class TextFileExtractorException extends Exception {
        public TextFileExtractorException(String msg, Throwable ex) {
            super(msg, ex);
        }

        public TextFileExtractorException(String msg) {
            super(msg);
        }
    }
}