Merge pull request #5536 from sleuthkit/release-4.14.0

Merge release-4.14.0 branch onto develop branch
Richard Cordovano 2019-12-23 12:19:07 -05:00 committed by GitHub
commit 335b0791cd
3 changed files with 75 additions and 29 deletions


@@ -254,10 +254,10 @@ public class FileTypeDetector {
         } else {
             /*
              * If the file was marked as an octet stream and the extension is .txt, try to detect a text
-             * encoding with Decodetect.
+             * encoding
              */
             if (file.getNameExtension().equals("txt")) {
-                Charset detectedCharset = TextFileExtractor.getEncoding(file);
+                Charset detectedCharset = new TextFileExtractor(file).getEncoding();
                 if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) {
                     mimeType = MimeTypes.PLAIN_TEXT;
                 }
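The API change in this hunk: encoding detection moves from a static helper to an instance method on TextFileExtractor. A minimal sketch of the new calling convention (the helper method and its name are illustrative, not part of this diff):

    import java.nio.charset.Charset;
    import org.sleuthkit.datamodel.AbstractFile;

    // Hypothetical helper mirroring the updated call site above.
    static boolean isDetectableText(AbstractFile file) {
        Charset detected = new TextFileExtractor(file).getEncoding();
        // UNKNOWN_CHARSET is the sentinel for "detection failed".
        return detected != TextFileExtractor.UNKNOWN_CHARSET;
    }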


@@ -85,14 +85,14 @@ public class TextExtractorFactory {
      * @param content AbstractFile content
      * @param context Lookup containing extractor configurations
      *
-     * @return
+     * @return List of all extractors in priority order. Not all will support the passed in content. @@@ PERHAPS ONLY SUPPORTED SHOULD BE RETURNED
      */
     private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
         List<TextExtractor> fileExtractors = Arrays.asList(
                 new TextFileExtractor(content),
                 new HtmlTextExtractor(content),
                 new SqliteTextExtractor(content),
-                new TikaTextExtractor(content));
+                new TikaTextExtractor(content)); /// This should go last to ensure the more specific ones are picked first.
         fileExtractors.forEach((fileExtractor) -> {
             fileExtractor.setExtractionSettings(context);
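The factory builds the extractor list in priority order, with the general-purpose TikaTextExtractor last so the more specific extractors win. Since the new @return note says not every extractor in the list supports the content, the expected consumption pattern is a first-match scan. A sketch under that assumption (the no-argument isSupported() check is taken from the TextExtractor interface; the loop itself is illustrative, not this factory's actual caller):

    import java.util.List;
    import org.openide.util.Lookup;
    import org.sleuthkit.datamodel.AbstractFile;

    // Illustrative first-match scan over the priority-ordered list.
    static TextExtractor pickExtractor(AbstractFile content, Lookup context) {
        for (TextExtractor extractor : getFileExtractors(content, context)) {
            if (extractor.isSupported()) {
                return extractor; // most specific supported extractor wins
            }
        }
        return null; // caller must handle unsupported content
    }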


@@ -31,17 +31,24 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.StandardCharsets;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.List;
+import java.util.logging.Level;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
+import org.sleuthkit.datamodel.TskCoreException;
 
 /**
- * Extract text from text files
+ * A TextExtractor that is used to extract text from a text file.
  */
 public final class TextFileExtractor implements TextExtractor {
-    public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
+
+    /*
+     * The char set returned if a text file extractor fails to detect the
+     * encoding of the file from which it is extracting text.
+     */
+    public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
         @Override
         public boolean contains(Charset cs) {
             return false;
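UNKNOWN_CHARSET is a sentinel value: an anonymous Charset subclass that exists only to be compared against detection results, never to encode or decode anything. The hunk cuts off after contains(); a plausible completion of the pattern, with the remaining abstract Charset members stubbed (an assumption, since the diff truncates here, though the CharsetEncoder import in the hunk header points the same way):

    import java.nio.charset.Charset;
    import java.nio.charset.CharsetDecoder;
    import java.nio.charset.CharsetEncoder;

    // Sentinel charset: callers test `result != UNKNOWN_CHARSET` rather
    // than ever using it for I/O, so the codec methods can return null.
    public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
        @Override
        public boolean contains(Charset cs) {
            return false;
        }

        @Override
        public CharsetDecoder newDecoder() {
            return null; // never used; sentinel only
        }

        @Override
        public CharsetEncoder newEncoder() {
            return null; // never used; sentinel only
        }
    };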
@@ -59,33 +66,45 @@ public final class TextFileExtractor {
     };
 
     // This value will be used as a threshold for determining which encoding
-    // detection library to use. If Tika's own confidence is at least
-    // MIN_MATCH_CONFIDENCE, Tika's result will be used for decoding.
+    // detection library to use. If CharsetDetector's own confidence is at least
+    // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding.
     // Otherwise, Decodetect will be used.
-    static final private int MIN_TIKA_MATCH_CONFIDENCE = 35;
+    //
+    // Note: We initially used a confidence of 35, but it was causing some
+    // Chrome Cache files to get flagged as UTF-16 with confidence 40.
+    // These files had a small amount of binary data and then ASCII.
+    static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
 
     // This value determines whether we will consider Decodetect's top-scoring
-    // result a legitimate match or if we will disregard its findings
+    // result a legitimate match or if we will disregard its findings.
     //
-    // Possible values are 0 to 1, inclusive
+    // Possible values are 0 to 1, inclusive.
     static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;
 
+    private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
+
     private final AbstractFile file;
+    private Charset encoding = null;
 
+    /**
+     * Constructs a TextExtractor that is used to extract text from a text
+     * file.
+     *
+     * @param file The file.
+     */
     public TextFileExtractor(AbstractFile file) {
         this.file = file;
     }
 
     @Override
     public Reader getReader() {
-        Charset encoding = getEncoding(file);
-        if (encoding.equals(UNKNOWN_CHARSET)) {
-            encoding = StandardCharsets.UTF_8;
+        Charset enc = getEncoding();
+        if (enc.equals(UNKNOWN_CHARSET)) {
+            enc = StandardCharsets.UTF_8;
         }
-        return getReader(encoding);
+        return getReader(enc);
     }
 
-    public Reader getReader(Charset encoding) {
+    private Reader getReader(Charset encoding) {
         return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
     }
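With this hunk, getReader() routes through the cached detection result and silently falls back to UTF-8 when detection fails, so callers always receive a usable Reader; the Charset-taking overload becomes private. A hypothetical caller (the file variable and the surrounding module are assumed):

    import java.io.BufferedReader;
    import java.io.IOException;

    // Detection, UTF-8 fallback, and stream setup all hide behind getReader().
    TextFileExtractor extractor = new TextFileExtractor(file);
    try (BufferedReader reader = new BufferedReader(extractor.getReader())) {
        String line;
        while ((line = reader.readLine()) != null) {
            // index or display the decoded line
        }
    } catch (IOException ex) {
        // reading from the case content failed
    }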
@@ -94,33 +113,60 @@ public final class TextFileExtractor {
         return file.getMIMEType().equals("text/plain");
     }
 
-    public static Charset getEncoding(Content content) {
-        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) {
-            // Tika first
+    /**
+     * Returns the encoding of the file.
+     *
+     * @return Detected encoding or UNKNOWN_CHARSET.
+     */
+    public Charset getEncoding() {
+        if (encoding != null) {
+            return encoding;
+        }
+
+        // Encoding detection is hard. We use several libraries since the data passed in is often messy.
+        // First try CharsetDetector (from Tika / ICU4J).
+        // It is a rule-based detection approach.
+        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
             CharsetDetector detector = new CharsetDetector();
             detector.setText(stream);
             CharsetMatch tikaResult = detector.detect();
-            if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) {
+            if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
                 try {
-                    return Charset.forName(tikaResult.getName());
-                } catch (UnsupportedCharsetException ignored) {
+                    encoding = Charset.forName(tikaResult.getName());
+                    return encoding;
+                } catch (UnsupportedCharsetException ex) {
+                    logger.log(Level.WARNING, String.format("Error converting CharsetDetector result for %s (objID=%d)", file.getName(), file.getId()), ex);
                 }
             }
+        } catch (IOException ex) {
+            logger.log(Level.WARNING, String.format("Error setting CharsetDetector stream for %s (objID=%d)", file.getName(), file.getId()), ex);
+        }
 
-            // Decodetect if Tika fails or falls below confidence threshold
+        // If that did not work, then use DecoDetect, which is statistical
+        // We needed this for some Japanese text files that were incorrectly detected by CharsetDetector (with low confidence)
+        // This will not always work with messy data that combines some binary and some ASCII.
+        try {
             int maxBytes = 100000;
-            int numBytes = Math.min(stream.available(), maxBytes);
+            int numBytes = maxBytes;
+            if (file.getSize() < maxBytes) {
+                numBytes = (int) file.getSize();
+            }
+
             byte[] targetArray = new byte[numBytes];
-            stream.read(targetArray);
+            file.read(targetArray, 0, numBytes);
             List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
             if (!results.isEmpty()) {
                 DecodetectResult topResult = results.get(0);
                 if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
-                    return topResult.getEncoding();
+                    encoding = topResult.getEncoding();
+                    return encoding;
                 }
            }
-        } catch (IOException ignored) {
+        } catch (TskCoreException ex) {
+            logger.log(Level.WARNING, String.format("Error reading content from %s (objID=%d)", file.getName(), file.getId()), ex);
         }
-        return UNKNOWN_CHARSET;
+
+        encoding = UNKNOWN_CHARSET;
+        return encoding;
     }
 }
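The rewritten getEncoding() is a two-stage cascade with memoization: try the rule-based CharsetDetector first and trust it only at or above the raised confidence threshold of 41, fall back to the statistical Decodetect at confidence 0.4 or better, and cache whatever was decided (including UNKNOWN_CHARSET) so the file is only inspected once. A standalone demo of the stage-one gate, runnable against Tika's bundled detector (the sample bytes and class name are made up; 41 is the threshold chosen in this commit):

    import java.nio.charset.Charset;
    import java.nio.charset.StandardCharsets;
    import org.apache.tika.parser.txt.CharsetDetector;
    import org.apache.tika.parser.txt.CharsetMatch;

    public class CascadeDemo {
        public static void main(String[] args) {
            // Mostly ASCII with a little binary, the kind of messy input
            // that motivated raising the threshold from 35 to 41.
            byte[] sample = "plain ASCII text with some binary \u0000\u0001"
                    .getBytes(StandardCharsets.UTF_8);
            CharsetDetector detector = new CharsetDetector();
            detector.setText(sample);
            CharsetMatch match = detector.detect();
            if (match != null && match.getConfidence() >= 41) {
                System.out.println("Stage 1 accepts: " + Charset.forName(match.getName()));
            } else {
                // Below threshold: the committed code falls back to Decodetect here.
                System.out.println("Stage 1 rejected (confidence "
                        + (match == null ? "n/a" : match.getConfidence())
                        + "), would try Decodetect");
            }
        }
    }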