Merge pull request #5536 from sleuthkit/release-4.14.0

Merge release-4.14.0 branch onto develop branch
Richard Cordovano 2019-12-23 12:19:07 -05:00 committed by GitHub
commit 335b0791cd
3 changed files with 75 additions and 29 deletions


@@ -254,10 +254,10 @@ public class FileTypeDetector {
         } else {
             /*
              * If the file was marked as an octet stream and the extension is .txt, try to detect a text
-             * encoding with Decodetect.
+             * encoding
              */
             if (file.getNameExtension().equals("txt")) {
-                Charset detectedCharset = TextFileExtractor.getEncoding(file);
+                Charset detectedCharset = new TextFileExtractor(file).getEncoding();
                 if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) {
                     mimeType = MimeTypes.PLAIN_TEXT;
                 }
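The FileTypeDetector change above uses encoding detection to decide whether a file typed as an octet stream but named *.txt is really plain text. Below is a minimal standalone sketch of that idea using Tika's CharsetDetector directly; the refineMimeType helper, the hard-coded MIME strings, and the threshold of 41 (borrowed from the constant this commit introduces in TextFileExtractor) are illustrative, not Autopsy code.

    import java.nio.file.Files;
    import java.nio.file.Paths;
    import org.apache.tika.parser.txt.CharsetDetector;
    import org.apache.tika.parser.txt.CharsetMatch;

    public class OctetStreamRefiner {

        // Hypothetical helper: upgrade an octet-stream ".txt" file to text/plain
        // when a charset can be detected with reasonable confidence.
        static String refineMimeType(byte[] data, String mimeType, String extension) {
            if (!"application/octet-stream".equals(mimeType) || !"txt".equals(extension)) {
                return mimeType; // only second-guess the ambiguous case
            }
            CharsetDetector detector = new CharsetDetector();
            detector.setText(data);
            CharsetMatch match = detector.detect(); // best guess, may be null
            if (match != null && match.getConfidence() >= 41) {
                return "text/plain";
            }
            return mimeType;
        }

        public static void main(String[] args) throws Exception {
            byte[] data = Files.readAllBytes(Paths.get(args[0]));
            System.out.println(refineMimeType(data, "application/octet-stream", "txt"));
        }
    }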


@@ -85,14 +85,14 @@ public class TextExtractorFactory {
      * @param content AbstractFile content
      * @param context Lookup containing extractor configurations
      *
-     * @return
+     * @return List of all extractors in priority order. Not all will support the passed in content. @@@ PERHAPS ONLY SUPPORTED SHOULD BE RETURNED
      */
     private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
         List<TextExtractor> fileExtractors = Arrays.asList(
                 new TextFileExtractor(content),
                 new HtmlTextExtractor(content),
                 new SqliteTextExtractor(content),
-                new TikaTextExtractor(content));
+                new TikaTextExtractor(content)); /// This should go last to ensure the more specific ones are picked first.
         fileExtractors.forEach((fileExtractor) -> {
             fileExtractor.setExtractionSettings(context);
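The "/// This should go last" note above matters because callers walk this list in order and use the first extractor that reports it supports the content, so the specific extractors must come before the catch-all Tika extractor. A minimal sketch of that first-match selection pattern; the one-method TextExtractor interface here is a hypothetical stand-in for Autopsy's real interface, which also carries settings and a reader.

    import java.util.Arrays;
    import java.util.List;

    public class ExtractorSelection {

        // Hypothetical stand-in for Autopsy's TextExtractor interface.
        interface TextExtractor {
            boolean isSupported();
        }

        // First-match selection: list order encodes priority, so a catch-all
        // extractor placed last only wins when nothing specific matched.
        static TextExtractor pickExtractor(List<TextExtractor> prioritized) {
            return prioritized.stream()
                    .filter(TextExtractor::isSupported)
                    .findFirst()
                    .orElseThrow(() -> new IllegalStateException("no supported extractor"));
        }

        public static void main(String[] args) {
            TextExtractor specific = () -> false; // e.g. the SQLite extractor declining a text file
            TextExtractor catchAll = () -> true;  // e.g. Tika, which accepts almost anything
            System.out.println(pickExtractor(Arrays.asList(specific, catchAll)) == catchAll); // true
        }
    }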


@@ -31,17 +31,24 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.StandardCharsets;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.List;
+import java.util.logging.Level;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
+import org.sleuthkit.datamodel.TskCoreException;
 
 /**
- * Extract text from text files
+ * A TextExtractor that is used to extract text from a text file.
  */
 public final class TextFileExtractor implements TextExtractor {
 
-    public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
+    /*
+     * The char set returned if a text file extractor fails to detect the
+     * encoding of the file from which it is extracting text.
+     */
+    public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
         @Override
         public boolean contains(Charset cs) {
             return false;
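This hunk cuts the anonymous class off after contains(); java.nio.charset.Charset has exactly three abstract methods, so a complete sentinel looks like the sketch below. The newDecoder() and newEncoder() bodies are assumptions (they fall outside the diff); throwing from them is safe because callers are expected to compare against the sentinel before decoding anything.

    import java.nio.charset.Charset;
    import java.nio.charset.CharsetDecoder;
    import java.nio.charset.CharsetEncoder;

    public class SentinelCharset {

        // Sentinel charset: a singleton that can be compared by reference and
        // never participates in real encoding or decoding.
        public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
            @Override
            public boolean contains(Charset cs) {
                return false;
            }

            @Override
            public CharsetDecoder newDecoder() {
                throw new UnsupportedOperationException("unknown charset cannot decode");
            }

            @Override
            public CharsetEncoder newEncoder() {
                throw new UnsupportedOperationException("unknown charset cannot encode");
            }
        };

        public static void main(String[] args) {
            Charset detected = UNKNOWN_CHARSET; // pretend detection failed
            System.out.println(detected == UNKNOWN_CHARSET); // true; reference comparison suffices
        }
    }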
@@ -59,33 +66,45 @@ public final class TextFileExtractor implements TextExtractor {
     };
 
     // This value will be used as a threshold for determining which encoding
-    // detection library to use. If Tika's own confidence is at least
-    // MIN_MATCH_CONFIDENCE, Tika's result will be used for decoding.
+    // detection library to use. If CharsetDetector's own confidence is at least
+    // MIN_CHARSETDETECT_MATCH_CONFIDENCE, CharsetDetector's result will be used
+    // for decoding.
     // Otherwise, Decodetect will be used.
-    static final private int MIN_TIKA_MATCH_CONFIDENCE = 35;
+    //
+    // Note: We initially used a confidence of 35, but it was causing some
+    // Chrome Cache files to get flagged as UTF-16 with confidence 40.
+    // These files had a small amount of binary data and then ASCII.
+    static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
 
     // This value determines whether we will consider Decodetect's top-scoring
-    // result a legitimate match or if we will disregard its findings
+    // result a legitimate match or if we will disregard its findings.
     //
-    // Possible values are 0 to 1, inclusive
+    // Possible values are 0 to 1, inclusive.
     static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;
 
+    private static final Logger logger = Logger.getLogger(TextFileExtractor.class.getName());
     private final AbstractFile file;
+    private Charset encoding = null;
 
+    /**
+     * Constructs a TextExtractor that is used to extract text from a text
+     * file.
+     *
+     * @param file The file.
+     */
     public TextFileExtractor(AbstractFile file) {
         this.file = file;
     }
 
     @Override
     public Reader getReader() {
-        Charset encoding = getEncoding(file);
-        if (encoding.equals(UNKNOWN_CHARSET)) {
-            encoding = StandardCharsets.UTF_8;
+        Charset enc = getEncoding();
+        if (enc.equals(UNKNOWN_CHARSET)) {
+            enc = StandardCharsets.UTF_8;
         }
-        return getReader(encoding);
+        return getReader(enc);
     }
 
-    public Reader getReader(Charset encoding) {
+    private Reader getReader(Charset encoding) {
         return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
     }
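The Note in this hunk explains the jump from 35 to 41: buffers holding a little binary data followed by ASCII were matching UTF-16 at confidence 40. CharsetDetector's detectAll() makes that kind of tuning visible by listing every candidate with its confidence. A small diagnostic sketch; the sample bytes imitating the Chrome Cache case are made up.

    import java.io.ByteArrayOutputStream;
    import java.nio.charset.StandardCharsets;
    import org.apache.tika.parser.txt.CharsetDetector;
    import org.apache.tika.parser.txt.CharsetMatch;

    public class ConfidenceProbe {
        public static void main(String[] args) throws Exception {
            // A little binary data, then plain ASCII, as described in the Note.
            ByteArrayOutputStream buf = new ByteArrayOutputStream();
            buf.write(new byte[]{0x00, 0x01, (byte) 0xFF, (byte) 0xFE});
            buf.write("GET /index.html HTTP/1.1 plain ASCII payload".getBytes(StandardCharsets.US_ASCII));

            CharsetDetector detector = new CharsetDetector();
            detector.setText(buf.toByteArray());
            // detectAll() returns candidates in descending confidence order
            // (confidence is an int from 0 to 100).
            for (CharsetMatch match : detector.detectAll()) {
                System.out.printf("%-12s confidence=%d%n", match.getName(), match.getConfidence());
            }
        }
    }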
@@ -94,33 +113,60 @@ public final class TextFileExtractor implements TextExtractor {
         return file.getMIMEType().equals("text/plain");
     }
 
-    public static Charset getEncoding(Content content) {
-        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) {
-            // Tika first
+    /**
+     * Returns the encoding of the file.
+     *
+     * @return Detected encoding or UNKNOWN_CHARSET.
+     */
+    public Charset getEncoding() {
+        if (encoding != null) {
+            return encoding;
+        }
+
+        // Encoding detection is hard. We use several libraries since the data passed in is often messy.
+        // First try CharsetDetector (from Tika / ICU4J).
+        // It is a rule-based detection approach.
+        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
             CharsetDetector detector = new CharsetDetector();
             detector.setText(stream);
             CharsetMatch tikaResult = detector.detect();
-            if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) {
+            if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
                 try {
-                    return Charset.forName(tikaResult.getName());
-                } catch (UnsupportedCharsetException ignored) {
+                    encoding = Charset.forName(tikaResult.getName());
+                    return encoding;
+                } catch (UnsupportedCharsetException ex) {
+                    logger.log(Level.WARNING, String.format("Error converting CharsetDetector result for %s (objID=%d)", file.getName(), file.getId()), ex);
                 }
             }
+        } catch (IOException ex) {
+            logger.log(Level.WARNING, String.format("Error setting CharsetDetector stream for %s (objID=%d)", file.getName(), file.getId()), ex);
+        }
 
-            // Decodetect if Tika fails or falls below confidence threshold
-            try {
-                int maxBytes = 100000;
-                int numBytes = Math.min(stream.available(), maxBytes);
-                byte[] targetArray = new byte[numBytes];
-                stream.read(targetArray);
-                List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
-                if (!results.isEmpty()) {
-                    DecodetectResult topResult = results.get(0);
-                    if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
-                        return topResult.getEncoding();
-                    }
+        // If that did not work, then use Decodetect, which is statistical.
+        // We needed this for some Japanese text files that were incorrectly detected by CharsetDetector (with low confidence).
+        // This will not always work with messy data that combines some binary and some ASCII.
+        try {
+            int maxBytes = 100000;
+            int numBytes = maxBytes;
+            if (file.getSize() < maxBytes) {
+                numBytes = (int) file.getSize();
+            }
+
+            byte[] targetArray = new byte[numBytes];
+            file.read(targetArray, 0, numBytes);
+            List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
+            if (!results.isEmpty()) {
+                DecodetectResult topResult = results.get(0);
+                if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
+                    encoding = topResult.getEncoding();
+                    return encoding;
                 }
-            } catch (IOException ignored) {
             }
+        } catch (TskCoreException ex) {
+            logger.log(Level.WARNING, String.format("Error reading content from %s (objID=%d)", file.getName(), file.getId()), ex);
         }
-        return UNKNOWN_CHARSET;
+
+        encoding = UNKNOWN_CHARSET;
+        return encoding;
     }
 }
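Taken together, getEncoding() is a two-stage strategy: rule-based CharsetDetector first, statistical Decodetect as the fallback, sentinel on failure. A condensed sketch of the same strategy over a plain byte array; the com.ethteck.decodetect import path is an assumption based on the library Autopsy bundles, the thresholds are copied from the constants above, and null stands in for the UNKNOWN_CHARSET sentinel.

    import java.nio.charset.Charset;
    import java.nio.charset.UnsupportedCharsetException;
    import java.util.List;
    import com.ethteck.decodetect.core.Decodetect;
    import com.ethteck.decodetect.core.DecodetectResult;
    import org.apache.tika.parser.txt.CharsetDetector;
    import org.apache.tika.parser.txt.CharsetMatch;

    public class TwoStageDetector {

        static final int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
        static final double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;

        // Returns the detected charset, or null when neither detector is
        // confident enough (Autopsy returns a sentinel Charset instead).
        static Charset detect(byte[] data) {
            // Stage 1: rule-based detection.
            CharsetDetector detector = new CharsetDetector();
            detector.setText(data);
            CharsetMatch match = detector.detect();
            if (match != null && match.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
                try {
                    return Charset.forName(match.getName());
                } catch (UnsupportedCharsetException ex) {
                    // Fall through to the statistical detector.
                }
            }
            // Stage 2: statistical detection.
            List<DecodetectResult> results = Decodetect.DECODETECT.getResults(data);
            if (!results.isEmpty() && results.get(0).getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
                return results.get(0).getEncoding();
            }
            return null;
        }
    }

The caching done through the encoding field in the diff is orthogonal to the detection logic and is omitted here.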