diff --git a/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java b/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java index 46b60d9b1e..477788df21 100644 --- a/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java +++ b/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java @@ -254,10 +254,10 @@ public class FileTypeDetector { } else { /* * If the file was marked as an octet stream and the extension is .txt, try to detect a text - * encoding with Decodetect. + * encoding */ if (file.getNameExtension().equals("txt")) { - Charset detectedCharset = TextFileExtractor.getEncoding(file); + Charset detectedCharset = new TextFileExtractor(file).getEncoding(); if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) { mimeType = MimeTypes.PLAIN_TEXT; } diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java index ff0ba51dd1..9bc5af74bd 100755 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java @@ -85,14 +85,14 @@ public class TextExtractorFactory { * @param content AbstractFile content * @param context Lookup containing extractor configurations * - * @return + * @return List of all extractors in priority order. Not all will support the passed in content. @@@ PERHAPS ONLY SUPPORTED SHOULD BE RETURNED */ private static List getFileExtractors(AbstractFile content, Lookup context) { List fileExtractors = Arrays.asList( new TextFileExtractor(content), new HtmlTextExtractor(content), new SqliteTextExtractor(content), - new TikaTextExtractor(content)); + new TikaTextExtractor(content)); /// This should go last to ensure the more specific ones are picked first. 
fileExtractors.forEach((fileExtractor) -> { fileExtractor.setExtractionSettings(context); diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java index 4577049b33..e6c52fb19c 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java @@ -31,17 +31,24 @@ import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; import java.nio.charset.UnsupportedCharsetException; import java.util.List; +import java.util.logging.Level; import org.apache.tika.parser.txt.CharsetDetector; import org.apache.tika.parser.txt.CharsetMatch; +import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.datamodel.AbstractFile; -import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.ReadContentInputStream; +import org.sleuthkit.datamodel.TskCoreException; /** - * Extract text from text files + * A TextExtractor that is used to extract text from a text file. */ public final class TextFileExtractor implements TextExtractor { - public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) { + + /* + * The char set returned if a text file extractor fails to detect the + * encoding of the file from which it is extracting text. + */ + public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) { @Override public boolean contains(Charset cs) { return false; @@ -59,33 +66,45 @@ public final class TextFileExtractor implements TextExtractor { }; // This value will be used as a threshold for determining which encoding - // detection library to use. If Tika's own confidence is at least - // MIN_MATCH_CONFIDENCE, Tika's result will be used for decoding. + // detection library to use. If CharsetDetector's own confidence is at least + // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding. // Otherwise, Decodetect will be used. 
- static final private int MIN_TIKA_MATCH_CONFIDENCE = 35; + // + // Note: We initially used a confidence of 35, but it was causing some + // Chrome Cache files to get flagged as UTF-16 with confidence 40. + // These files had a small amount of binary data and then ASCII. + static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41; // This value determines whether we will consider Decodetect's top-scoring - // result a legitimate match or if we will disregard its findings + // result a legitimate match or if we will disregard its findings. // - // Possible values are 0 to 1, inclusive + // Possible values are 0 to 1, inclusive. static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4; + private static final Logger logger = Logger.getLogger(TextFileExtractor.class.getName()); private final AbstractFile file; + private Charset encoding = null; + + /** + * Constructs a TextExtractor that is used to extract text from a text file. + * + * @param file The file. + */ public TextFileExtractor(AbstractFile file) { this.file = file; } @Override public Reader getReader() { - Charset encoding = getEncoding(file); - if (encoding.equals(UNKNOWN_CHARSET)) { - encoding = StandardCharsets.UTF_8; + Charset enc = getEncoding(); + if (enc.equals(UNKNOWN_CHARSET)) { + enc = StandardCharsets.UTF_8; } - return getReader(encoding); + return getReader(enc); } - public Reader getReader(Charset encoding) { + private Reader getReader(Charset encoding) { return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding); } @@ -94,33 +113,60 @@ public final class TextFileExtractor implements TextExtractor { return file.getMIMEType().equals("text/plain"); } - public static Charset getEncoding(Content content) { - try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) { - // Tika first + /** + * Returns the encoding of the file. + * + * @return Detected encoding or UNKNOWN_CHARSET. 
+ */ + public Charset getEncoding() { + if (encoding != null) { + return encoding; + } + + // Encoding detection is hard. We use several libraries since the data passed in is often messy. + // First try CharsetDetector (from Tika / ICU4J). + // It is a rule-based detection approach. + try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) { CharsetDetector detector = new CharsetDetector(); detector.setText(stream); CharsetMatch tikaResult = detector.detect(); - if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) { + if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) { try { - return Charset.forName(tikaResult.getName()); - } catch (UnsupportedCharsetException ignored) { + encoding = Charset.forName(tikaResult.getName()); + return encoding; + } catch (UnsupportedCharsetException ex) { + logger.log(Level.WARNING, String.format("Error converting CharsetDetector result for %s (objID=%d)", file.getName(), file.getId()), ex); } } + } catch (IOException ex) { + logger.log(Level.WARNING, String.format("Error setting CharsetDetector stream for %s (objID=%d)", file.getName(), file.getId()), ex); + } - // Decodetect if Tika fails or falls below confidence threshold + // If that did not work, then use DecoDetect, which is statistical + // We needed this for some Japanese text files that were incorrectly detected by CharsetDetector (with low confidence) + // This will not always work with messy data that combines some binary and some ASCII. 
+ try { int maxBytes = 100000; - int numBytes = Math.min(stream.available(), maxBytes); + int numBytes = maxBytes; + if (file.getSize() < maxBytes) { + numBytes = (int) file.getSize(); + } + byte[] targetArray = new byte[numBytes]; - stream.read(targetArray); + file.read(targetArray, 0, numBytes); List results = Decodetect.DECODETECT.getResults(targetArray); if (!results.isEmpty()) { DecodetectResult topResult = results.get(0); if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) { - return topResult.getEncoding(); + encoding = topResult.getEncoding(); + return encoding; } } - } catch (IOException ignored) { + } catch (TskCoreException ex) { + logger.log(Level.WARNING, String.format("Error reading content from %s (objID=%d)", file.getName(), file.getId()), ex); } - return UNKNOWN_CHARSET; + + encoding = UNKNOWN_CHARSET; + return encoding; } }