From 684aeaffa4b1bea21b1c8bbbb182922639c44961 Mon Sep 17 00:00:00 2001 From: Brian Carrier Date: Tue, 17 Dec 2019 17:03:40 -0500 Subject: [PATCH 1/4] Added comment --- .../autopsy/textextractors/TextExtractorFactory.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java index ff0ba51dd1..9bc5af74bd 100755 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java @@ -85,14 +85,14 @@ public class TextExtractorFactory { * @param content AbstractFile content * @param context Lookup containing extractor configurations * - * @return + * @return List of all extractors in priority order. Not all will support the passed in content. @@@ PERHAPS ONLY SUPPORTED SHOULD BE RETURNED */ private static List getFileExtractors(AbstractFile content, Lookup context) { List fileExtractors = Arrays.asList( new TextFileExtractor(content), new HtmlTextExtractor(content), new SqliteTextExtractor(content), - new TikaTextExtractor(content)); + new TikaTextExtractor(content)); /// This should go last to ensure the more specific ones are picked first. fileExtractors.forEach((fileExtractor) -> { fileExtractor.setExtractionSettings(context); From 69e8783869e41f90abd9cfe74a9a29feb5b93494 Mon Sep 17 00:00:00 2001 From: Brian Carrier Date: Thu, 19 Dec 2019 17:15:21 -0500 Subject: [PATCH 2/4] change encoding threshold to better deal with binary/ASCII cache files - JIRA 5894. Refactor --- .../modules/filetypeid/FileTypeDetector.java | 4 +- .../textextractors/TextFileExtractor.java | 74 ++++++++++++++----- 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java b/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java index 46b60d9b1e..477788df21 100644 --- a/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java +++ b/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java @@ -254,10 +254,10 @@ public class FileTypeDetector { } else { /* * If the file was marked as an octet stream and the extension is .txt, try to detect a text - * encoding with Decodetect. + * encoding */ if (file.getNameExtension().equals("txt")) { - Charset detectedCharset = TextFileExtractor.getEncoding(file); + Charset detectedCharset = new TextFileExtractor(file).getEncoding(); if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) { mimeType = MimeTypes.PLAIN_TEXT; } diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java index 3efb6b1aed..1656112c68 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java @@ -31,11 +31,14 @@ import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; import java.nio.charset.UnsupportedCharsetException; import java.util.List; +import java.util.logging.Level; import org.apache.tika.parser.txt.CharsetDetector; import org.apache.tika.parser.txt.CharsetMatch; +import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.ReadContentInputStream; +import org.sleuthkit.datamodel.TskCoreException; /** * Extract text from text files @@ -59,10 +62,12 @@ public final class TextFileExtractor implements TextExtractor { }; // This value will be used as a threshold for determining which encoding - // detection library to use. If Tika's own confidence is at least - // MIN_MATCH_CONFIDENCE, Tika's result will be used for decoding. + // detection library to use. If CharsetDetector's own confidence is at least + // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding. // Otherwise, Decodetect will be used. - static final private int MIN_TIKA_MATCH_CONFIDENCE = 35; + // - We had 35, but it was causing some Chrome Cache files to get flagged as UTF-16 with confidence 40. + // They had a small amount of binary data and then ASCII. + static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41; // This value determines whether we will consider Decodetect's top-scoring // result a legitimate match or if we will disregard its findings @@ -70,7 +75,10 @@ public final class TextFileExtractor implements TextExtractor { // Possible values are 0 to 1, inclusive static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4; + private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName()); private final AbstractFile file; + + private Charset encoding = null; public TextFileExtractor(AbstractFile file) { this.file = file; @@ -78,14 +86,14 @@ public final class TextFileExtractor implements TextExtractor { @Override public Reader getReader() { - Charset encoding = getEncoding(file); - if (encoding.equals(UNKNOWN_CHARSET)) { - encoding = StandardCharsets.UTF_8; + Charset enc = getEncoding(); + if (enc.equals(UNKNOWN_CHARSET)) { + enc = StandardCharsets.UTF_8; } - return getReader(encoding); + return getReader(enc); } - public Reader getReader(Charset encoding) { + private Reader getReader(Charset encoding) { return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding); } @@ -103,33 +111,59 @@ public final class TextFileExtractor implements TextExtractor { } } - public static Charset getEncoding(Content content) { - try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) { - // Tika first + /** + * Return the encoding of the file + * @return Detected encoding or UNKNOWN_CHARSET + */ + public Charset getEncoding() { + if (encoding != null) + return encoding; + + // Encoding detection is hard. We use several libraries since the data passed in is often messy. + + // First try CharsetDetector (from Tika / ICU4J) + // It is a rule-baesd detection approach + try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) { CharsetDetector detector = new CharsetDetector(); detector.setText(stream); CharsetMatch tikaResult = detector.detect(); - if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) { + if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) { try { - return Charset.forName(tikaResult.getName()); - } catch (UnsupportedCharsetException ignored) { + encoding = Charset.forName(tikaResult.getName()); + return encoding; + } catch (UnsupportedCharsetException ex) { + logger.log(Level.WARNING, "Error converting CharsetDetector Result", ex); } } + } catch (IOException ignored) { + // IGNORE READ ERRORS HERE - Assume they were logged elsewhere + } - // Decodetect if Tika fails or falls below confidence threshold + // If that did not work, then use DecoDetect, which is stastical + // We needed this for some Japanese text files that were incorrectly detected by CharsetDetector (with low confidence) + // This will not always work with messy data that combines some binary and some ASCII. + try { int maxBytes = 100000; - int numBytes = Math.min(stream.available(), maxBytes); + int numBytes = maxBytes; + if (file.getSize() < maxBytes) { + numBytes = (int) file.getSize(); + } + byte[] targetArray = new byte[numBytes]; - stream.read(targetArray); + file.read(targetArray, 0, numBytes); List results = Decodetect.DECODETECT.getResults(targetArray); if (!results.isEmpty()) { DecodetectResult topResult = results.get(0); if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) { - return topResult.getEncoding(); + encoding = topResult.getEncoding(); + return encoding; } } - } catch (IOException ignored) { + } catch (TskCoreException ex) { + // IGNORE READ ERRORS HERE - Assume they were logged elsewhere } - return UNKNOWN_CHARSET; + + encoding = UNKNOWN_CHARSET; + return encoding; } } From f9445ff7d871eee4f5593f0587e05a09df400db6 Mon Sep 17 00:00:00 2001 From: Richard Cordovano Date: Mon, 23 Dec 2019 12:08:18 -0500 Subject: [PATCH 3/4] Clean up in TextFileExtractor --- .../textextractors/TextFileExtractor.java | 53 +++++++++++-------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java index 9ad5098524..aa99672b1d 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java @@ -36,15 +36,19 @@ import org.apache.tika.parser.txt.CharsetDetector; import org.apache.tika.parser.txt.CharsetMatch; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.datamodel.AbstractFile; -import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.ReadContentInputStream; import org.sleuthkit.datamodel.TskCoreException; /** - * Extract text from text files + * A TextExtractor that is used to extract text from a text file. */ public final class TextFileExtractor implements TextExtractor { - public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) { + + /* + * The char set returned if a text file extractor fails to detect the + * encoding of the file from which it is extracting text. + */ + public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) { @Override public boolean contains(Charset cs) { return false; @@ -65,21 +69,27 @@ public final class TextFileExtractor implements TextExtractor { // detection library to use. If CharsetDetector's own confidence is at least // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding. // Otherwise, Decodetect will be used. - // - We had 35, but it was causing some Chrome Cache files to get flagged as UTF-16 with confidence 40. - // They had a small amount of binary data and then ASCII. + // NOte: We initially used a confidence of 35, but it was causing some + // Chrome Cache files to get flagged as UTF-16 with confidence 40. + // These files had a small amount of binary data and then ASCII. static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41; // This value determines whether we will consider Decodetect's top-scoring - // result a legitimate match or if we will disregard its findings + // result a legitimate match or if we will disregard its findings. // - // Possible values are 0 to 1, inclusive + // Possible values are 0 to 1, inclusive. static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4; private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName()); private final AbstractFile file; - + private Charset encoding = null; + /** + * Constructs a TextExtractor that is used to extract text from a text file. + * + * @param file The file. + */ public TextFileExtractor(AbstractFile file) { this.file = file; } @@ -103,17 +113,18 @@ public final class TextFileExtractor implements TextExtractor { } /** - * Return the encoding of the file - * @return Detected encoding or UNKNOWN_CHARSET + * Returns the encoding of the file. + * + * @return Detected encoding or UNKNOWN_CHARSET. */ public Charset getEncoding() { - if (encoding != null) + if (encoding != null) { return encoding; - + } + // Encoding detection is hard. We use several libraries since the data passed in is often messy. - - // First try CharsetDetector (from Tika / ICU4J) - // It is a rule-baesd detection approach + // First try CharsetDetector (from Tika / ICU4J). + // It is a rule-baesd detection approach. try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) { CharsetDetector detector = new CharsetDetector(); detector.setText(stream); @@ -123,11 +134,11 @@ public final class TextFileExtractor implements TextExtractor { encoding = Charset.forName(tikaResult.getName()); return encoding; } catch (UnsupportedCharsetException ex) { - logger.log(Level.WARNING, "Error converting CharsetDetector Result", ex); + logger.log(Level.WARNING, String.format("Error converting CharsetDetector result for %s (objID=%d)", file.getName(), file.getId()), ex); } } - } catch (IOException ignored) { - // IGNORE READ ERRORS HERE - Assume they were logged elsewhere + } catch (IOException ex) { + logger.log(Level.WARNING, String.format("Error setting CharsetDetector stream for %s (objID=%d)", file.getName(), file.getId()), ex); } // If that did not work, then use DecoDetect, which is stastical @@ -139,7 +150,7 @@ public final class TextFileExtractor implements TextExtractor { if (file.getSize() < maxBytes) { numBytes = (int) file.getSize(); } - + byte[] targetArray = new byte[numBytes]; file.read(targetArray, 0, numBytes); List results = Decodetect.DECODETECT.getResults(targetArray); @@ -151,9 +162,9 @@ public final class TextFileExtractor implements TextExtractor { } } } catch (TskCoreException ex) { - // IGNORE READ ERRORS HERE - Assume they were logged elsewhere + logger.log(Level.WARNING, String.format("Error reading content from %s (objID=%d)", file.getName(), file.getId()), ex); } - + encoding = UNKNOWN_CHARSET; return encoding; } From ad7e0ceaff0d6c796876dfb2ce314a7d5f37a3fa Mon Sep 17 00:00:00 2001 From: Richard Cordovano Date: Mon, 23 Dec 2019 12:15:45 -0500 Subject: [PATCH 4/4] Clean up in TextFileExtractor --- .../sleuthkit/autopsy/textextractors/TextFileExtractor.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java index aa99672b1d..e6c52fb19c 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java @@ -69,7 +69,8 @@ public final class TextFileExtractor implements TextExtractor { // detection library to use. If CharsetDetector's own confidence is at least // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding. // Otherwise, Decodetect will be used. - // NOte: We initially used a confidence of 35, but it was causing some + // + // Note: We initially used a confidence of 35, but it was causing some // Chrome Cache files to get flagged as UTF-16 with confidence 40. // These files had a small amount of binary data and then ASCII. static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41; @@ -124,7 +125,7 @@ public final class TextFileExtractor implements TextExtractor { // Encoding detection is hard. We use several libraries since the data passed in is often messy. // First try CharsetDetector (from Tika / ICU4J). - // It is a rule-baesd detection approach. + // It is a rule-based detection approach. try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) { CharsetDetector detector = new CharsetDetector(); detector.setText(stream);