diff --git a/Core/ivy.xml b/Core/ivy.xml
index 2958230052..5185b58c13 100644
--- a/Core/ivy.xml
+++ b/Core/ivy.xml
@@ -42,6 +42,9 @@
+
+
+
@@ -54,6 +57,6 @@
-
+
diff --git a/Core/nbproject/project.properties b/Core/nbproject/project.properties
index aa5e50279c..3c994d58ac 100644
--- a/Core/nbproject/project.properties
+++ b/Core/nbproject/project.properties
@@ -18,6 +18,7 @@ file.reference.commons-lang3-3.8.1.jar=release\\modules\\ext\\commons-lang3-3.8.1.jar
 file.reference.commons-pool2-2.4.2.jar=release/modules/ext/commons-pool2-2.4.2.jar
 file.reference.cxf-rt-rs-client-3.3.0.jar=release\\modules\\ext\\cxf-rt-rs-client-3.3.0.jar
 file.reference.dec-0.1.2.jar=release\\modules\\ext\\dec-0.1.2.jar
+file.reference.decodetect-core-0.3.jar=release\\modules\\ext\\decodetect-core-0.3.jar
 file.reference.fontbox-2.0.13.jar=release\\modules\\ext\\fontbox-2.0.13.jar
 file.reference.geoapi-3.0.1.jar=release\\modules\\ext\\geoapi-3.0.1.jar
 file.reference.grib-4.5.5.jar=release\\modules\\ext\\grib-4.5.5.jar
@@ -50,6 +51,7 @@ file.reference.jsoup-1.11.3.jar=release\\modules\\ext\\jsoup-1.11.3.jar
 file.reference.jul-to-slf4j-1.7.25.jar=release\\modules\\ext\\jul-to-slf4j-1.7.25.jar
 file.reference.juniversalchardet-1.0.3.jar=release\\modules\\ext\\juniversalchardet-1.0.3.jar
 file.reference.junrar-2.0.0.jar=release\\modules\\ext\\junrar-2.0.0.jar
+file.reference.jutf7-1.0.0.jar=release\\modules\\ext\\jutf7-1.0.0.jar
 file.reference.jxmapviewer2-2.4.jar=release/modules/ext/jxmapviewer2-2.4.jar
 file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
 file.reference.libphonenumber-3.5.jar=release/modules/ext/libphonenumber-3.5.jar
diff --git a/Core/nbproject/project.xml b/Core/nbproject/project.xml
index 7fe269c0fb..83aefea7c5 100644
--- a/Core/nbproject/project.xml
+++ b/Core/nbproject/project.xml
@@ -794,6 +794,14 @@
             <class-path-extension>
                 <runtime-relative-path>ext/vorbis-java-tika-0.8.jar</runtime-relative-path>
                 <binary-origin>release\modules\ext\vorbis-java-tika-0.8.jar</binary-origin>
             </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/decodetect-core-0.3.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/decodetect-core-0.3.jar</binary-origin>
+            </class-path-extension>
+            <class-path-extension>
+                <runtime-relative-path>ext/jutf7-1.0.0.jar</runtime-relative-path>
+                <binary-origin>release/modules/ext/jutf7-1.0.0.jar</binary-origin>
+            </class-path-extension>
diff --git a/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java b/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java
index 0c885472de..46b60d9b1e 100644
--- a/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java
@@ -18,6 +18,7 @@
  */
 package org.sleuthkit.autopsy.modules.filetypeid;
 
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -29,6 +30,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.mime.MimeTypes;
 import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 import org.sleuthkit.datamodel.TskCoreException;
@@ -249,6 +251,17 @@
                 mimeType = tikaType.replace("tika-", ""); //NON-NLS
                 mimeType = removeOptionalParameter(mimeType);
             }
+        } else {
+            /*
+             * If the file was marked as an octet stream and the extension is .txt, try to detect a text
+             * encoding with Decodetect.
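+             * (TextFileExtractor.getEncoding() tries Tika's CharsetDetector
+             * first and falls back to Decodetect only when Tika's confidence
+             * is low.)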
+             */
+            if (file.getNameExtension().equals("txt")) {
+                Charset detectedCharset = TextFileExtractor.getEncoding(file);
+                if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) {
+                    mimeType = MimeTypes.PLAIN_TEXT;
+                }
+            }
         }
 
     /**
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index 2c8316ba60..ff0ba51dd1 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -89,6 +89,7 @@ public class TextExtractorFactory {
      */
     private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
         List<TextExtractor> fileExtractors = Arrays.asList(
+                new TextFileExtractor(content),
                 new HtmlTextExtractor(content),
                 new SqliteTextExtractor(content),
                 new TikaTextExtractor(content));
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
new file mode 100644
index 0000000000..3efb6b1aed
--- /dev/null
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
@@ -0,0 +1,135 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2018-2019 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.textextractors;
+
+import com.ethteck.decodetect.core.Decodetect;
+import com.ethteck.decodetect.core.DecodetectResult;
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.List;
+import org.apache.tika.parser.txt.CharsetDetector;
+import org.apache.tika.parser.txt.CharsetMatch;
+import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.Content;
+import org.sleuthkit.datamodel.ReadContentInputStream;
+
+/**
+ * Extract text from text files
+ */
+public final class TextFileExtractor implements TextExtractor {
+
+    public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
+        @Override
+        public boolean contains(Charset cs) {
+            return false;
+        }
+
+        @Override
+        public CharsetDecoder newDecoder() {
+            return null;
+        }
+
+        @Override
+        public CharsetEncoder newEncoder() {
+            return null;
+        }
+    };
+
+    // This value will be used as a threshold for determining which encoding
+    // detection library to use. If Tika's own confidence is at least
+    // MIN_TIKA_MATCH_CONFIDENCE, Tika's result will be used for decoding.
+    // Otherwise, Decodetect will be used.
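+    //
+    // (Tika's CharsetMatch confidence is an integer score from 0 to 100, so
+    // a threshold of 35 accepts even moderately confident matches.)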
+    static final private int MIN_TIKA_MATCH_CONFIDENCE = 35;
+
+    // This value determines whether we will consider Decodetect's top-scoring
+    // result a legitimate match or if we will disregard its findings.
+    //
+    // Possible values are 0 to 1, inclusive.
+    static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;
+
+    private final AbstractFile file;
+
+    public TextFileExtractor(AbstractFile file) {
+        this.file = file;
+    }
+
+    @Override
+    public Reader getReader() {
+        Charset encoding = getEncoding(file);
+        if (encoding.equals(UNKNOWN_CHARSET)) {
+            encoding = StandardCharsets.UTF_8;
+        }
+        return getReader(encoding);
+    }
+
+    public Reader getReader(Charset encoding) {
+        return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
+    }
+
+    @Override
+    public boolean isSupported() {
+        return file.getMIMEType().equals("text/plain");
+    }
+
+    public class TextFileExtractorException extends Exception {
+
+        public TextFileExtractorException(String msg, Throwable ex) {
+            super(msg, ex);
+        }
+
+        public TextFileExtractorException(String msg) {
+            super(msg);
+        }
+    }
+
+    public static Charset getEncoding(Content content) {
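+        // The stream is wrapped in a BufferedInputStream so that it supports
+        // the mark/reset methods the CharsetDetector needs.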
+        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) {
+            // Tika first
+            CharsetDetector detector = new CharsetDetector();
+            detector.setText(stream);
+            CharsetMatch tikaResult = detector.detect();
+            if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) {
+                try {
+                    return Charset.forName(tikaResult.getName());
+                } catch (UnsupportedCharsetException ignored) {
+                }
+            }
+
+            // Decodetect if Tika fails or falls below confidence threshold
+            int maxBytes = 100000;
+            int numBytes = Math.min(stream.available(), maxBytes);
+            byte[] targetArray = new byte[numBytes];
+            stream.read(targetArray);
+            List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
+            if (!results.isEmpty()) {
+                DecodetectResult topResult = results.get(0);
+                if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
+                    return topResult.getEncoding();
+                }
+            }
+        } catch (IOException ignored) {
+        }
+        return UNKNOWN_CHARSET;
+    }
+}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index 6052e3deba..d4c9228c69 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -28,6 +28,7 @@ import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.logging.Level;
 import java.util.stream.Collectors;
+import org.apache.tika.mime.MimeTypes;
 import org.openide.util.Lookup;
 import org.openide.util.NbBundle;
 import org.openide.util.NbBundle.Messages;
@@ -44,12 +45,12 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
 import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
 import org.sleuthkit.autopsy.ingest.IngestServices;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
-import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
 import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
+import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
 import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
 import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
 import org.sleuthkit.datamodel.AbstractFile;
@@ -632,7 +633,7 @@
             if (context.fileIngestIsCancelled()) {
                 return;
             }
-            if (fileType.equals("application/octet-stream")) {
+            if (fileType.equals(MimeTypes.OCTET_STREAM)) {
                 extractStringsAndIndex(aFile);
                 return;
             }
@@ -657,20 +658,7 @@
             if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
                 //Carved Files should be the only type of unallocated files capable of a txt extension and
                 //should be ignored by the TextFileExtractor because they may contain more than one text encoding
-                try {
-                    TextFileExtractor textFileExtractor = new TextFileExtractor();
-                    Reader textReader = textFileExtractor.getReader(aFile);
-                    if (textReader == null) {
-                        logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
-                    } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
-                        putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
-                        wasTextAdded = true;
-                    }
-                } catch (IngesterException ex) {
-                    logger.log(Level.WARNING, "Unable to index as unicode", ex);
-                } catch (TextFileExtractorException ex) {
-                    logger.log(Level.INFO, "Could not extract text with TextFileExtractor", ex);
-                }
+                wasTextAdded = indexTextFile(aFile);
             }
 
             // if it wasn't supported or had an error, default to strings
@@ -678,5 +666,29 @@
             extractStringsAndIndex(aFile);
         }
     }
+
+    /**
+     * Adds the text file to the index.
+     *
+     * @param aFile Text file to analyze
+     *
+     * @return true if indexing was successful, false otherwise
+     */
+    private boolean indexTextFile(AbstractFile aFile) {
+        try {
+            TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
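+            // getReader() falls back to UTF-8 when no encoding can be detected.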
+            Reader textReader = textFileExtractor.getReader();
+            if (textReader == null) {
+                logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
+            } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
+                textReader.close();
+                putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
+                return true;
+            }
+        } catch (IngesterException | IOException ex) {
+            logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
+        }
+        return false;
+    }
 }
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
deleted file mode 100644
index 66d26a95bf..0000000000
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2018-2019 Basis Technology Corp.
- * Contact: carrier <at> sleuthkit <dot> org
-  *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.sleuthkit.autopsy.keywordsearch;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.BufferedInputStream;
-import java.io.Reader;
-import org.apache.tika.parser.txt.CharsetDetector;
-import org.apache.tika.parser.txt.CharsetMatch;
-import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.ReadContentInputStream;
-
-/**
- * Extract text from .txt files
- */
-final class TextFileExtractor {
-
-    //Set a Minimum confidence value to reject matches that may not have a valid text encoding
-    //Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
-    //and pictures and other files with a .txt extention were showing up with a value of 5 or less in limited testing.
-    //This limited information was used to select the current value as one that would filter out clearly non-text
-    //files while hopefully working on all files with a valid text encoding
-    static final private int MIN_MATCH_CONFIDENCE = 20;
-
-    public Reader getReader(AbstractFile source) throws TextFileExtractorException {
-        CharsetDetector detector = new CharsetDetector();
-        //wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
-        InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
-        try {
-            detector.setText(stream);
-        } catch (IOException ex) {
-            throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
-        }
-        CharsetMatch match = detector.detect();
-        if (match == null) {
-            throw new TextFileExtractorException("Unable to detect any matches using TextFileExtractor");
-        } else if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
-            throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
-        }
-
-        return match.getReader();
-    }
-
-    public class TextFileExtractorException extends Exception {
-
-        public TextFileExtractorException(String msg, Throwable ex) {
-            super(msg, ex);
-        }
-
-        public TextFileExtractorException(String msg) {
-            super(msg);
-        }
-    }
-}