diff --git a/Core/ivy.xml b/Core/ivy.xml
index 2958230052..5185b58c13 100644
--- a/Core/ivy.xml
+++ b/Core/ivy.xml
@@ -42,6 +42,9 @@
+
+
+
@@ -54,6 +57,6 @@
-
+
diff --git a/Core/nbproject/project.properties b/Core/nbproject/project.properties
index aa5e50279c..3c994d58ac 100644
--- a/Core/nbproject/project.properties
+++ b/Core/nbproject/project.properties
@@ -18,6 +18,7 @@ file.reference.commons-lang3-3.8.1.jar=release\\modules\\ext\\commons-lang3-3.8.
file.reference.commons-pool2-2.4.2.jar=release/modules/ext/commons-pool2-2.4.2.jar
file.reference.cxf-rt-rs-client-3.3.0.jar=release\\modules\\ext\\cxf-rt-rs-client-3.3.0.jar
file.reference.dec-0.1.2.jar=release\\modules\\ext\\dec-0.1.2.jar
+file.reference.decodetect-core-0.3.jar=release\\modules\\ext\\decodetect-core-0.3.jar
file.reference.fontbox-2.0.13.jar=release\\modules\\ext\\fontbox-2.0.13.jar
file.reference.geoapi-3.0.1.jar=release\\modules\\ext\\geoapi-3.0.1.jar
file.reference.grib-4.5.5.jar=release\\modules\\ext\\grib-4.5.5.jar
@@ -50,6 +51,7 @@ file.reference.jsoup-1.11.3.jar=release\\modules\\ext\\jsoup-1.11.3.jar
file.reference.jul-to-slf4j-1.7.25.jar=release\\modules\\ext\\jul-to-slf4j-1.7.25.jar
file.reference.juniversalchardet-1.0.3.jar=release\\modules\\ext\\juniversalchardet-1.0.3.jar
file.reference.junrar-2.0.0.jar=release\\modules\\ext\\junrar-2.0.0.jar
+file.reference.jutf7-1.0.0.jar=release\\modules\\ext\\jutf7-1.0.0.jar
file.reference.jxmapviewer2-2.4.jar=release/modules/ext/jxmapviewer2-2.4.jar
file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
file.reference.libphonenumber-3.5.jar=release/modules/ext/libphonenumber-3.5.jar
diff --git a/Core/nbproject/project.xml b/Core/nbproject/project.xml
index 7fe269c0fb..83aefea7c5 100644
--- a/Core/nbproject/project.xml
+++ b/Core/nbproject/project.xml
@@ -794,6 +794,14 @@
ext/vorbis-java-tika-0.8.jar
release\modules\ext\vorbis-java-tika-0.8.jar
+
+ ext/decodetect-core-0.3.jar
+ release/modules/ext/decodetect-core-0.3.jar
+
+
+ ext/jutf7-1.0.0.jar
+ release/modules/ext/jutf7-1.0.0.jar
+
diff --git a/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java b/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java
index 0c885472de..46b60d9b1e 100644
--- a/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/filetypeid/FileTypeDetector.java
@@ -18,6 +18,7 @@
*/
package org.sleuthkit.autopsy.modules.filetypeid;
+import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
@@ -29,6 +30,7 @@ import org.apache.tika.Tika;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MimeTypes;
import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.TskCoreException;
@@ -249,6 +251,17 @@ public class FileTypeDetector {
mimeType = tikaType.replace("tika-", ""); //NON-NLS
mimeType = removeOptionalParameter(mimeType);
}
+ } else {
+ /*
+ * If the file was marked as an octet stream and the extension is .txt, try to detect a text
+ * encoding with TextFileExtractor (Tika first, falling back to Decodetect).
+ */
+ if (file.getNameExtension().equals("txt")) {
+ Charset detectedCharset = TextFileExtractor.getEncoding(file);
+ if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) {
+ mimeType = MimeTypes.PLAIN_TEXT;
+ }
+ }
}
/**
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
index 2c8316ba60..ff0ba51dd1 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -89,6 +89,7 @@ public class TextExtractorFactory {
*/
private static List getFileExtractors(AbstractFile content, Lookup context) {
List fileExtractors = Arrays.asList(
+ new TextFileExtractor(content),
new HtmlTextExtractor(content),
new SqliteTextExtractor(content),
new TikaTextExtractor(content));
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
new file mode 100644
index 0000000000..3efb6b1aed
--- /dev/null
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java
@@ -0,0 +1,135 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2018-2019 Basis Technology Corp.
+ * Contact: carrier sleuthkit org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.textextractors;
+
+import com.ethteck.decodetect.core.Decodetect;
+import com.ethteck.decodetect.core.DecodetectResult;
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.List;
+import org.apache.tika.parser.txt.CharsetDetector;
+import org.apache.tika.parser.txt.CharsetMatch;
+import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.Content;
+import org.sleuthkit.datamodel.ReadContentInputStream;
+
+/**
+ * Extract text from text files
+ */
+public final class TextFileExtractor implements TextExtractor {
+ public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
+ @Override
+ public boolean contains(Charset cs) {
+ return false;
+ }
+
+ @Override
+ public CharsetDecoder newDecoder() {
+ return null;
+ }
+
+ @Override
+ public CharsetEncoder newEncoder() {
+ return null;
+ }
+ };
+
+ // This value will be used as a threshold for determining which encoding
+ // detection library to use. If Tika's own confidence is at least
+ // MIN_TIKA_MATCH_CONFIDENCE, Tika's result will be used for decoding.
+ // Otherwise, Decodetect will be used.
+ static final private int MIN_TIKA_MATCH_CONFIDENCE = 35;
+
+ // This value determines whether we will consider Decodetect's top-scoring
+ // result a legitimate match or if we will disregard its findings
+ //
+ // Possible values are 0 to 1, inclusive
+ static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;
+
+ private final AbstractFile file;
+
+ public TextFileExtractor(AbstractFile file) {
+ this.file = file;
+ }
+
+ @Override
+ public Reader getReader() {
+ Charset encoding = getEncoding(file);
+ if (encoding.equals(UNKNOWN_CHARSET)) {
+ encoding = StandardCharsets.UTF_8;
+ }
+ return getReader(encoding);
+ }
+
+ public Reader getReader(Charset encoding) {
+ return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
+ }
+
+ @Override
+ public boolean isSupported() {
+ return file.getMIMEType().equals("text/plain");
+ }
+
+ public class TextFileExtractorException extends Exception {
+ public TextFileExtractorException(String msg, Throwable ex) {
+ super(msg, ex);
+ }
+ public TextFileExtractorException(String msg) {
+ super(msg);
+ }
+ }
+
+ public static Charset getEncoding(Content content) {
+ try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) {
+ // Tika first
+ CharsetDetector detector = new CharsetDetector();
+ detector.setText(stream);
+ CharsetMatch tikaResult = detector.detect();
+ if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) {
+ try {
+ return Charset.forName(tikaResult.getName());
+ } catch (UnsupportedCharsetException ignored) {
+ }
+ }
+
+ // Decodetect if Tika fails or falls below confidence threshold
+ int maxBytes = 100000;
+ int numBytes = Math.min(stream.available(), maxBytes);
+ byte[] targetArray = new byte[numBytes];
+ stream.read(targetArray);
+ List results = Decodetect.DECODETECT.getResults(targetArray);
+ if (!results.isEmpty()) {
+ DecodetectResult topResult = results.get(0);
+ if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
+ return topResult.getEncoding();
+ }
+ }
+ } catch (IOException ignored) {
+ }
+ return UNKNOWN_CHARSET;
+ }
+}
diff --git a/InternalPythonModules/android/fbmessenger.py b/InternalPythonModules/android/fbmessenger.py
index c19ae87796..e3514e2b17 100644
--- a/InternalPythonModules/android/fbmessenger.py
+++ b/InternalPythonModules/android/fbmessenger.py
@@ -316,19 +316,29 @@ class FBMessengerAnalyzer(general.AndroidComponentAnalyzer):
if (attachment is not None):
attachmentDict = json.loads(attachment)[0]
if (attachmentDict["mime_type"] == "image/jpeg"):
- urlAttachments = self.getJPGListFromJson(attachmentDict["urls"])
+ urls = attachmentDict.get("urls", None)
+ if (urls is not None):
+ urlAttachments = self.getJPGListFromJson(urls)
elif (attachmentDict["mime_type"] == "video/mp4"):
# filename does not have an associated path with it so it will be ignored
- urlAttachments = self.getJPGListFromJson(attachmentDict["urls"])
- urlAttachments.add(URLAttachment(attachmentDict["video_data_url"]))
- urlAttachments.add(URLAttachment(attachmentDict["video_data_thumbnail_url"]))
-
+
+ urls = attachmentDict.get("urls", None)
+ if (urls is not None):
+ urlAttachments = self.getJPGListFromJson(urls)
+
+ video_data_url = attachmentDict.get("video_data_url", None)
+ if (video_data_url is not None):
+ urlAttachments.add(URLAttachment(video_data_url))
+ video_data_thumbnail_url = attachmentDict.get("video_data_thumbnail_url", None)
+
+ if (video_data_thumbnail_url is not None):
+ urlAttachments.add(URLAttachment(video_data_thumbnail_url))
elif (attachmentDict["mime_type"] == "audio/mpeg"):
- if (attachmentDict["audio_uri"] == ""):
+ audioUri = attachmentDict.get("audio_uri", None)
+ if (audioUri is None or audioUri == ""):
continue
else:
- audioUri = attachmentDict["audio_uri"]
fileAttachments.add(FileAttachment(currentCase.getSleuthkitCase(), threadsDb.getDBFile().getDataSource(), audioUri.replace("file://","")))
else:
@@ -336,8 +346,9 @@ class FBMessengerAnalyzer(general.AndroidComponentAnalyzer):
if (pendingAttachment is not None):
pendingAttachmentDict = json.loads(pendingAttachment)[0]
- pendingAttachmentUri = pendingAttachmentDict["uri"]
- fileAttachments.add(FileAttachment(currentCase.getSleuthkitCase(), threadsDb.getDBFile().getDataSource(), pendingAttachmentUri.replace("file://","")))
+ pendingAttachmentUri = pendingAttachmentDict.get("uri", None)
+ if (pendingAttachmentUri is not None):
+ fileAttachments.add(FileAttachment(currentCase.getSleuthkitCase(), threadsDb.getDBFile().getDataSource(), pendingAttachmentUri.replace("file://","")))
messageAttachments = MessageAttachments(fileAttachments, urlAttachments)
diff --git a/InternalPythonModules/android/xender.py b/InternalPythonModules/android/xender.py
index aeddcaf4e1..e8c2b3d74f 100644
--- a/InternalPythonModules/android/xender.py
+++ b/InternalPythonModules/android/xender.py
@@ -98,7 +98,7 @@ class XenderAnalyzer(general.AndroidComponentAnalyzer):
Account.Type.XENDER)
queryString = """
- SELECT f_path, f_display_name, f_size_str, f_create_time, c_direction, c_session_id,
+ SELECT f_path, f_display_name, f_size_str, c_start_time, c_direction, c_session_id,
s_name, s_device_id, r_name, r_device_id
FROM new_history
"""
@@ -118,7 +118,7 @@ class XenderAnalyzer(general.AndroidComponentAnalyzer):
direction = CommunicationDirection.INCOMING
fromId = messagesResultSet.getString("s_device_id")
- timeStamp = messagesResultSet.getLong("f_create_time") / 1000
+ timeStamp = messagesResultSet.getLong("c_start_time") / 1000
messageArtifact = transactionDbHelper.addMessage(
self._MESSAGE_TYPE,
direction,
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index 6052e3deba..d4c9228c69 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -28,6 +28,7 @@ import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.stream.Collectors;
+import org.apache.tika.mime.MimeTypes;
import org.openide.util.Lookup;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
@@ -44,12 +45,12 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
-import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
+import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
import org.sleuthkit.datamodel.AbstractFile;
@@ -632,7 +633,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if (context.fileIngestIsCancelled()) {
return;
}
- if (fileType.equals("application/octet-stream")) {
+ if (fileType.equals(MimeTypes.OCTET_STREAM)) {
extractStringsAndIndex(aFile);
return;
}
@@ -657,20 +658,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
//Carved Files should be the only type of unallocated files capable of a txt extension and
//should be ignored by the TextFileExtractor because they may contain more than one text encoding
- try {
- TextFileExtractor textFileExtractor = new TextFileExtractor();
- Reader textReader = textFileExtractor.getReader(aFile);
- if (textReader == null) {
- logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
- } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
- putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
- wasTextAdded = true;
- }
- } catch (IngesterException ex) {
- logger.log(Level.WARNING, "Unable to index as unicode", ex);
- } catch (TextFileExtractorException ex) {
- logger.log(Level.INFO, "Could not extract text with TextFileExtractor", ex);
- }
+ wasTextAdded = indexTextFile(aFile);
}
// if it wasn't supported or had an error, default to strings
@@ -678,5 +666,29 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
extractStringsAndIndex(aFile);
}
}
+
+ /**
+ * Extracts text from a text file, detecting its character encoding
+ * automatically, and adds the text to the keyword search index.
+ *
+ * @param aFile Text file to analyze
+ * @return true if indexing was successful, false otherwise
+ */
+ private boolean indexTextFile(AbstractFile aFile) {
+ try {
+ TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
+ Reader textReader = textFileExtractor.getReader();
+ if (textReader == null) {
+ logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
+ } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
+ textReader.close();
+ putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
+ return true;
+ }
+ } catch (IngesterException | IOException ex) {
+ logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
+ }
+ return false;
+ }
}
}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
deleted file mode 100644
index 66d26a95bf..0000000000
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2018-2019 Basis Technology Corp.
- * Contact: carrier sleuthkit org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.sleuthkit.autopsy.keywordsearch;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.BufferedInputStream;
-import java.io.Reader;
-import org.apache.tika.parser.txt.CharsetDetector;
-import org.apache.tika.parser.txt.CharsetMatch;
-import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.ReadContentInputStream;
-
-/**
- * Extract text from .txt files
- */
-final class TextFileExtractor {
-
- //Set a Minimum confidence value to reject matches that may not have a valid text encoding
- //Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
- //and pictures and other files with a .txt extention were showing up with a value of 5 or less in limited testing.
- //This limited information was used to select the current value as one that would filter out clearly non-text
- //files while hopefully working on all files with a valid text encoding
- static final private int MIN_MATCH_CONFIDENCE = 20;
-
- public Reader getReader(AbstractFile source) throws TextFileExtractorException {
- CharsetDetector detector = new CharsetDetector();
- //wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
- InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
- try {
- detector.setText(stream);
- } catch (IOException ex) {
- throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
- }
- CharsetMatch match = detector.detect();
- if (match == null) {
- throw new TextFileExtractorException("Unable to detect any matches using TextFileExtractor");
- } else if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
- throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
- }
-
- return match.getReader();
- }
-
- public class TextFileExtractorException extends Exception {
- public TextFileExtractorException(String msg, Throwable ex) {
- super(msg, ex);
- }
- public TextFileExtractorException(String msg) {
- super(msg);
- }
- }
-}