diff --git a/Core/ivy.xml b/Core/ivy.xml
index 5949f5a777..0088cc75ba 100644
--- a/Core/ivy.xml
+++ b/Core/ivy.xml
@@ -35,6 +35,11 @@
+
+
+
+
diff --git a/Core/nbproject/project.properties b/Core/nbproject/project.properties
index b18bef9b6a..4df5159bd6 100644
--- a/Core/nbproject/project.properties
+++ b/Core/nbproject/project.properties
@@ -1,26 +1,59 @@
file.reference.activemq-all-5.11.1.jar=release/modules/ext/activemq-all-5.11.1.jar
+file.reference.apache-mime4j-core-0.8.1.jar=release/modules/ext/apache-mime4j-core-0.8.1.jar
+file.reference.apache-mime4j-dom-0.8.1.jar=release/modules/ext/apache-mime4j-dom-0.8.1.jar
+file.reference.asm-5.0.4.jar=release/modules/ext/asm-5.0.4.jar
+file.reference.bcmail-jdk15on-1.54.jar=release/modules/ext/bcmail-jdk15on-1.54.jar
+file.reference.bcprov-jdk15on-1.54.jar=release/modules/ext/bcprov-jdk15on-1.54.jar
+file.reference.boilerpipe-1.1.0.jar=release/modules/ext/boilerpipe-1.1.0.jar
file.reference.c3p0-0.9.5.jar=release/modules/ext/c3p0-0.9.5.jar
+file.reference.cdm-4.5.5.jar=release/modules/ext/cdm-4.5.5.jar
+file.reference.commons-codec-1.6.jar=release/modules/ext/commons-codec-1.6.jar
file.reference.commons-compress-1.14.jar=release/modules/ext/commons-compress-1.14.jar
-file.reference.commons-dbcp2-2.1.1.jar=release\\modules\\ext\\commons-dbcp2-2.1.1.jar
-file.reference.commons-pool2-2.4.2.jar=release\\modules\\ext\\commons-pool2-2.4.2.jar
+file.reference.commons-dbcp2-2.1.1.jar=release/modules/ext/commons-dbcp2-2.1.1.jar
+file.reference.commons-io-2.5.jar=release/modules/ext/commons-io-2.5.jar
+file.reference.commons-pool2-2.4.2.jar=release/modules/ext/commons-pool2-2.4.2.jar
file.reference.dd-plist-1.20.jar=release/modules/ext/dd-plist-1.20.jar
+file.reference.geoapi-3.0.0.jar=release/modules/ext/geoapi-3.0.0.jar
+file.reference.grib-4.5.5.jar=release/modules/ext/grib-4.5.5.jar
+file.reference.gson-2.8.1.jar=release/modules/ext/gson-2.8.1.jar
+file.reference.httpservices-4.5.5.jar=release/modules/ext/httpservices-4.5.5.jar
+file.reference.isoparser-1.1.18.jar=release/modules/ext/isoparser-1.1.18.jar
+file.reference.jackcess-2.2.0.jar=release/modules/ext/jackcess-2.2.0.jar
+file.reference.jackcess-encrypt-2.1.4.jar=release/modules/ext/jackcess-encrypt-2.1.4.jar
+file.reference.java-libpst-0.8.1.jar=release/modules/ext/java-libpst-0.8.1.jar
+file.reference.jcl-over-slf4j-1.7.24.jar=release/modules/ext/jcl-over-slf4j-1.7.24.jar
file.reference.jackson-core-2.9.7.jar=release/modules/ext/jackson-core-2.9.7.jar
file.reference.jdom-2.0.5-contrib.jar=release/modules/ext/jdom-2.0.5-contrib.jar
file.reference.jdom-2.0.5.jar=release/modules/ext/jdom-2.0.5.jar
+file.reference.jericho-html-3.3.jar=release/modules/ext/jericho-html-3.3.jar
file.reference.jgraphx-v3.8.0.jar=release/modules/ext/jgraphx-v3.8.0.jar
+file.reference.jhighlight-1.0.2.jar=release/modules/ext/jhighlight-1.0.2.jar
+file.reference.jmatio-1.2.jar=release/modules/ext/jmatio-1.2.jar
+file.reference.json-1.8.jar=release/modules/ext/json-1.8.jar
+file.reference.json-simple-1.1.1.jar=release/modules/ext/json-simple-1.1.1.jar
file.reference.jsoup-1.10.3.jar=release/modules/ext/jsoup-1.10.3.jar
+file.reference.jul-to-slf4j-1.7.24.jar=release/modules/ext/jul-to-slf4j-1.7.24.jar
+file.reference.juniversalchardet-1.0.3.jar=release/modules/ext/juniversalchardet-1.0.3.jar
+file.reference.junrar-0.7.jar=release/modules/ext/junrar-0.7.jar
file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
file.reference.mchange-commons-java-0.2.9.jar=release/modules/ext/mchange-commons-java-0.2.9.jar
file.reference.metadata-extractor-2.10.1.jar=release/modules/ext/metadata-extractor-2.10.1.jar
+file.reference.netcdf4-4.5.5.jar=release/modules/ext/netcdf4-4.5.5.jar
+file.reference.opennlp-tools-1.8.3.jar=release/modules/ext/opennlp-tools-1.8.3.jar
+file.reference.poi-3.17.jar=release/modules/ext/poi-3.17.jar
+file.reference.poi-ooxml-3.17.jar=release/modules/ext/poi-ooxml-3.17.jar
+file.reference.poi-scratchpad-3.17.jar=release/modules/ext/poi-scratchpad-3.17.jar
file.reference.postgresql-9.4.1211.jre7.jar=release/modules/ext/postgresql-9.4.1211.jre7.jar
file.reference.Rejistry-1.0-SNAPSHOT.jar=release/modules/ext/Rejistry-1.0-SNAPSHOT.jar
+file.reference.rome-1.5.1.jar=release/modules/ext/rome-1.5.1.jar
file.reference.sevenzipjbinding-AllPlatforms.jar=release/modules/ext/sevenzipjbinding-AllPlatforms.jar
file.reference.sevenzipjbinding.jar=release/modules/ext/sevenzipjbinding.jar
-file.reference.sqlite-jdbc-3.8.11.jar=release\\modules\\ext\\sqlite-jdbc-3.8.11.jar
+file.reference.sis-metadata-0.6.jar=release/modules/ext/sis-metadata-0.6.jar
+file.reference.sis-netcdf-0.6.jar=release/modules/ext/sis-netcdf-0.6.jar
+file.reference.sis-utility-0.6.jar=release/modules/ext/sis-utility-0.6.jar
+file.reference.slf4j-api-1.7.24.jar=release/modules/ext/slf4j-api-1.7.24.jar
+file.reference.sqlite-jdbc-3.8.11.jar=release/modules/ext/sqlite-jdbc-3.8.11.jar
file.reference.StixLib.jar=release/modules/ext/StixLib.jar
-file.reference.bcprov-jdk15on-1.54.jar=release/modules/ext/bcprov-jdk15on-1.54.jar
-file.reference.jackcess-2.2.0.jar=release/modules/ext/jackcess-2.2.0.jar
-file.reference.jackcess-encrypt-2.1.4.jar=release/modules/ext/jackcess-encrypt-2.1.4.jar
file.reference.jempbox-1.8.13.jar=release/modules/ext/jempbox-1.8.13.jar
file.reference.javax.ws.rs-api-2.0.1.jar=release/modules/ext/javax.ws.rs-api-2.0.1.jar
file.reference.cxf-core-3.0.16.jar=release/modules/ext/cxf-core-3.0.16.jar
@@ -31,11 +64,14 @@ file.reference.fontbox-2.0.8.jar=release/modules/ext/fontbox-2.0.8.jar
file.reference.pdfbox-2.0.8.jar=release/modules/ext/pdfbox-2.0.8.jar
file.reference.pdfbox-tools-2.0.8.jar=release/modules/ext/pdfbox-tools-2.0.8.jar
file.reference.sleuthkit-postgresql-4.6.4.jar=release/modules/ext/sleuthkit-postgresql-4.6.4.jar
+file.reference.tagsoup-1.2.1.jar=release/modules/ext/tagsoup-1.2.1.jar
file.reference.tika-core-1.17.jar=release/modules/ext/tika-core-1.17.jar
file.reference.tika-parsers-1.17.jar=release/modules/ext/tika-parsers-1.17.jar
file.reference.curator-client-2.8.0.jar=release/modules/ext/curator-client-2.8.0.jar
file.reference.curator-framework-2.8.0.jar=release/modules/ext/curator-framework-2.8.0.jar
file.reference.curator-recipes-2.8.0.jar=release/modules/ext/curator-recipes-2.8.0.jar
+file.reference.vorbis-java-core-0.8.jar=release/modules/ext/vorbis-java-core-0.8.jar
+file.reference.vorbis-java-tika-0.8.jar=release/modules/ext/vorbis-java-tika-0.8.jar
file.reference.xmpcore-5.1.3.jar=release/modules/ext/xmpcore-5.1.3.jar
file.reference.xz-1.6.jar=release/modules/ext/xz-1.6.jar
file.reference.zookeeper-3.4.6.jar=release/modules/ext/zookeeper-3.4.6.jar
diff --git a/Core/nbproject/project.xml b/Core/nbproject/project.xml
index d6f3562663..0e8b544024 100644
--- a/Core/nbproject/project.xml
+++ b/Core/nbproject/project.xml
@@ -338,81 +338,59 @@
org.sleuthkit.autopsy.modules.vmextractor
org.sleuthkit.autopsy.progress
org.sleuthkit.autopsy.report
+ org.sleuthkit.autopsy.textextractors
+ org.sleuthkit.autopsy.textextractors.extractionconfigs
org.sleuthkit.autopsy.texttranslation
org.sleuthkit.datamodel
+
+ ext/apache-mime4j-dom-0.8.1.jar
+ release/modules/ext/apache-mime4j-dom-0.8.1.jar
+
ext/jackcess-2.2.0.jar
release/modules/ext/jackcess-2.2.0.jar
- ext/zookeeper-3.4.6.jar
- release/modules/ext/zookeeper-3.4.6.jar
+ ext/jericho-html-3.3.jar
+ release/modules/ext/jericho-html-3.3.jar
- ext/jdom-2.0.5.jar
- release/modules/ext/jdom-2.0.5.jar
+ ext/cdm-4.5.5.jar
+ release/modules/ext/cdm-4.5.5.jar
- ext/cxf-rt-transports-http-3.0.16.jar
- release/modules/ext/cxf-rt-transports-http-3.0.16.jar
+ ext/httpservices-4.5.5.jar
+ release/modules/ext/httpservices-4.5.5.jar
ext/commons-validator-1.6.jar
release/modules/ext/commons-validator-1.6.jar
-
- ext/curator-framework-2.8.0.jar
- release/modules/ext/curator-framework-2.8.0.jar
-
-
- ext/bcprov-jdk15on-1.54.jar
- release/modules/ext/bcprov-jdk15on-1.54.jar
-
ext/commons-compress-1.14.jar
release/modules/ext/commons-compress-1.14.jar
- ext/fontbox-2.0.8.jar
- release/modules/ext/fontbox-2.0.8.jar
+ ext/geoapi-3.0.0.jar
+ release/modules/ext/geoapi-3.0.0.jar
- ext/commons-dbcp2-2.1.1.jar
- release\modules\ext\commons-dbcp2-2.1.1.jar
-
-
- ext/jgraphx-v3.8.0.jar
- release/modules/ext/jgraphx-v3.8.0.jar
-
-
- ext/jython-standalone-2.7.0.jar
- release/modules/ext/jython-standalone-2.7.0.jar
+ ext/boilerpipe-1.1.0.jar
+ release/modules/ext/boilerpipe-1.1.0.jar
ext/sevenzipjbinding.jar
release/modules/ext/sevenzipjbinding.jar
- ext/sleuthkit-postgresql-4.6.4.jar
- release/modules/ext/sleuthkit-postgresql-4.6.4.jar
+ ext/bcmail-jdk15on-1.54.jar
+ release/modules/ext/bcmail-jdk15on-1.54.jar
ext/mchange-commons-java-0.2.9.jar
release/modules/ext/mchange-commons-java-0.2.9.jar
-
- ext/cxf-core-3.0.16.jar
- release/modules/ext/cxf-core-3.0.16.jar
-
-
- ext/javax.ws.rs-api-2.0.1.jar
- release/modules/ext/javax.ws.rs-api-2.0.1.jar
-
-
- ext/postgresql-9.4.1211.jre7.jar
- release/modules/ext/postgresql-9.4.1211.jre7.jar
-
ext/curator-recipes-2.8.0.jar
release/modules/ext/curator-recipes-2.8.0.jar
@@ -421,6 +399,14 @@
ext/metadata-extractor-2.10.1.jar
release/modules/ext/metadata-extractor-2.10.1.jar
+
+ ext/apache-mime4j-core-0.8.1.jar
+ release/modules/ext/apache-mime4j-core-0.8.1.jar
+
+
+ ext/tagsoup-1.2.1.jar
+ release/modules/ext/tagsoup-1.2.1.jar
+
ext/tika-core-1.17.jar
release/modules/ext/tika-core-1.17.jar
@@ -429,45 +415,37 @@
ext/StixLib.jar
release/modules/ext/StixLib.jar
-
- ext/curator-client-2.8.0.jar
- release/modules/ext/curator-client-2.8.0.jar
-
-
- ext/jackson-core-2.9.7.jar
- release/modules/ext/jackson-core-2.9.7.jar
-
-
- ext/cxf-rt-frontend-jaxrs-3.0.16.jar
- release/modules/ext/cxf-rt-frontend-jaxrs-3.0.16.jar
-
ext/pdfbox-tools-2.0.8.jar
release/modules/ext/pdfbox-tools-2.0.8.jar
+
+ ext/asm-5.0.4.jar
+ release/modules/ext/asm-5.0.4.jar
+
+
+ ext/jcl-over-slf4j-1.7.24.jar
+ release/modules/ext/jcl-over-slf4j-1.7.24.jar
+
ext/tika-parsers-1.17.jar
release/modules/ext/tika-parsers-1.17.jar
ext/sqlite-jdbc-3.8.11.jar
- release\modules\ext\sqlite-jdbc-3.8.11.jar
+ release/modules/ext/sqlite-jdbc-3.8.11.jar
- ext/activemq-all-5.11.1.jar
- release/modules/ext/activemq-all-5.11.1.jar
+ ext/json-simple-1.1.1.jar
+ release/modules/ext/json-simple-1.1.1.jar
- ext/xz-1.6.jar
- release/modules/ext/xz-1.6.jar
+ ext/sis-utility-0.6.jar
+ release/modules/ext/sis-utility-0.6.jar
- ext/Rejistry-1.0-SNAPSHOT.jar
- release/modules/ext/Rejistry-1.0-SNAPSHOT.jar
-
-
- ext/dd-plist-1.20.jar
- release/modules/ext/dd-plist-1.20.jar
+ ext/jhighlight-1.0.2.jar
+ release/modules/ext/jhighlight-1.0.2.jar
ext/jempbox-1.8.13.jar
@@ -477,21 +455,9 @@
ext/cxf-rt-rs-client-3.0.16.jar
release/modules/ext/cxf-rt-rs-client-3.0.16.jar
-
- ext/sevenzipjbinding-AllPlatforms.jar
- release/modules/ext/sevenzipjbinding-AllPlatforms.jar
-
ext/commons-pool2-2.4.2.jar
- release\modules\ext\commons-pool2-2.4.2.jar
-
-
- ext/jackcess-encrypt-2.1.4.jar
- release/modules/ext/jackcess-encrypt-2.1.4.jar
-
-
- ext/jsoup-1.10.3.jar
- release/modules/ext/jsoup-1.10.3.jar
+ release/modules/ext/commons-pool2-2.4.2.jar
ext/jdom-2.0.5-contrib.jar
@@ -513,6 +479,190 @@
ext/xmpcore-5.1.3.jar
release/modules/ext/xmpcore-5.1.3.jar
+
+ ext/zookeeper-3.4.6.jar
+ release/modules/ext/zookeeper-3.4.6.jar
+
+
+ ext/jdom-2.0.5.jar
+ release/modules/ext/jdom-2.0.5.jar
+
+
+ ext/cxf-rt-transports-http-3.0.16.jar
+ release/modules/ext/cxf-rt-transports-http-3.0.16.jar
+
+
+ ext/sis-metadata-0.6.jar
+ release/modules/ext/sis-metadata-0.6.jar
+
+
+ ext/isoparser-1.1.18.jar
+ release/modules/ext/isoparser-1.1.18.jar
+
+
+ ext/sleuthkit-postgresql-4.6.4.jar
+ release/modules/ext/sleuthkit-postgresql-4.6.4.jar
+
+
+ ext/vorbis-java-core-0.8.jar
+ release/modules/ext/vorbis-java-core-0.8.jar
+
+
+ ext/commons-codec-1.6.jar
+ release/modules/ext/commons-codec-1.6.jar
+
+
+ ext/netcdf4-4.5.5.jar
+ release/modules/ext/netcdf4-4.5.5.jar
+
+
+ ext/slf4j-api-1.7.24.jar
+ release/modules/ext/slf4j-api-1.7.24.jar
+
+
+ ext/java-libpst-0.8.1.jar
+ release/modules/ext/java-libpst-0.8.1.jar
+
+
+ ext/jul-to-slf4j-1.7.24.jar
+ release/modules/ext/jul-to-slf4j-1.7.24.jar
+
+
+ ext/gson-2.8.1.jar
+ release/modules/ext/gson-2.8.1.jar
+
+
+ ext/poi-3.17.jar
+ release/modules/ext/poi-3.17.jar
+
+
+ ext/poi-scratchpad-3.17.jar
+ release/modules/ext/poi-scratchpad-3.17.jar
+
+
+ ext/sis-netcdf-0.6.jar
+ release/modules/ext/sis-netcdf-0.6.jar
+
+
+ ext/commons-io-2.5.jar
+ release/modules/ext/commons-io-2.5.jar
+
+
+ ext/curator-framework-2.8.0.jar
+ release/modules/ext/curator-framework-2.8.0.jar
+
+
+ ext/bcprov-jdk15on-1.54.jar
+ release/modules/ext/bcprov-jdk15on-1.54.jar
+
+
+ ext/fontbox-2.0.8.jar
+ release/modules/ext/fontbox-2.0.8.jar
+
+
+ ext/commons-dbcp2-2.1.1.jar
+ release/modules/ext/commons-dbcp2-2.1.1.jar
+
+
+ ext/jgraphx-v3.8.0.jar
+ release/modules/ext/jgraphx-v3.8.0.jar
+
+
+ ext/juniversalchardet-1.0.3.jar
+ release/modules/ext/juniversalchardet-1.0.3.jar
+
+
+ ext/jython-standalone-2.7.0.jar
+ release/modules/ext/jython-standalone-2.7.0.jar
+
+
+ ext/jackcess-encrypt-2.1.4.jar
+ release/modules/ext/jackcess-encrypt-2.1.4.jar
+
+
+ ext/cxf-core-3.0.16.jar
+ release/modules/ext/cxf-core-3.0.16.jar
+
+
+ ext/javax.ws.rs-api-2.0.1.jar
+ release/modules/ext/javax.ws.rs-api-2.0.1.jar
+
+
+ ext/opennlp-tools-1.8.3.jar
+ release/modules/ext/opennlp-tools-1.8.3.jar
+
+
+ ext/junrar-0.7.jar
+ release/modules/ext/junrar-0.7.jar
+
+
+ ext/postgresql-9.4.1211.jre7.jar
+ release/modules/ext/postgresql-9.4.1211.jre7.jar
+
+
+ ext/poi-ooxml-3.17.jar
+ release/modules/ext/poi-ooxml-3.17.jar
+
+
+ ext/curator-client-2.8.0.jar
+ release/modules/ext/curator-client-2.8.0.jar
+
+
+ ext/jackson-core-2.9.7.jar
+ release/modules/ext/jackson-core-2.9.7.jar
+
+
+ ext/cxf-rt-frontend-jaxrs-3.0.16.jar
+ release/modules/ext/cxf-rt-frontend-jaxrs-3.0.16.jar
+
+
+ ext/grib-4.5.5.jar
+ release/modules/ext/grib-4.5.5.jar
+
+
+ ext/jackson-core-2.9.2.jar
+ release/modules/ext/jackson-core-2.9.2.jar
+
+
+ ext/activemq-all-5.11.1.jar
+ release/modules/ext/activemq-all-5.11.1.jar
+
+
+ ext/xz-1.6.jar
+ release/modules/ext/xz-1.6.jar
+
+
+ ext/Rejistry-1.0-SNAPSHOT.jar
+ release/modules/ext/Rejistry-1.0-SNAPSHOT.jar
+
+
+ ext/dd-plist-1.20.jar
+ release/modules/ext/dd-plist-1.20.jar
+
+
+ ext/rome-1.5.1.jar
+ release/modules/ext/rome-1.5.1.jar
+
+
+ ext/sevenzipjbinding-AllPlatforms.jar
+ release/modules/ext/sevenzipjbinding-AllPlatforms.jar
+
+
+ ext/jmatio-1.2.jar
+ release/modules/ext/jmatio-1.2.jar
+
+
+ ext/jsoup-1.10.3.jar
+ release/modules/ext/jsoup-1.10.3.jar
+
+
+ ext/vorbis-java-tika-0.8.jar
+ release/modules/ext/vorbis-java-tika-0.8.jar
+
+
+ ext/json-1.8.jar
+ release/modules/ext/json-1.8.jar
+
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
new file mode 100644
index 0000000000..ba91a6cc3a
--- /dev/null
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
@@ -0,0 +1,89 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2011-2018 Basis Technology Corp.
+ * Contact: carrier sleuthkit org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.textextractors;
+
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import org.apache.commons.io.IOUtils;
+import org.sleuthkit.autopsy.datamodel.ContentUtils;
+import org.sleuthkit.datamodel.BlackboardArtifact;
+import org.sleuthkit.datamodel.BlackboardAttribute;
+import org.sleuthkit.datamodel.Content;
+import org.sleuthkit.datamodel.TskCoreException;
+
+/**
+ * Extracts text from artifacts by concatenating the values of all of the
+ * artifact's attributes.
+ */
+class ArtifactTextExtractor extends TextExtractor {
+
+ private final BlackboardArtifact artifact;
+
+ public ArtifactTextExtractor(Content artifact) {
+ this.artifact = (BlackboardArtifact) artifact; // unchecked cast: ClassCastException if not a BlackboardArtifact
+ }
+
+ @Override
+ public Reader getReader() throws ExtractionException {
+ // Build a single block of text holding one "display name : value" line
+ // per attribute of the artifact, then hand it back wrapped in a Reader.
+ StringBuilder artifactContents = new StringBuilder();
+
+ Content dataSource = null;
+ try {
+ dataSource = artifact.getDataSource();
+ } catch (TskCoreException tskCoreException) {
+ throw new ExtractionException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
+ }
+ if (dataSource == null) {
+ throw new ExtractionException("Datasource was null for artifact: " + artifact.toString());
+ }
+
+ try {
+ for (BlackboardAttribute attribute : artifact.getAttributes()) {
+ artifactContents.append(attribute.getAttributeType().getDisplayName());
+ artifactContents.append(" : ");
+ // DATETIME values are formatted here with ContentUtils.getStringTime(),
+ // passing the artifact's data source, rather than with getDisplayString().
+ // Moving this formatting into BlackboardAttribute.getDisplayString() was
+ // discussed, but BlackboardAttribute lives in the Sleuthkit data model
+ // while the time zone utility (ContentUtils) lives in the Autopsy datamodel.
+ switch (attribute.getValueType()) {
+ case DATETIME:
+ artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
+ break;
+ default: // every other value type uses the attribute's own display string
+ artifactContents.append(attribute.getDisplayString());
+ }
+ artifactContents.append(System.lineSeparator());
+ }
+ } catch (TskCoreException tskCoreException) {
+ throw new ExtractionException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
+ }
+
+ return new InputStreamReader(IOUtils.toInputStream(artifactContents,
+ StandardCharsets.UTF_8), StandardCharsets.UTF_8); // same charset used to encode and decode
+ }
+
+ @Override
+ public boolean isSupported(Content file, String detectedFormat) {
+ return true; // neither argument is inspected
+ }
+}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
similarity index 81%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
index 32842dbc03..86dbd15c1b 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
@@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.sleuthkit.autopsy.keywordsearch;
+package org.sleuthkit.autopsy.textextractors;
import java.io.IOException;
import java.io.Reader;
@@ -38,10 +38,11 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
/**
* Extracts text from HTML content.
*/
-class HtmlTextExtractor extends ContentTextExtractor {
+final class HtmlTextExtractor extends TextExtractor {
static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
- private static final int MAX_SIZE = 50_000_000; //50MB
+ private final int MAX_SIZE;
+ private final Content file;
static final List WEB_MIME_TYPES = Arrays.asList(
"application/javascript", //NON-NLS
@@ -51,27 +52,51 @@ class HtmlTextExtractor extends ContentTextExtractor {
"text/html", //NON-NLS NON-NLS
"text/javascript" //NON-NLS
);
-
+
static {
// Disable Jericho HTML Parser log messages.
Config.LoggerProvider = LoggerProvider.DISABLED;
}
- @Override
- boolean isContentTypeSpecific() {
- return true;
+ /**
+ * Creates an HtmlTextExtractor that reads from the given content.
+ * isSupported() rejects content larger than 50MB (50,000,000 bytes).
+ */
+ public HtmlTextExtractor(Content file) {
+ //Set default to be 50 MB.
+ MAX_SIZE = 50_000_000;
+ this.file = file;
}
+ /**
+ * Determines if this content type is supported by this extractor.
+ *
+ * @param content Content instance to be analyzed
+ * @param detectedFormat Mimetype of content instance
+ *
+ * @return flag indicating support
+ */
@Override
- boolean isSupported(Content content, String detectedFormat) {
+ public boolean isSupported(Content content, String detectedFormat) {
return detectedFormat != null
&& WEB_MIME_TYPES.contains(detectedFormat)
&& content.getSize() <= MAX_SIZE;
}
+ /**
+ * Returns a reader that will iterate over the text of an HTML document.
+ *
+ * The document is the Content instance supplied to the constructor.
+ *
+ * @return A reader instance containing the document source text
+ *
+ * @throws ExtractionException if an IOException occurs during extraction
+ */
@Override
- public Reader getReader(Content content) throws TextExtractorException {
- ReadContentInputStream stream = new ReadContentInputStream(content);
+ public Reader getReader() throws ExtractionException {
+ //TODO JIRA-4467, there is only harm in excluding HTML documents greater
+ //than 50MB due to our troubled approach of extraction.
+ ReadContentInputStream stream = new ReadContentInputStream(file);
//Parse the stream with Jericho and put the results in a Reader
try {
@@ -164,17 +189,8 @@ class HtmlTextExtractor extends ContentTextExtractor {
// All done, now make it a reader
return new StringReader(stringBuilder.toString());
} catch (IOException ex) {
- throw new TextExtractorException("Error extracting HTML from content.", ex);
+ logger.log(Level.WARNING, "Error extracting HTML from content.", ex);
+ throw new ExtractionException("Error extracting HTML from content.", ex);
}
}
-
- @Override
- public boolean isDisabled() {
- return false;
- }
-
- @Override
- public void logWarning(final String msg, Exception ex) {
- logger.log(Level.WARNING, msg, ex); //NON-NLS }
- }
}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
similarity index 80%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
index f7fff3c134..ea204d5e30 100755
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
@@ -1,24 +1,23 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2018-2018 Basis Technology Corp.
- * Contact: carrier sleuthkit org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2018-2018 Basis Technology Corp.
+ * Contact: carrier sleuthkit org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
-package org.sleuthkit.autopsy.keywordsearch;
+package org.sleuthkit.autopsy.textextractors;
-import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;
@@ -28,37 +27,27 @@ import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
-import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.Content;
/**
- * Dedicated SqliteTextExtractor to solve the problems associated with Tika's
- * Sqlite parser.
+ * Extracts text from SQLite database files.
*
- * Tika problems: 1) Tika fails to open virtual tables 2) Tika fails to open
- * tables with spaces in table name 3) Tika fails to include the table names in
- * output (except for the first table it parses)
+ * This is a dedicated solution to address the problems associated with
+ * Tika's sqlite parser (version 1.17), which include the following:
+ * 1) Virtual tables cause the parser to bail
+ * 2) Tables that contain spaces in their name are not extracted
+ * 3) Table names are not included in its output text
*/
-class SqliteTextExtractor extends ContentTextExtractor {
+final class SqliteTextExtractor extends TextExtractor {
private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
+ private final AbstractFile file;
- @Override
- boolean isContentTypeSpecific() {
- return true;
+ public SqliteTextExtractor(Content file) {
+ this.file = (AbstractFile) file;
}
-
- @Override
- public boolean isDisabled() {
- return false;
- }
-
- @Override
- public void logWarning(String msg, Exception exception) {
- logger.log(Level.WARNING, msg, exception); //NON-NLS
- }
-
/**
* Supports only the sqlite mimetypes
*
@@ -68,44 +57,34 @@ class SqliteTextExtractor extends ContentTextExtractor {
* @return true if x-sqlite3
*/
@Override
- boolean isSupported(Content file, String detectedFormat) {
+ public boolean isSupported(Content file, String detectedFormat) {
return SQLITE_MIMETYPE.equals(detectedFormat);
}
/**
- * Returns a stream that will read from a sqlite database.
+ * Returns a reader that will iterate over the text of a sqlite database.
*
* @param source Content file
*
- * @return An InputStream that reads from a Sqlite database.
+ * @return A Reader that reads from a Sqlite database
*
- * @throws
- * org.sleuthkit.autopsy.keywordsearch.TextExtractor.TextExtractorException
+ * @throws ExtractionException
*/
@Override
- public Reader getReader(Content source) throws TextExtractorException {
- //Firewall for any content that is not an AbstractFile
- if (!AbstractFile.class.isInstance(source)) {
- try {
- return CharSource.wrap("").openStream();
- } catch (IOException ex) {
- throw new TextExtractorException("", ex);
- }
- }
-
- return new SQLiteStreamReader((AbstractFile) source);
+ public Reader getReader() throws ExtractionException {
+ return new SQLiteStreamReader(file);
}
-
+
/**
* Produces a continuous stream of characters from a database file. To
* achieve this, all table names are queues up and a SQLiteTableReader is
* used to do the actual queries and table iteration.
*/
- public class SQLiteStreamReader extends Reader {
+ private class SQLiteStreamReader extends Reader {
private final SQLiteTableReader reader;
private final AbstractFile file;
-
+
private Iterator tableNames;
private String currentTableName;
@@ -217,9 +196,10 @@ class SqliteTextExtractor extends ContentTextExtractor {
}
/**
- * Reads database values into the buffer. This function is responsible for
- * getting the next table in the queue, initiating calls to the SQLiteTableReader,
- * and filling in any excess bytes that are lingering from the previous call.
+ * Reads database values into the buffer. This function is responsible
+ * for getting the next table in the queue, initiating calls to the
+ * SQLiteTableReader, and filling in any excess bytes that are lingering
+ * from the previous call.
*
* @throws IOException
*/
@@ -255,9 +235,9 @@ class SqliteTextExtractor extends ContentTextExtractor {
reader.read(currentTableName, () -> bufIndex == len);
} catch (SQLiteTableReaderException ex) {
logger.log(Level.WARNING, String.format(
- "Error attempting to read file table: [%s]" //NON-NLS
- + " for file: [%s] (id=%d).", currentTableName, //NON-NLS
- file.getName(), file.getId()), ex.getMessage());
+ "Error attempting to read file table: [%s]" //NON-NLS
+ + " for file: [%s] (id=%d).", currentTableName, //NON-NLS
+ file.getName(), file.getId()), ex.getMessage());
}
} else {
if (bufIndex == off) {
@@ -290,8 +270,8 @@ class SqliteTextExtractor extends ContentTextExtractor {
}
/**
- * Wrapper that holds the excess bytes that were left over from the previous
- * call to read().
+ * Wrapper that holds the excess bytes that were left over from the
+ * previous call to read().
*/
private class ExcessBytes {
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
similarity index 85%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
index 391c7d5a7c..899cec9ef2 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
@@ -16,19 +16,19 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.sleuthkit.autopsy.keywordsearch;
+package org.sleuthkit.autopsy.textextractors;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.nio.charset.Charset;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
-import java.util.logging.Level;
-import org.sleuthkit.autopsy.coreutils.Logger;
+import java.util.Objects;
+import org.openide.util.Lookup;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
+import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskException;
@@ -36,24 +36,25 @@ import org.sleuthkit.datamodel.TskException;
/**
* Extracts raw strings from content.
*/
-class StringsTextExtractor extends ContentTextExtractor {
+final class StringsTextExtractor extends TextExtractor {
- static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
-
- /**
- * Options for this extractor
- */
- enum ExtractOptions {
- EXTRACT_UTF16, ///< extract UTF16 text, true/false
- EXTRACT_UTF8, ///< extract UTF8 text, true/false
- };
+ private boolean extractUTF8;
+ private boolean extractUTF16;
+ private final Content content;
+ private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
private final List