diff --git a/Core/ivy.xml b/Core/ivy.xml index 5949f5a777..0088cc75ba 100644 --- a/Core/ivy.xml +++ b/Core/ivy.xml @@ -35,6 +35,11 @@ + + + + diff --git a/Core/nbproject/project.properties b/Core/nbproject/project.properties index b18bef9b6a..4df5159bd6 100644 --- a/Core/nbproject/project.properties +++ b/Core/nbproject/project.properties @@ -1,26 +1,59 @@ file.reference.activemq-all-5.11.1.jar=release/modules/ext/activemq-all-5.11.1.jar +file.reference.apache-mime4j-core-0.8.1.jar=release/modules/ext/apache-mime4j-core-0.8.1.jar +file.reference.apache-mime4j-dom-0.8.1.jar=release/modules/ext/apache-mime4j-dom-0.8.1.jar +file.reference.asm-5.0.4.jar=release/modules/ext/asm-5.0.4.jar +file.reference.bcmail-jdk15on-1.54.jar=release/modules/ext/bcmail-jdk15on-1.54.jar +file.reference.bcprov-jdk15on-1.54.jar=release/modules/ext/bcprov-jdk15on-1.54.jar +file.reference.boilerpipe-1.1.0.jar=release/modules/ext/boilerpipe-1.1.0.jar file.reference.c3p0-0.9.5.jar=release/modules/ext/c3p0-0.9.5.jar +file.reference.cdm-4.5.5.jar=release/modules/ext/cdm-4.5.5.jar +file.reference.commons-codec-1.6.jar=release/modules/ext/commons-codec-1.6.jar file.reference.commons-compress-1.14.jar=release/modules/ext/commons-compress-1.14.jar -file.reference.commons-dbcp2-2.1.1.jar=release\\modules\\ext\\commons-dbcp2-2.1.1.jar -file.reference.commons-pool2-2.4.2.jar=release\\modules\\ext\\commons-pool2-2.4.2.jar +file.reference.commons-dbcp2-2.1.1.jar=release/modules/ext/commons-dbcp2-2.1.1.jar +file.reference.commons-io-2.5.jar=release/modules/ext/commons-io-2.5.jar +file.reference.commons-pool2-2.4.2.jar=release/modules/ext/commons-pool2-2.4.2.jar file.reference.dd-plist-1.20.jar=release/modules/ext/dd-plist-1.20.jar +file.reference.geoapi-3.0.0.jar=release/modules/ext/geoapi-3.0.0.jar +file.reference.grib-4.5.5.jar=release/modules/ext/grib-4.5.5.jar +file.reference.gson-2.8.1.jar=release/modules/ext/gson-2.8.1.jar 
+file.reference.httpservices-4.5.5.jar=release/modules/ext/httpservices-4.5.5.jar +file.reference.isoparser-1.1.18.jar=release/modules/ext/isoparser-1.1.18.jar +file.reference.jackcess-2.2.0.jar=release/modules/ext/jackcess-2.2.0.jar +file.reference.jackcess-encrypt-2.1.4.jar=release/modules/ext/jackcess-encrypt-2.1.4.jar +file.reference.java-libpst-0.8.1.jar=release/modules/ext/java-libpst-0.8.1.jar +file.reference.jcl-over-slf4j-1.7.24.jar=release/modules/ext/jcl-over-slf4j-1.7.24.jar file.reference.jackson-core-2.9.7.jar=release/modules/ext/jackson-core-2.9.7.jar file.reference.jdom-2.0.5-contrib.jar=release/modules/ext/jdom-2.0.5-contrib.jar file.reference.jdom-2.0.5.jar=release/modules/ext/jdom-2.0.5.jar +file.reference.jericho-html-3.3.jar=release/modules/ext/jericho-html-3.3.jar file.reference.jgraphx-v3.8.0.jar=release/modules/ext/jgraphx-v3.8.0.jar +file.reference.jhighlight-1.0.2.jar=release/modules/ext/jhighlight-1.0.2.jar +file.reference.jmatio-1.2.jar=release/modules/ext/jmatio-1.2.jar +file.reference.json-1.8.jar=release/modules/ext/json-1.8.jar +file.reference.json-simple-1.1.1.jar=release/modules/ext/json-simple-1.1.1.jar file.reference.jsoup-1.10.3.jar=release/modules/ext/jsoup-1.10.3.jar +file.reference.jul-to-slf4j-1.7.24.jar=release/modules/ext/jul-to-slf4j-1.7.24.jar +file.reference.juniversalchardet-1.0.3.jar=release/modules/ext/juniversalchardet-1.0.3.jar +file.reference.junrar-0.7.jar=release/modules/ext/junrar-0.7.jar file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar file.reference.mchange-commons-java-0.2.9.jar=release/modules/ext/mchange-commons-java-0.2.9.jar file.reference.metadata-extractor-2.10.1.jar=release/modules/ext/metadata-extractor-2.10.1.jar +file.reference.netcdf4-4.5.5.jar=release/modules/ext/netcdf4-4.5.5.jar +file.reference.opennlp-tools-1.8.3.jar=release/modules/ext/opennlp-tools-1.8.3.jar +file.reference.poi-3.17.jar=release/modules/ext/poi-3.17.jar 
+file.reference.poi-ooxml-3.17.jar=release/modules/ext/poi-ooxml-3.17.jar +file.reference.poi-scratchpad-3.17.jar=release/modules/ext/poi-scratchpad-3.17.jar file.reference.postgresql-9.4.1211.jre7.jar=release/modules/ext/postgresql-9.4.1211.jre7.jar file.reference.Rejistry-1.0-SNAPSHOT.jar=release/modules/ext/Rejistry-1.0-SNAPSHOT.jar +file.reference.rome-1.5.1.jar=release/modules/ext/rome-1.5.1.jar file.reference.sevenzipjbinding-AllPlatforms.jar=release/modules/ext/sevenzipjbinding-AllPlatforms.jar file.reference.sevenzipjbinding.jar=release/modules/ext/sevenzipjbinding.jar -file.reference.sqlite-jdbc-3.8.11.jar=release\\modules\\ext\\sqlite-jdbc-3.8.11.jar +file.reference.sis-metadata-0.6.jar=release/modules/ext/sis-metadata-0.6.jar +file.reference.sis-netcdf-0.6.jar=release/modules/ext/sis-netcdf-0.6.jar +file.reference.sis-utility-0.6.jar=release/modules/ext/sis-utility-0.6.jar +file.reference.slf4j-api-1.7.24.jar=release/modules/ext/slf4j-api-1.7.24.jar +file.reference.sqlite-jdbc-3.8.11.jar=release/modules/ext/sqlite-jdbc-3.8.11.jar file.reference.StixLib.jar=release/modules/ext/StixLib.jar -file.reference.bcprov-jdk15on-1.54.jar=release/modules/ext/bcprov-jdk15on-1.54.jar -file.reference.jackcess-2.2.0.jar=release/modules/ext/jackcess-2.2.0.jar -file.reference.jackcess-encrypt-2.1.4.jar=release/modules/ext/jackcess-encrypt-2.1.4.jar file.reference.jempbox-1.8.13.jar=release/modules/ext/jempbox-1.8.13.jar file.reference.javax.ws.rs-api-2.0.1.jar=release/modules/ext/javax.ws.rs-api-2.0.1.jar file.reference.cxf-core-3.0.16.jar=release/modules/ext/cxf-core-3.0.16.jar @@ -31,11 +64,14 @@ file.reference.fontbox-2.0.8.jar=release/modules/ext/fontbox-2.0.8.jar file.reference.pdfbox-2.0.8.jar=release/modules/ext/pdfbox-2.0.8.jar file.reference.pdfbox-tools-2.0.8.jar=release/modules/ext/pdfbox-tools-2.0.8.jar file.reference.sleuthkit-postgresql-4.6.4.jar=release/modules/ext/sleuthkit-postgresql-4.6.4.jar 
+file.reference.tagsoup-1.2.1.jar=release/modules/ext/tagsoup-1.2.1.jar file.reference.tika-core-1.17.jar=release/modules/ext/tika-core-1.17.jar file.reference.tika-parsers-1.17.jar=release/modules/ext/tika-parsers-1.17.jar file.reference.curator-client-2.8.0.jar=release/modules/ext/curator-client-2.8.0.jar file.reference.curator-framework-2.8.0.jar=release/modules/ext/curator-framework-2.8.0.jar file.reference.curator-recipes-2.8.0.jar=release/modules/ext/curator-recipes-2.8.0.jar +file.reference.vorbis-java-core-0.8.jar=release/modules/ext/vorbis-java-core-0.8.jar +file.reference.vorbis-java-tika-0.8.jar=release/modules/ext/vorbis-java-tika-0.8.jar file.reference.xmpcore-5.1.3.jar=release/modules/ext/xmpcore-5.1.3.jar file.reference.xz-1.6.jar=release/modules/ext/xz-1.6.jar file.reference.zookeeper-3.4.6.jar=release/modules/ext/zookeeper-3.4.6.jar diff --git a/Core/nbproject/project.xml b/Core/nbproject/project.xml index d6f3562663..0e8b544024 100644 --- a/Core/nbproject/project.xml +++ b/Core/nbproject/project.xml @@ -338,81 +338,59 @@ org.sleuthkit.autopsy.modules.vmextractor org.sleuthkit.autopsy.progress org.sleuthkit.autopsy.report + org.sleuthkit.autopsy.textextractors + org.sleuthkit.autopsy.textextractors.extractionconfigs org.sleuthkit.autopsy.texttranslation org.sleuthkit.datamodel + + ext/apache-mime4j-dom-0.8.1.jar + release/modules/ext/apache-mime4j-dom-0.8.1.jar + ext/jackcess-2.2.0.jar release/modules/ext/jackcess-2.2.0.jar - ext/zookeeper-3.4.6.jar - release/modules/ext/zookeeper-3.4.6.jar + ext/jericho-html-3.3.jar + release/modules/ext/jericho-html-3.3.jar - ext/jdom-2.0.5.jar - release/modules/ext/jdom-2.0.5.jar + ext/cdm-4.5.5.jar + release/modules/ext/cdm-4.5.5.jar - ext/cxf-rt-transports-http-3.0.16.jar - release/modules/ext/cxf-rt-transports-http-3.0.16.jar + ext/httpservices-4.5.5.jar + release/modules/ext/httpservices-4.5.5.jar ext/commons-validator-1.6.jar release/modules/ext/commons-validator-1.6.jar - - ext/curator-framework-2.8.0.jar 
- release/modules/ext/curator-framework-2.8.0.jar - - - ext/bcprov-jdk15on-1.54.jar - release/modules/ext/bcprov-jdk15on-1.54.jar - ext/commons-compress-1.14.jar release/modules/ext/commons-compress-1.14.jar - ext/fontbox-2.0.8.jar - release/modules/ext/fontbox-2.0.8.jar + ext/geoapi-3.0.0.jar + release/modules/ext/geoapi-3.0.0.jar - ext/commons-dbcp2-2.1.1.jar - release\modules\ext\commons-dbcp2-2.1.1.jar - - - ext/jgraphx-v3.8.0.jar - release/modules/ext/jgraphx-v3.8.0.jar - - - ext/jython-standalone-2.7.0.jar - release/modules/ext/jython-standalone-2.7.0.jar + ext/boilerpipe-1.1.0.jar + release/modules/ext/boilerpipe-1.1.0.jar ext/sevenzipjbinding.jar release/modules/ext/sevenzipjbinding.jar - ext/sleuthkit-postgresql-4.6.4.jar - release/modules/ext/sleuthkit-postgresql-4.6.4.jar + ext/bcmail-jdk15on-1.54.jar + release/modules/ext/bcmail-jdk15on-1.54.jar ext/mchange-commons-java-0.2.9.jar release/modules/ext/mchange-commons-java-0.2.9.jar - - ext/cxf-core-3.0.16.jar - release/modules/ext/cxf-core-3.0.16.jar - - - ext/javax.ws.rs-api-2.0.1.jar - release/modules/ext/javax.ws.rs-api-2.0.1.jar - - - ext/postgresql-9.4.1211.jre7.jar - release/modules/ext/postgresql-9.4.1211.jre7.jar - ext/curator-recipes-2.8.0.jar release/modules/ext/curator-recipes-2.8.0.jar @@ -421,6 +399,14 @@ ext/metadata-extractor-2.10.1.jar release/modules/ext/metadata-extractor-2.10.1.jar + + ext/apache-mime4j-core-0.8.1.jar + release/modules/ext/apache-mime4j-core-0.8.1.jar + + + ext/tagsoup-1.2.1.jar + release/modules/ext/tagsoup-1.2.1.jar + ext/tika-core-1.17.jar release/modules/ext/tika-core-1.17.jar @@ -429,45 +415,37 @@ ext/StixLib.jar release/modules/ext/StixLib.jar - - ext/curator-client-2.8.0.jar - release/modules/ext/curator-client-2.8.0.jar - - - ext/jackson-core-2.9.7.jar - release/modules/ext/jackson-core-2.9.7.jar - - - ext/cxf-rt-frontend-jaxrs-3.0.16.jar - release/modules/ext/cxf-rt-frontend-jaxrs-3.0.16.jar - ext/pdfbox-tools-2.0.8.jar 
release/modules/ext/pdfbox-tools-2.0.8.jar + + ext/asm-5.0.4.jar + release/modules/ext/asm-5.0.4.jar + + + ext/jcl-over-slf4j-1.7.24.jar + release/modules/ext/jcl-over-slf4j-1.7.24.jar + ext/tika-parsers-1.17.jar release/modules/ext/tika-parsers-1.17.jar ext/sqlite-jdbc-3.8.11.jar - release\modules\ext\sqlite-jdbc-3.8.11.jar + release/modules/ext/sqlite-jdbc-3.8.11.jar - ext/activemq-all-5.11.1.jar - release/modules/ext/activemq-all-5.11.1.jar + ext/json-simple-1.1.1.jar + release/modules/ext/json-simple-1.1.1.jar - ext/xz-1.6.jar - release/modules/ext/xz-1.6.jar + ext/sis-utility-0.6.jar + release/modules/ext/sis-utility-0.6.jar - ext/Rejistry-1.0-SNAPSHOT.jar - release/modules/ext/Rejistry-1.0-SNAPSHOT.jar - - - ext/dd-plist-1.20.jar - release/modules/ext/dd-plist-1.20.jar + ext/jhighlight-1.0.2.jar + release/modules/ext/jhighlight-1.0.2.jar ext/jempbox-1.8.13.jar @@ -477,21 +455,9 @@ ext/cxf-rt-rs-client-3.0.16.jar release/modules/ext/cxf-rt-rs-client-3.0.16.jar - - ext/sevenzipjbinding-AllPlatforms.jar - release/modules/ext/sevenzipjbinding-AllPlatforms.jar - ext/commons-pool2-2.4.2.jar - release\modules\ext\commons-pool2-2.4.2.jar - - - ext/jackcess-encrypt-2.1.4.jar - release/modules/ext/jackcess-encrypt-2.1.4.jar - - - ext/jsoup-1.10.3.jar - release/modules/ext/jsoup-1.10.3.jar + release/modules/ext/commons-pool2-2.4.2.jar ext/jdom-2.0.5-contrib.jar @@ -513,6 +479,190 @@ ext/xmpcore-5.1.3.jar release/modules/ext/xmpcore-5.1.3.jar + + ext/zookeeper-3.4.6.jar + release/modules/ext/zookeeper-3.4.6.jar + + + ext/jdom-2.0.5.jar + release/modules/ext/jdom-2.0.5.jar + + + ext/cxf-rt-transports-http-3.0.16.jar + release/modules/ext/cxf-rt-transports-http-3.0.16.jar + + + ext/sis-metadata-0.6.jar + release/modules/ext/sis-metadata-0.6.jar + + + ext/isoparser-1.1.18.jar + release/modules/ext/isoparser-1.1.18.jar + + + ext/sleuthkit-postgresql-4.6.4.jar + release/modules/ext/sleuthkit-postgresql-4.6.4.jar + + + ext/vorbis-java-core-0.8.jar + 
release/modules/ext/vorbis-java-core-0.8.jar + + + ext/commons-codec-1.6.jar + release/modules/ext/commons-codec-1.6.jar + + + ext/netcdf4-4.5.5.jar + release/modules/ext/netcdf4-4.5.5.jar + + + ext/slf4j-api-1.7.24.jar + release/modules/ext/slf4j-api-1.7.24.jar + + + ext/java-libpst-0.8.1.jar + release/modules/ext/java-libpst-0.8.1.jar + + + ext/jul-to-slf4j-1.7.24.jar + release/modules/ext/jul-to-slf4j-1.7.24.jar + + + ext/gson-2.8.1.jar + release/modules/ext/gson-2.8.1.jar + + + ext/poi-3.17.jar + release/modules/ext/poi-3.17.jar + + + ext/poi-scratchpad-3.17.jar + release/modules/ext/poi-scratchpad-3.17.jar + + + ext/sis-netcdf-0.6.jar + release/modules/ext/sis-netcdf-0.6.jar + + + ext/commons-io-2.5.jar + release/modules/ext/commons-io-2.5.jar + + + ext/curator-framework-2.8.0.jar + release/modules/ext/curator-framework-2.8.0.jar + + + ext/bcprov-jdk15on-1.54.jar + release/modules/ext/bcprov-jdk15on-1.54.jar + + + ext/fontbox-2.0.8.jar + release/modules/ext/fontbox-2.0.8.jar + + + ext/commons-dbcp2-2.1.1.jar + release/modules/ext/commons-dbcp2-2.1.1.jar + + + ext/jgraphx-v3.8.0.jar + release/modules/ext/jgraphx-v3.8.0.jar + + + ext/juniversalchardet-1.0.3.jar + release/modules/ext/juniversalchardet-1.0.3.jar + + + ext/jython-standalone-2.7.0.jar + release/modules/ext/jython-standalone-2.7.0.jar + + + ext/jackcess-encrypt-2.1.4.jar + release/modules/ext/jackcess-encrypt-2.1.4.jar + + + ext/cxf-core-3.0.16.jar + release/modules/ext/cxf-core-3.0.16.jar + + + ext/javax.ws.rs-api-2.0.1.jar + release/modules/ext/javax.ws.rs-api-2.0.1.jar + + + ext/opennlp-tools-1.8.3.jar + release/modules/ext/opennlp-tools-1.8.3.jar + + + ext/junrar-0.7.jar + release/modules/ext/junrar-0.7.jar + + + ext/postgresql-9.4.1211.jre7.jar + release/modules/ext/postgresql-9.4.1211.jre7.jar + + + ext/poi-ooxml-3.17.jar + release/modules/ext/poi-ooxml-3.17.jar + + + ext/curator-client-2.8.0.jar + release/modules/ext/curator-client-2.8.0.jar + + + ext/jackson-core-2.9.7.jar + 
release/modules/ext/jackson-core-2.9.7.jar + + + ext/cxf-rt-frontend-jaxrs-3.0.16.jar + release/modules/ext/cxf-rt-frontend-jaxrs-3.0.16.jar + + + ext/grib-4.5.5.jar + release/modules/ext/grib-4.5.5.jar + + + ext/jackson-core-2.9.2.jar + release/modules/ext/jackson-core-2.9.2.jar + + + ext/activemq-all-5.11.1.jar + release/modules/ext/activemq-all-5.11.1.jar + + + ext/xz-1.6.jar + release/modules/ext/xz-1.6.jar + + + ext/Rejistry-1.0-SNAPSHOT.jar + release/modules/ext/Rejistry-1.0-SNAPSHOT.jar + + + ext/dd-plist-1.20.jar + release/modules/ext/dd-plist-1.20.jar + + + ext/rome-1.5.1.jar + release/modules/ext/rome-1.5.1.jar + + + ext/sevenzipjbinding-AllPlatforms.jar + release/modules/ext/sevenzipjbinding-AllPlatforms.jar + + + ext/jmatio-1.2.jar + release/modules/ext/jmatio-1.2.jar + + + ext/jsoup-1.10.3.jar + release/modules/ext/jsoup-1.10.3.jar + + + ext/vorbis-java-tika-0.8.jar + release/modules/ext/vorbis-java-tika-0.8.jar + + + ext/json-1.8.jar + release/modules/ext/json-1.8.jar + diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java new file mode 100644 index 0000000000..ba91a6cc3a --- /dev/null +++ b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java @@ -0,0 +1,89 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2011-2018 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.sleuthkit.autopsy.textextractors; + +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; +import org.apache.commons.io.IOUtils; +import org.sleuthkit.autopsy.datamodel.ContentUtils; +import org.sleuthkit.datamodel.BlackboardArtifact; +import org.sleuthkit.datamodel.BlackboardAttribute; +import org.sleuthkit.datamodel.Content; +import org.sleuthkit.datamodel.TskCoreException; + +/** + * Extracts text from artifacts by concatenating the values of all of the + * artifact's attributes. + */ +class ArtifactTextExtractor extends TextExtractor { + + private final BlackboardArtifact artifact; + + public ArtifactTextExtractor(Content artifact) { + this.artifact = (BlackboardArtifact) artifact; + } + + @Override + public Reader getReader() throws ExtractionException { + // Concatenate the string values of all attributes into a single + // "content" string to be indexed. + StringBuilder artifactContents = new StringBuilder(); + + Content dataSource = null; + try { + dataSource = artifact.getDataSource(); + } catch (TskCoreException tskCoreException) { + throw new ExtractionException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException); + } + if (dataSource == null) { + throw new ExtractionException("Datasource was null for artifact: " + artifact.toString()); + } + + try { + for (BlackboardAttribute attribute : artifact.getAttributes()) { + artifactContents.append(attribute.getAttributeType().getDisplayName()); + artifactContents.append(" : "); + // We have also discussed modifying BlackboardAttribute.getDisplayString() + // to magically format datetime attributes but that is complicated by + // the fact that BlackboardAttribute exists in Sleuthkit data model + // while the utility to determine the timezone to use is in ContentUtils + // in the Autopsy datamodel. 
+ switch (attribute.getValueType()) { + case DATETIME: + artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource)); + break; + default: + artifactContents.append(attribute.getDisplayString()); + } + artifactContents.append(System.lineSeparator()); + } + } catch (TskCoreException tskCoreException) { + throw new ExtractionException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException); + } + + return new InputStreamReader(IOUtils.toInputStream(artifactContents, + StandardCharsets.UTF_8), StandardCharsets.UTF_8); + } + + @Override + public boolean isSupported(Content file, String detectedFormat) { + return true; + } +} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java similarity index 81% rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java rename to Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java index 32842dbc03..86dbd15c1b 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.sleuthkit.autopsy.keywordsearch; +package org.sleuthkit.autopsy.textextractors; import java.io.IOException; import java.io.Reader; @@ -38,10 +38,11 @@ import org.sleuthkit.datamodel.ReadContentInputStream; /** * Extracts text from HTML content. 
*/ -class HtmlTextExtractor extends ContentTextExtractor { +final class HtmlTextExtractor extends TextExtractor { static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName()); - private static final int MAX_SIZE = 50_000_000; //50MB + private final int MAX_SIZE; + private final Content file; static final List WEB_MIME_TYPES = Arrays.asList( "application/javascript", //NON-NLS @@ -51,27 +52,51 @@ class HtmlTextExtractor extends ContentTextExtractor { "text/html", //NON-NLS NON-NLS "text/javascript" //NON-NLS ); - + static { // Disable Jericho HTML Parser log messages. Config.LoggerProvider = LoggerProvider.DISABLED; } - @Override - boolean isContentTypeSpecific() { - return true; + /** + * Creates a default instance of the HtmlTextExtractor. Supported file size + * is 50MB. + */ + public HtmlTextExtractor(Content file) { + //Set default to be 50 MB. + MAX_SIZE = 50_000_000; + this.file = file; } + /** + * Determines if this content type is supported by this extractor. + * + * @param content Content instance to be analyzed + * @param detectedFormat Mimetype of content instance + * + * @return flag indicating support + */ @Override - boolean isSupported(Content content, String detectedFormat) { + public boolean isSupported(Content content, String detectedFormat) { return detectedFormat != null && WEB_MIME_TYPES.contains(detectedFormat) && content.getSize() <= MAX_SIZE; } + /** + * Returns a reader that will iterate over the text of an HTML document. 
+ * + * @param content Html document source + * + * @return A reader instance containing the document source text + * + * @throws ExtractionException + */ @Override - public Reader getReader(Content content) throws TextExtractorException { - ReadContentInputStream stream = new ReadContentInputStream(content); + public Reader getReader() throws ExtractionException { + //TODO JIRA-4467, there is only harm in excluding HTML documents greater + //than 50MB due to our troubled approach of extraction. + ReadContentInputStream stream = new ReadContentInputStream(file); //Parse the stream with Jericho and put the results in a Reader try { @@ -164,17 +189,8 @@ class HtmlTextExtractor extends ContentTextExtractor { // All done, now make it a reader return new StringReader(stringBuilder.toString()); } catch (IOException ex) { - throw new TextExtractorException("Error extracting HTML from content.", ex); + logger.log(Level.WARNING, "Error extracting HTML from content.", ex); + throw new ExtractionException("Error extracting HTML from content.", ex); } } - - @Override - public boolean isDisabled() { - return false; - } - - @Override - public void logWarning(final String msg, Exception ex) { - logger.log(Level.WARNING, msg, ex); //NON-NLS } - } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java similarity index 80% rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteTextExtractor.java rename to Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java index f7fff3c134..ea204d5e30 100755 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java @@ -1,24 +1,23 @@ -/* - * Autopsy Forensic Browser - * - * Copyright 2018-2018 Basis Technology Corp. 
- * Contact: carrier sleuthkit org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. +/* + * Autopsy Forensic Browser + * + * Copyright 2018-2018 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ -package org.sleuthkit.autopsy.keywordsearch; +package org.sleuthkit.autopsy.textextractors; -import com.google.common.io.CharSource; import java.io.IOException; import java.io.Reader; import java.util.Iterator; @@ -28,37 +27,27 @@ import java.util.logging.Level; import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.SQLiteTableReader; -import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.AbstractFile; +import org.sleuthkit.datamodel.Content; /** - * Dedicated SqliteTextExtractor to solve the problems associated with Tika's - * Sqlite parser. + * Extracts text from SQLite database files. 
* - * Tika problems: 1) Tika fails to open virtual tables 2) Tika fails to open - * tables with spaces in table name 3) Tika fails to include the table names in - * output (except for the first table it parses) + * This is a dedicated solution to address the problems associated with + * Tika's sqlite parser (version 1.17), which include the following: + * 1) Virtual tables cause the parser to bail + * 2) Tables that contain spaces in their name are not extracted + * 3) Table names are not included in its output text */ -class SqliteTextExtractor extends ContentTextExtractor { +final class SqliteTextExtractor extends TextExtractor { private static final String SQLITE_MIMETYPE = "application/x-sqlite3"; private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName()); + private final AbstractFile file; - @Override - boolean isContentTypeSpecific() { - return true; + public SqliteTextExtractor(Content file) { + this.file = (AbstractFile) file; } - - @Override - public boolean isDisabled() { - return false; - } - - @Override - public void logWarning(String msg, Exception exception) { - logger.log(Level.WARNING, msg, exception); //NON-NLS - } - /** * Supports only the sqlite mimetypes * @@ -68,44 +57,34 @@ class SqliteTextExtractor extends ContentTextExtractor { * @return true if x-sqlite3 */ @Override - boolean isSupported(Content file, String detectedFormat) { + public boolean isSupported(Content file, String detectedFormat) { return SQLITE_MIMETYPE.equals(detectedFormat); } /** - * Returns a stream that will read from a sqlite database. + * Returns a reader that will iterate over the text of a sqlite database. * * @param source Content file * - * @return An InputStream that reads from a Sqlite database. 
+ * @return A Reader that reads from a Sqlite database * - * @throws - * org.sleuthkit.autopsy.keywordsearch.TextExtractor.TextExtractorException + * @throws ExtractionException */ @Override - public Reader getReader(Content source) throws TextExtractorException { - //Firewall for any content that is not an AbstractFile - if (!AbstractFile.class.isInstance(source)) { - try { - return CharSource.wrap("").openStream(); - } catch (IOException ex) { - throw new TextExtractorException("", ex); - } - } - - return new SQLiteStreamReader((AbstractFile) source); + public Reader getReader() throws ExtractionException { + return new SQLiteStreamReader(file); } - + /** * Produces a continuous stream of characters from a database file. To * achieve this, all table names are queues up and a SQLiteTableReader is * used to do the actual queries and table iteration. */ - public class SQLiteStreamReader extends Reader { + private class SQLiteStreamReader extends Reader { private final SQLiteTableReader reader; private final AbstractFile file; - + private Iterator tableNames; private String currentTableName; @@ -217,9 +196,10 @@ class SqliteTextExtractor extends ContentTextExtractor { } /** - * Reads database values into the buffer. This function is responsible for - * getting the next table in the queue, initiating calls to the SQLiteTableReader, - * and filling in any excess bytes that are lingering from the previous call. + * Reads database values into the buffer. This function is responsible + * for getting the next table in the queue, initiating calls to the + * SQLiteTableReader, and filling in any excess bytes that are lingering + * from the previous call. 
* * @throws IOException */ @@ -255,9 +235,9 @@ class SqliteTextExtractor extends ContentTextExtractor { reader.read(currentTableName, () -> bufIndex == len); } catch (SQLiteTableReaderException ex) { logger.log(Level.WARNING, String.format( - "Error attempting to read file table: [%s]" //NON-NLS - + " for file: [%s] (id=%d).", currentTableName, //NON-NLS - file.getName(), file.getId()), ex.getMessage()); + "Error attempting to read file table: [%s]" //NON-NLS + + " for file: [%s] (id=%d).", currentTableName, //NON-NLS + file.getName(), file.getId()), ex.getMessage()); } } else { if (bufIndex == off) { @@ -290,8 +270,8 @@ class SqliteTextExtractor extends ContentTextExtractor { } /** - * Wrapper that holds the excess bytes that were left over from the previous - * call to read(). + * Wrapper that holds the excess bytes that were left over from the + * previous call to read(). */ private class ExcessBytes { diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java similarity index 85% rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java rename to Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java index 391c7d5a7c..899cec9ef2 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java @@ -16,19 +16,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.sleuthkit.autopsy.keywordsearch; +package org.sleuthkit.autopsy.textextractors; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.nio.charset.Charset; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; -import java.util.logging.Level; -import org.sleuthkit.autopsy.coreutils.Logger; +import java.util.Objects; +import org.openide.util.Lookup; import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; +import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig; import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.TskCoreException; import org.sleuthkit.datamodel.TskException; @@ -36,24 +36,25 @@ import org.sleuthkit.datamodel.TskException; /** * Extracts raw strings from content. */ -class StringsTextExtractor extends ContentTextExtractor { +final class StringsTextExtractor extends TextExtractor { - static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName()); - - /** - * Options for this extractor - */ - enum ExtractOptions { - EXTRACT_UTF16, ///< extract UTF16 text, true/false - EXTRACT_UTF8, ///< extract UTF8 text, true/false - }; + private boolean extractUTF8; + private boolean extractUTF16; + private final Content content; + private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8"; private final List