Merge pull request #4330 from dannysmyda/4425-text-abstraction-impl

4425 - Move TextExtractors out of KWS and into Core.
This commit is contained in:
Richard Cordovano 2018-12-10 16:49:27 -05:00 committed by GitHub
commit 14d6c52e09
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
23 changed files with 1233 additions and 754 deletions

View File

@ -35,6 +35,11 @@
<dependency conf="core->default" org="com.fasterxml.jackson.core" name="jackson-core" rev="2.9.7"/> <dependency conf="core->default" org="com.fasterxml.jackson.core" name="jackson-core" rev="2.9.7"/>
<dependency conf="core->default" org="commons-validator" name="commons-validator" rev="1.6"/> <dependency conf="core->default" org="commons-validator" name="commons-validator" rev="1.6"/>
<dependency conf="core->default" org="net.htmlparser.jericho" name="jericho-html" rev="3.3"/>
<!-- Tika 1.14 seems to declare a (transitive?) dependency on cleartk-util 3.2.2, but the most recent
version available is 2.0.0. Overriding the version to 2.0.0 worked. -->
<override org="org.cleartk" module="cleartk-util" rev="2.0.0"/>
</dependencies> </dependencies>
</ivy-module> </ivy-module>

View File

@ -1,26 +1,59 @@
file.reference.activemq-all-5.11.1.jar=release/modules/ext/activemq-all-5.11.1.jar file.reference.activemq-all-5.11.1.jar=release/modules/ext/activemq-all-5.11.1.jar
file.reference.apache-mime4j-core-0.8.1.jar=release/modules/ext/apache-mime4j-core-0.8.1.jar
file.reference.apache-mime4j-dom-0.8.1.jar=release/modules/ext/apache-mime4j-dom-0.8.1.jar
file.reference.asm-5.0.4.jar=release/modules/ext/asm-5.0.4.jar
file.reference.bcmail-jdk15on-1.54.jar=release/modules/ext/bcmail-jdk15on-1.54.jar
file.reference.bcprov-jdk15on-1.54.jar=release/modules/ext/bcprov-jdk15on-1.54.jar
file.reference.boilerpipe-1.1.0.jar=release/modules/ext/boilerpipe-1.1.0.jar
file.reference.c3p0-0.9.5.jar=release/modules/ext/c3p0-0.9.5.jar file.reference.c3p0-0.9.5.jar=release/modules/ext/c3p0-0.9.5.jar
file.reference.cdm-4.5.5.jar=release/modules/ext/cdm-4.5.5.jar
file.reference.commons-codec-1.6.jar=release/modules/ext/commons-codec-1.6.jar
file.reference.commons-compress-1.14.jar=release/modules/ext/commons-compress-1.14.jar file.reference.commons-compress-1.14.jar=release/modules/ext/commons-compress-1.14.jar
file.reference.commons-dbcp2-2.1.1.jar=release\\modules\\ext\\commons-dbcp2-2.1.1.jar file.reference.commons-dbcp2-2.1.1.jar=release/modules/ext/commons-dbcp2-2.1.1.jar
file.reference.commons-pool2-2.4.2.jar=release\\modules\\ext\\commons-pool2-2.4.2.jar file.reference.commons-io-2.5.jar=release/modules/ext/commons-io-2.5.jar
file.reference.commons-pool2-2.4.2.jar=release/modules/ext/commons-pool2-2.4.2.jar
file.reference.dd-plist-1.20.jar=release/modules/ext/dd-plist-1.20.jar file.reference.dd-plist-1.20.jar=release/modules/ext/dd-plist-1.20.jar
file.reference.geoapi-3.0.0.jar=release/modules/ext/geoapi-3.0.0.jar
file.reference.grib-4.5.5.jar=release/modules/ext/grib-4.5.5.jar
file.reference.gson-2.8.1.jar=release/modules/ext/gson-2.8.1.jar
file.reference.httpservices-4.5.5.jar=release/modules/ext/httpservices-4.5.5.jar
file.reference.isoparser-1.1.18.jar=release/modules/ext/isoparser-1.1.18.jar
file.reference.jackcess-2.2.0.jar=release/modules/ext/jackcess-2.2.0.jar
file.reference.jackcess-encrypt-2.1.4.jar=release/modules/ext/jackcess-encrypt-2.1.4.jar
file.reference.java-libpst-0.8.1.jar=release/modules/ext/java-libpst-0.8.1.jar
file.reference.jcl-over-slf4j-1.7.24.jar=release/modules/ext/jcl-over-slf4j-1.7.24.jar
file.reference.jackson-core-2.9.7.jar=release/modules/ext/jackson-core-2.9.7.jar file.reference.jackson-core-2.9.7.jar=release/modules/ext/jackson-core-2.9.7.jar
file.reference.jdom-2.0.5-contrib.jar=release/modules/ext/jdom-2.0.5-contrib.jar file.reference.jdom-2.0.5-contrib.jar=release/modules/ext/jdom-2.0.5-contrib.jar
file.reference.jdom-2.0.5.jar=release/modules/ext/jdom-2.0.5.jar file.reference.jdom-2.0.5.jar=release/modules/ext/jdom-2.0.5.jar
file.reference.jericho-html-3.3.jar=release/modules/ext/jericho-html-3.3.jar
file.reference.jgraphx-v3.8.0.jar=release/modules/ext/jgraphx-v3.8.0.jar file.reference.jgraphx-v3.8.0.jar=release/modules/ext/jgraphx-v3.8.0.jar
file.reference.jhighlight-1.0.2.jar=release/modules/ext/jhighlight-1.0.2.jar
file.reference.jmatio-1.2.jar=release/modules/ext/jmatio-1.2.jar
file.reference.json-1.8.jar=release/modules/ext/json-1.8.jar
file.reference.json-simple-1.1.1.jar=release/modules/ext/json-simple-1.1.1.jar
file.reference.jsoup-1.10.3.jar=release/modules/ext/jsoup-1.10.3.jar file.reference.jsoup-1.10.3.jar=release/modules/ext/jsoup-1.10.3.jar
file.reference.jul-to-slf4j-1.7.24.jar=release/modules/ext/jul-to-slf4j-1.7.24.jar
file.reference.juniversalchardet-1.0.3.jar=release/modules/ext/juniversalchardet-1.0.3.jar
file.reference.junrar-0.7.jar=release/modules/ext/junrar-0.7.jar
file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
file.reference.mchange-commons-java-0.2.9.jar=release/modules/ext/mchange-commons-java-0.2.9.jar file.reference.mchange-commons-java-0.2.9.jar=release/modules/ext/mchange-commons-java-0.2.9.jar
file.reference.metadata-extractor-2.10.1.jar=release/modules/ext/metadata-extractor-2.10.1.jar file.reference.metadata-extractor-2.10.1.jar=release/modules/ext/metadata-extractor-2.10.1.jar
file.reference.netcdf4-4.5.5.jar=release/modules/ext/netcdf4-4.5.5.jar
file.reference.opennlp-tools-1.8.3.jar=release/modules/ext/opennlp-tools-1.8.3.jar
file.reference.poi-3.17.jar=release/modules/ext/poi-3.17.jar
file.reference.poi-ooxml-3.17.jar=release/modules/ext/poi-ooxml-3.17.jar
file.reference.poi-scratchpad-3.17.jar=release/modules/ext/poi-scratchpad-3.17.jar
file.reference.postgresql-9.4.1211.jre7.jar=release/modules/ext/postgresql-9.4.1211.jre7.jar file.reference.postgresql-9.4.1211.jre7.jar=release/modules/ext/postgresql-9.4.1211.jre7.jar
file.reference.Rejistry-1.0-SNAPSHOT.jar=release/modules/ext/Rejistry-1.0-SNAPSHOT.jar file.reference.Rejistry-1.0-SNAPSHOT.jar=release/modules/ext/Rejistry-1.0-SNAPSHOT.jar
file.reference.rome-1.5.1.jar=release/modules/ext/rome-1.5.1.jar
file.reference.sevenzipjbinding-AllPlatforms.jar=release/modules/ext/sevenzipjbinding-AllPlatforms.jar file.reference.sevenzipjbinding-AllPlatforms.jar=release/modules/ext/sevenzipjbinding-AllPlatforms.jar
file.reference.sevenzipjbinding.jar=release/modules/ext/sevenzipjbinding.jar file.reference.sevenzipjbinding.jar=release/modules/ext/sevenzipjbinding.jar
file.reference.sqlite-jdbc-3.8.11.jar=release\\modules\\ext\\sqlite-jdbc-3.8.11.jar file.reference.sis-metadata-0.6.jar=release/modules/ext/sis-metadata-0.6.jar
file.reference.sis-netcdf-0.6.jar=release/modules/ext/sis-netcdf-0.6.jar
file.reference.sis-utility-0.6.jar=release/modules/ext/sis-utility-0.6.jar
file.reference.slf4j-api-1.7.24.jar=release/modules/ext/slf4j-api-1.7.24.jar
file.reference.sqlite-jdbc-3.8.11.jar=release/modules/ext/sqlite-jdbc-3.8.11.jar
file.reference.StixLib.jar=release/modules/ext/StixLib.jar file.reference.StixLib.jar=release/modules/ext/StixLib.jar
file.reference.bcprov-jdk15on-1.54.jar=release/modules/ext/bcprov-jdk15on-1.54.jar
file.reference.jackcess-2.2.0.jar=release/modules/ext/jackcess-2.2.0.jar
file.reference.jackcess-encrypt-2.1.4.jar=release/modules/ext/jackcess-encrypt-2.1.4.jar
file.reference.jempbox-1.8.13.jar=release/modules/ext/jempbox-1.8.13.jar file.reference.jempbox-1.8.13.jar=release/modules/ext/jempbox-1.8.13.jar
file.reference.javax.ws.rs-api-2.0.1.jar=release/modules/ext/javax.ws.rs-api-2.0.1.jar file.reference.javax.ws.rs-api-2.0.1.jar=release/modules/ext/javax.ws.rs-api-2.0.1.jar
file.reference.cxf-core-3.0.16.jar=release/modules/ext/cxf-core-3.0.16.jar file.reference.cxf-core-3.0.16.jar=release/modules/ext/cxf-core-3.0.16.jar
@ -31,11 +64,14 @@ file.reference.fontbox-2.0.8.jar=release/modules/ext/fontbox-2.0.8.jar
file.reference.pdfbox-2.0.8.jar=release/modules/ext/pdfbox-2.0.8.jar file.reference.pdfbox-2.0.8.jar=release/modules/ext/pdfbox-2.0.8.jar
file.reference.pdfbox-tools-2.0.8.jar=release/modules/ext/pdfbox-tools-2.0.8.jar file.reference.pdfbox-tools-2.0.8.jar=release/modules/ext/pdfbox-tools-2.0.8.jar
file.reference.sleuthkit-postgresql-4.6.4.jar=release/modules/ext/sleuthkit-postgresql-4.6.4.jar file.reference.sleuthkit-postgresql-4.6.4.jar=release/modules/ext/sleuthkit-postgresql-4.6.4.jar
file.reference.tagsoup-1.2.1.jar=release/modules/ext/tagsoup-1.2.1.jar
file.reference.tika-core-1.17.jar=release/modules/ext/tika-core-1.17.jar file.reference.tika-core-1.17.jar=release/modules/ext/tika-core-1.17.jar
file.reference.tika-parsers-1.17.jar=release/modules/ext/tika-parsers-1.17.jar file.reference.tika-parsers-1.17.jar=release/modules/ext/tika-parsers-1.17.jar
file.reference.curator-client-2.8.0.jar=release/modules/ext/curator-client-2.8.0.jar file.reference.curator-client-2.8.0.jar=release/modules/ext/curator-client-2.8.0.jar
file.reference.curator-framework-2.8.0.jar=release/modules/ext/curator-framework-2.8.0.jar file.reference.curator-framework-2.8.0.jar=release/modules/ext/curator-framework-2.8.0.jar
file.reference.curator-recipes-2.8.0.jar=release/modules/ext/curator-recipes-2.8.0.jar file.reference.curator-recipes-2.8.0.jar=release/modules/ext/curator-recipes-2.8.0.jar
file.reference.vorbis-java-core-0.8.jar=release/modules/ext/vorbis-java-core-0.8.jar
file.reference.vorbis-java-tika-0.8.jar=release/modules/ext/vorbis-java-tika-0.8.jar
file.reference.xmpcore-5.1.3.jar=release/modules/ext/xmpcore-5.1.3.jar file.reference.xmpcore-5.1.3.jar=release/modules/ext/xmpcore-5.1.3.jar
file.reference.xz-1.6.jar=release/modules/ext/xz-1.6.jar file.reference.xz-1.6.jar=release/modules/ext/xz-1.6.jar
file.reference.zookeeper-3.4.6.jar=release/modules/ext/zookeeper-3.4.6.jar file.reference.zookeeper-3.4.6.jar=release/modules/ext/zookeeper-3.4.6.jar

View File

@ -338,81 +338,59 @@
<package>org.sleuthkit.autopsy.modules.vmextractor</package> <package>org.sleuthkit.autopsy.modules.vmextractor</package>
<package>org.sleuthkit.autopsy.progress</package> <package>org.sleuthkit.autopsy.progress</package>
<package>org.sleuthkit.autopsy.report</package> <package>org.sleuthkit.autopsy.report</package>
<package>org.sleuthkit.autopsy.textextractors</package>
<package>org.sleuthkit.autopsy.textextractors.extractionconfigs</package>
<package>org.sleuthkit.autopsy.texttranslation</package> <package>org.sleuthkit.autopsy.texttranslation</package>
<package>org.sleuthkit.datamodel</package> <package>org.sleuthkit.datamodel</package>
</public-packages> </public-packages>
<class-path-extension>
<runtime-relative-path>ext/apache-mime4j-dom-0.8.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/apache-mime4j-dom-0.8.1.jar</binary-origin>
</class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/jackcess-2.2.0.jar</runtime-relative-path> <runtime-relative-path>ext/jackcess-2.2.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jackcess-2.2.0.jar</binary-origin> <binary-origin>release/modules/ext/jackcess-2.2.0.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/zookeeper-3.4.6.jar</runtime-relative-path> <runtime-relative-path>ext/jericho-html-3.3.jar</runtime-relative-path>
<binary-origin>release/modules/ext/zookeeper-3.4.6.jar</binary-origin> <binary-origin>release/modules/ext/jericho-html-3.3.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/jdom-2.0.5.jar</runtime-relative-path> <runtime-relative-path>ext/cdm-4.5.5.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jdom-2.0.5.jar</binary-origin> <binary-origin>release/modules/ext/cdm-4.5.5.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/cxf-rt-transports-http-3.0.16.jar</runtime-relative-path> <runtime-relative-path>ext/httpservices-4.5.5.jar</runtime-relative-path>
<binary-origin>release/modules/ext/cxf-rt-transports-http-3.0.16.jar</binary-origin> <binary-origin>release/modules/ext/httpservices-4.5.5.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/commons-validator-1.6.jar</runtime-relative-path> <runtime-relative-path>ext/commons-validator-1.6.jar</runtime-relative-path>
<binary-origin>release/modules/ext/commons-validator-1.6.jar</binary-origin> <binary-origin>release/modules/ext/commons-validator-1.6.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/curator-framework-2.8.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/curator-framework-2.8.0.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/bcprov-jdk15on-1.54.jar</runtime-relative-path>
<binary-origin>release/modules/ext/bcprov-jdk15on-1.54.jar</binary-origin>
</class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/commons-compress-1.14.jar</runtime-relative-path> <runtime-relative-path>ext/commons-compress-1.14.jar</runtime-relative-path>
<binary-origin>release/modules/ext/commons-compress-1.14.jar</binary-origin> <binary-origin>release/modules/ext/commons-compress-1.14.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/fontbox-2.0.8.jar</runtime-relative-path> <runtime-relative-path>ext/geoapi-3.0.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/fontbox-2.0.8.jar</binary-origin> <binary-origin>release/modules/ext/geoapi-3.0.0.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/commons-dbcp2-2.1.1.jar</runtime-relative-path> <runtime-relative-path>ext/boilerpipe-1.1.0.jar</runtime-relative-path>
<binary-origin>release\modules\ext\commons-dbcp2-2.1.1.jar</binary-origin> <binary-origin>release/modules/ext/boilerpipe-1.1.0.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jgraphx-v3.8.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jgraphx-v3.8.0.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jython-standalone-2.7.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jython-standalone-2.7.0.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/sevenzipjbinding.jar</runtime-relative-path> <runtime-relative-path>ext/sevenzipjbinding.jar</runtime-relative-path>
<binary-origin>release/modules/ext/sevenzipjbinding.jar</binary-origin> <binary-origin>release/modules/ext/sevenzipjbinding.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/sleuthkit-postgresql-4.6.4.jar</runtime-relative-path> <runtime-relative-path>ext/bcmail-jdk15on-1.54.jar</runtime-relative-path>
<binary-origin>release/modules/ext/sleuthkit-postgresql-4.6.4.jar</binary-origin> <binary-origin>release/modules/ext/bcmail-jdk15on-1.54.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/mchange-commons-java-0.2.9.jar</runtime-relative-path> <runtime-relative-path>ext/mchange-commons-java-0.2.9.jar</runtime-relative-path>
<binary-origin>release/modules/ext/mchange-commons-java-0.2.9.jar</binary-origin> <binary-origin>release/modules/ext/mchange-commons-java-0.2.9.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/cxf-core-3.0.16.jar</runtime-relative-path>
<binary-origin>release/modules/ext/cxf-core-3.0.16.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/javax.ws.rs-api-2.0.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/javax.ws.rs-api-2.0.1.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/postgresql-9.4.1211.jre7.jar</runtime-relative-path>
<binary-origin>release/modules/ext/postgresql-9.4.1211.jre7.jar</binary-origin>
</class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/curator-recipes-2.8.0.jar</runtime-relative-path> <runtime-relative-path>ext/curator-recipes-2.8.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/curator-recipes-2.8.0.jar</binary-origin> <binary-origin>release/modules/ext/curator-recipes-2.8.0.jar</binary-origin>
@ -421,6 +399,14 @@
<runtime-relative-path>ext/metadata-extractor-2.10.1.jar</runtime-relative-path> <runtime-relative-path>ext/metadata-extractor-2.10.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/metadata-extractor-2.10.1.jar</binary-origin> <binary-origin>release/modules/ext/metadata-extractor-2.10.1.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/apache-mime4j-core-0.8.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/apache-mime4j-core-0.8.1.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/tagsoup-1.2.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/tagsoup-1.2.1.jar</binary-origin>
</class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/tika-core-1.17.jar</runtime-relative-path> <runtime-relative-path>ext/tika-core-1.17.jar</runtime-relative-path>
<binary-origin>release/modules/ext/tika-core-1.17.jar</binary-origin> <binary-origin>release/modules/ext/tika-core-1.17.jar</binary-origin>
@ -429,45 +415,37 @@
<runtime-relative-path>ext/StixLib.jar</runtime-relative-path> <runtime-relative-path>ext/StixLib.jar</runtime-relative-path>
<binary-origin>release/modules/ext/StixLib.jar</binary-origin> <binary-origin>release/modules/ext/StixLib.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/curator-client-2.8.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/curator-client-2.8.0.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jackson-core-2.9.7.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jackson-core-2.9.7.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/cxf-rt-frontend-jaxrs-3.0.16.jar</runtime-relative-path>
<binary-origin>release/modules/ext/cxf-rt-frontend-jaxrs-3.0.16.jar</binary-origin>
</class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/pdfbox-tools-2.0.8.jar</runtime-relative-path> <runtime-relative-path>ext/pdfbox-tools-2.0.8.jar</runtime-relative-path>
<binary-origin>release/modules/ext/pdfbox-tools-2.0.8.jar</binary-origin> <binary-origin>release/modules/ext/pdfbox-tools-2.0.8.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/asm-5.0.4.jar</runtime-relative-path>
<binary-origin>release/modules/ext/asm-5.0.4.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jcl-over-slf4j-1.7.24.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jcl-over-slf4j-1.7.24.jar</binary-origin>
</class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/tika-parsers-1.17.jar</runtime-relative-path> <runtime-relative-path>ext/tika-parsers-1.17.jar</runtime-relative-path>
<binary-origin>release/modules/ext/tika-parsers-1.17.jar</binary-origin> <binary-origin>release/modules/ext/tika-parsers-1.17.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/sqlite-jdbc-3.8.11.jar</runtime-relative-path> <runtime-relative-path>ext/sqlite-jdbc-3.8.11.jar</runtime-relative-path>
<binary-origin>release\modules\ext\sqlite-jdbc-3.8.11.jar</binary-origin> <binary-origin>release/modules/ext/sqlite-jdbc-3.8.11.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/activemq-all-5.11.1.jar</runtime-relative-path> <runtime-relative-path>ext/json-simple-1.1.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/activemq-all-5.11.1.jar</binary-origin> <binary-origin>release/modules/ext/json-simple-1.1.1.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/xz-1.6.jar</runtime-relative-path> <runtime-relative-path>ext/sis-utility-0.6.jar</runtime-relative-path>
<binary-origin>release/modules/ext/xz-1.6.jar</binary-origin> <binary-origin>release/modules/ext/sis-utility-0.6.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/Rejistry-1.0-SNAPSHOT.jar</runtime-relative-path> <runtime-relative-path>ext/jhighlight-1.0.2.jar</runtime-relative-path>
<binary-origin>release/modules/ext/Rejistry-1.0-SNAPSHOT.jar</binary-origin> <binary-origin>release/modules/ext/jhighlight-1.0.2.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/dd-plist-1.20.jar</runtime-relative-path>
<binary-origin>release/modules/ext/dd-plist-1.20.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/jempbox-1.8.13.jar</runtime-relative-path> <runtime-relative-path>ext/jempbox-1.8.13.jar</runtime-relative-path>
@ -477,21 +455,9 @@
<runtime-relative-path>ext/cxf-rt-rs-client-3.0.16.jar</runtime-relative-path> <runtime-relative-path>ext/cxf-rt-rs-client-3.0.16.jar</runtime-relative-path>
<binary-origin>release/modules/ext/cxf-rt-rs-client-3.0.16.jar</binary-origin> <binary-origin>release/modules/ext/cxf-rt-rs-client-3.0.16.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/sevenzipjbinding-AllPlatforms.jar</runtime-relative-path>
<binary-origin>release/modules/ext/sevenzipjbinding-AllPlatforms.jar</binary-origin>
</class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/commons-pool2-2.4.2.jar</runtime-relative-path> <runtime-relative-path>ext/commons-pool2-2.4.2.jar</runtime-relative-path>
<binary-origin>release\modules\ext\commons-pool2-2.4.2.jar</binary-origin> <binary-origin>release/modules/ext/commons-pool2-2.4.2.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jackcess-encrypt-2.1.4.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jackcess-encrypt-2.1.4.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jsoup-1.10.3.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jsoup-1.10.3.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension> <class-path-extension>
<runtime-relative-path>ext/jdom-2.0.5-contrib.jar</runtime-relative-path> <runtime-relative-path>ext/jdom-2.0.5-contrib.jar</runtime-relative-path>
@ -513,6 +479,190 @@
<runtime-relative-path>ext/xmpcore-5.1.3.jar</runtime-relative-path> <runtime-relative-path>ext/xmpcore-5.1.3.jar</runtime-relative-path>
<binary-origin>release/modules/ext/xmpcore-5.1.3.jar</binary-origin> <binary-origin>release/modules/ext/xmpcore-5.1.3.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/zookeeper-3.4.6.jar</runtime-relative-path>
<binary-origin>release/modules/ext/zookeeper-3.4.6.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jdom-2.0.5.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jdom-2.0.5.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/cxf-rt-transports-http-3.0.16.jar</runtime-relative-path>
<binary-origin>release/modules/ext/cxf-rt-transports-http-3.0.16.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/sis-metadata-0.6.jar</runtime-relative-path>
<binary-origin>release/modules/ext/sis-metadata-0.6.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/isoparser-1.1.18.jar</runtime-relative-path>
<binary-origin>release/modules/ext/isoparser-1.1.18.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/sleuthkit-postgresql-4.6.4.jar</runtime-relative-path>
<binary-origin>release/modules/ext/sleuthkit-postgresql-4.6.4.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/vorbis-java-core-0.8.jar</runtime-relative-path>
<binary-origin>release/modules/ext/vorbis-java-core-0.8.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/commons-codec-1.6.jar</runtime-relative-path>
<binary-origin>release/modules/ext/commons-codec-1.6.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/netcdf4-4.5.5.jar</runtime-relative-path>
<binary-origin>release/modules/ext/netcdf4-4.5.5.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/slf4j-api-1.7.24.jar</runtime-relative-path>
<binary-origin>release/modules/ext/slf4j-api-1.7.24.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/java-libpst-0.8.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/java-libpst-0.8.1.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jul-to-slf4j-1.7.24.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jul-to-slf4j-1.7.24.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/gson-2.8.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/gson-2.8.1.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/poi-3.17.jar</runtime-relative-path>
<binary-origin>release/modules/ext/poi-3.17.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/poi-scratchpad-3.17.jar</runtime-relative-path>
<binary-origin>release/modules/ext/poi-scratchpad-3.17.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/sis-netcdf-0.6.jar</runtime-relative-path>
<binary-origin>release/modules/ext/sis-netcdf-0.6.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/commons-io-2.5.jar</runtime-relative-path>
<binary-origin>release/modules/ext/commons-io-2.5.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/curator-framework-2.8.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/curator-framework-2.8.0.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/bcprov-jdk15on-1.54.jar</runtime-relative-path>
<binary-origin>release/modules/ext/bcprov-jdk15on-1.54.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/fontbox-2.0.8.jar</runtime-relative-path>
<binary-origin>release/modules/ext/fontbox-2.0.8.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/commons-dbcp2-2.1.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/commons-dbcp2-2.1.1.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jgraphx-v3.8.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jgraphx-v3.8.0.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/juniversalchardet-1.0.3.jar</runtime-relative-path>
<binary-origin>release/modules/ext/juniversalchardet-1.0.3.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jython-standalone-2.7.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jython-standalone-2.7.0.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jackcess-encrypt-2.1.4.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jackcess-encrypt-2.1.4.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/cxf-core-3.0.16.jar</runtime-relative-path>
<binary-origin>release/modules/ext/cxf-core-3.0.16.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/javax.ws.rs-api-2.0.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/javax.ws.rs-api-2.0.1.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/opennlp-tools-1.8.3.jar</runtime-relative-path>
<binary-origin>release/modules/ext/opennlp-tools-1.8.3.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/junrar-0.7.jar</runtime-relative-path>
<binary-origin>release/modules/ext/junrar-0.7.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/postgresql-9.4.1211.jre7.jar</runtime-relative-path>
<binary-origin>release/modules/ext/postgresql-9.4.1211.jre7.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/poi-ooxml-3.17.jar</runtime-relative-path>
<binary-origin>release/modules/ext/poi-ooxml-3.17.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/curator-client-2.8.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/curator-client-2.8.0.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jackson-core-2.9.7.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jackson-core-2.9.7.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/cxf-rt-frontend-jaxrs-3.0.16.jar</runtime-relative-path>
<binary-origin>release/modules/ext/cxf-rt-frontend-jaxrs-3.0.16.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/grib-4.5.5.jar</runtime-relative-path>
<binary-origin>release/modules/ext/grib-4.5.5.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jackson-core-2.9.2.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jackson-core-2.9.2.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/activemq-all-5.11.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/activemq-all-5.11.1.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/xz-1.6.jar</runtime-relative-path>
<binary-origin>release/modules/ext/xz-1.6.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/Rejistry-1.0-SNAPSHOT.jar</runtime-relative-path>
<binary-origin>release/modules/ext/Rejistry-1.0-SNAPSHOT.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/dd-plist-1.20.jar</runtime-relative-path>
<binary-origin>release/modules/ext/dd-plist-1.20.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/rome-1.5.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/rome-1.5.1.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/sevenzipjbinding-AllPlatforms.jar</runtime-relative-path>
<binary-origin>release/modules/ext/sevenzipjbinding-AllPlatforms.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jmatio-1.2.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jmatio-1.2.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jsoup-1.10.3.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jsoup-1.10.3.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/vorbis-java-tika-0.8.jar</runtime-relative-path>
<binary-origin>release/modules/ext/vorbis-java-tika-0.8.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/json-1.8.jar</runtime-relative-path>
<binary-origin>release/modules/ext/json-1.8.jar</binary-origin>
</class-path-extension>
</data> </data>
</configuration> </configuration>
</project> </project>

View File

@ -0,0 +1,89 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2018 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.textextractors;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
/**
 * Text extractor for blackboard artifacts. The extracted text is one line per
 * attribute, each of the form "display name : value".
 */
class ArtifactTextExtractor extends TextExtractor {

    private final BlackboardArtifact artifact;

    /**
     * Creates an extractor for the given artifact.
     *
     * @param artifact Content to read; it is cast to BlackboardArtifact, so
     *                 any other Content type fails with a ClassCastException
     */
    public ArtifactTextExtractor(Content artifact) {
        this.artifact = (BlackboardArtifact) artifact;
    }

    /**
     * Builds the artifact text and exposes it through a reader.
     *
     * @return A reader over the concatenated attribute lines
     *
     * @throws ExtractionException If the data source or the attributes of the
     *                             artifact cannot be obtained
     */
    @Override
    public Reader getReader() throws ExtractionException {
        final Content dataSource = lookupDataSource();

        // One "name : value" line per attribute, collected into a single
        // "content" string to be indexed.
        final StringBuilder text = new StringBuilder();
        try {
            for (BlackboardAttribute attr : artifact.getAttributes()) {
                text.append(attr.getAttributeType().getDisplayName())
                        .append(" : ")
                        .append(renderValue(attr, dataSource))
                        .append(System.lineSeparator());
            }
        } catch (TskCoreException tskCoreException) {
            throw new ExtractionException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
        }

        return new InputStreamReader(
                IOUtils.toInputStream(text, StandardCharsets.UTF_8),
                StandardCharsets.UTF_8);
    }

    /**
     * Fetches the data source of the artifact.
     *
     * @return The data source, never null
     *
     * @throws ExtractionException If the lookup fails or yields null
     */
    private Content lookupDataSource() throws ExtractionException {
        final Content source;
        try {
            source = artifact.getDataSource();
        } catch (TskCoreException tskCoreException) {
            throw new ExtractionException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
        }
        if (source == null) {
            throw new ExtractionException("Datasource was null for artifact: " + artifact.toString());
        }
        return source;
    }

    /**
     * Produces the text for a single attribute value.
     *
     * DATETIME values are formatted through ContentUtils using the data
     * source; everything else uses the attribute's own display string.
     * Formatting is done here rather than in
     * BlackboardAttribute.getDisplayString() because BlackboardAttribute
     * lives in the Sleuthkit data model while the time formatting utility
     * lives in the Autopsy data model.
     *
     * @param attr       Attribute whose value is rendered
     * @param dataSource Data source handed to the time formatter
     *
     * @return The value as text
     *
     * @throws TskCoreException Declared so that any failure raised while
     *                          reading the attribute reaches the caller
     */
    private String renderValue(BlackboardAttribute attr, Content dataSource) throws TskCoreException {
        switch (attr.getValueType()) {
            case DATETIME:
                return ContentUtils.getStringTime(attr.getValueLong(), dataSource);
            default:
                return attr.getDisplayString();
        }
    }

    /**
     * Reports whether the content is supported by this extractor.
     *
     * @param file           Ignored
     * @param detectedFormat Ignored
     *
     * @return Always true
     */
    @Override
    public boolean isSupported(Content file, String detectedFormat) {
        return true;
    }
}

View File

@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.keywordsearch; package org.sleuthkit.autopsy.textextractors;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
@ -38,10 +38,11 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
/** /**
* Extracts text from HTML content. * Extracts text from HTML content.
*/ */
class HtmlTextExtractor extends ContentTextExtractor { final class HtmlTextExtractor extends TextExtractor {
static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName()); static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
private static final int MAX_SIZE = 50_000_000; //50MB private final int MAX_SIZE;
private final Content file;
static final List<String> WEB_MIME_TYPES = Arrays.asList( static final List<String> WEB_MIME_TYPES = Arrays.asList(
"application/javascript", //NON-NLS "application/javascript", //NON-NLS
@ -51,27 +52,51 @@ class HtmlTextExtractor extends ContentTextExtractor {
"text/html", //NON-NLS NON-NLS "text/html", //NON-NLS NON-NLS
"text/javascript" //NON-NLS "text/javascript" //NON-NLS
); );
static { static {
// Disable Jericho HTML Parser log messages. // Disable Jericho HTML Parser log messages.
Config.LoggerProvider = LoggerProvider.DISABLED; Config.LoggerProvider = LoggerProvider.DISABLED;
} }
@Override /**
boolean isContentTypeSpecific() { * Creates a default instance of the HtmlTextExtractor. Supported file size
return true; * is 50MB.
*/
public HtmlTextExtractor(Content file) {
//Set default to be 50 MB.
MAX_SIZE = 50_000_000;
this.file = file;
} }
/**
* Determines if this content type is supported by this extractor.
*
* @param content Content instance to be analyzed
* @param detectedFormat Mimetype of content instance
*
* @return flag indicating support
*/
@Override @Override
boolean isSupported(Content content, String detectedFormat) { public boolean isSupported(Content content, String detectedFormat) {
return detectedFormat != null return detectedFormat != null
&& WEB_MIME_TYPES.contains(detectedFormat) && WEB_MIME_TYPES.contains(detectedFormat)
&& content.getSize() <= MAX_SIZE; && content.getSize() <= MAX_SIZE;
} }
/**
* Returns a reader that will iterate over the text of an HTML document.
*
* @param content Html document source
*
* @return A reader instance containing the document source text
*
* @throws TextExtractorException
*/
@Override @Override
public Reader getReader(Content content) throws TextExtractorException { public Reader getReader() throws ExtractionException {
ReadContentInputStream stream = new ReadContentInputStream(content); //TODO JIRA-4467, there is only harm in excluding HTML documents greater
//than 50MB due to our troubled approach of extraction.
ReadContentInputStream stream = new ReadContentInputStream(file);
//Parse the stream with Jericho and put the results in a Reader //Parse the stream with Jericho and put the results in a Reader
try { try {
@ -164,17 +189,8 @@ class HtmlTextExtractor extends ContentTextExtractor {
// All done, now make it a reader // All done, now make it a reader
return new StringReader(stringBuilder.toString()); return new StringReader(stringBuilder.toString());
} catch (IOException ex) { } catch (IOException ex) {
throw new TextExtractorException("Error extracting HTML from content.", ex); logger.log(Level.WARNING, "Error extracting HTML from content.", ex);
throw new ExtractionException("Error extracting HTML from content.", ex);
} }
} }
@Override
public boolean isDisabled() {
return false;
}
@Override
public void logWarning(final String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS }
}
} }

View File

@ -1,24 +1,23 @@
/* /*
* Autopsy Forensic Browser * Autopsy Forensic Browser
* *
* Copyright 2018-2018 Basis Technology Corp. * Copyright 2018-2018 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org * Contact: carrier <at> sleuthkit <dot> org
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.keywordsearch; package org.sleuthkit.autopsy.textextractors;
import com.google.common.io.CharSource;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Iterator; import java.util.Iterator;
@ -28,37 +27,27 @@ import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException; import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.SQLiteTableReader; import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
/** /**
* Dedicated SqliteTextExtractor to solve the problems associated with Tika's * Extracts text from SQLite database files.
* Sqlite parser.
* *
* Tika problems: 1) Tika fails to open virtual tables 2) Tika fails to open * This is a dedicated solution to address the problems associated with
* tables with spaces in table name 3) Tika fails to include the table names in * Tika's sqlite parser (version 1.17), which include the following:
* output (except for the first table it parses) * 1) Virtual tables cause the parser to bail
* 2) Tables that contain spaces in their name are not extracted
* 3) Table names are not included in its output text
*/ */
class SqliteTextExtractor extends ContentTextExtractor { final class SqliteTextExtractor extends TextExtractor {
private static final String SQLITE_MIMETYPE = "application/x-sqlite3"; private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName()); private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
private final AbstractFile file;
@Override public SqliteTextExtractor(Content file) {
boolean isContentTypeSpecific() { this.file = (AbstractFile) file;
return true;
} }
@Override
public boolean isDisabled() {
return false;
}
@Override
public void logWarning(String msg, Exception exception) {
logger.log(Level.WARNING, msg, exception); //NON-NLS
}
/** /**
* Supports only the sqlite mimetypes * Supports only the sqlite mimetypes
* *
@ -68,44 +57,34 @@ class SqliteTextExtractor extends ContentTextExtractor {
* @return true if x-sqlite3 * @return true if x-sqlite3
*/ */
@Override @Override
boolean isSupported(Content file, String detectedFormat) { public boolean isSupported(Content file, String detectedFormat) {
return SQLITE_MIMETYPE.equals(detectedFormat); return SQLITE_MIMETYPE.equals(detectedFormat);
} }
/** /**
* Returns a stream that will read from a sqlite database. * Returns a reader that will iterate over the text of a sqlite database.
* *
* @param source Content file * @param source Content file
* *
* @return An InputStream that reads from a Sqlite database. * @return An InputStream that reads from a Sqlite database
* *
* @throws * @throws TextExtractorException
* org.sleuthkit.autopsy.keywordsearch.TextExtractor.TextExtractorException
*/ */
@Override @Override
public Reader getReader(Content source) throws TextExtractorException { public Reader getReader() throws ExtractionException {
//Firewall for any content that is not an AbstractFile return new SQLiteStreamReader(file);
if (!AbstractFile.class.isInstance(source)) {
try {
return CharSource.wrap("").openStream();
} catch (IOException ex) {
throw new TextExtractorException("", ex);
}
}
return new SQLiteStreamReader((AbstractFile) source);
} }
/** /**
* Produces a continuous stream of characters from a database file. To * Produces a continuous stream of characters from a database file. To
* achieve this, all table names are queues up and a SQLiteTableReader is * achieve this, all table names are queues up and a SQLiteTableReader is
* used to do the actual queries and table iteration. * used to do the actual queries and table iteration.
*/ */
public class SQLiteStreamReader extends Reader { private class SQLiteStreamReader extends Reader {
private final SQLiteTableReader reader; private final SQLiteTableReader reader;
private final AbstractFile file; private final AbstractFile file;
private Iterator<String> tableNames; private Iterator<String> tableNames;
private String currentTableName; private String currentTableName;
@ -217,9 +196,10 @@ class SqliteTextExtractor extends ContentTextExtractor {
} }
/** /**
* Reads database values into the buffer. This function is responsible for * Reads database values into the buffer. This function is responsible
* getting the next table in the queue, initiating calls to the SQLiteTableReader, * for getting the next table in the queue, initiating calls to the
* and filling in any excess bytes that are lingering from the previous call. * SQLiteTableReader, and filling in any excess bytes that are lingering
* from the previous call.
* *
* @throws IOException * @throws IOException
*/ */
@ -255,9 +235,9 @@ class SqliteTextExtractor extends ContentTextExtractor {
reader.read(currentTableName, () -> bufIndex == len); reader.read(currentTableName, () -> bufIndex == len);
} catch (SQLiteTableReaderException ex) { } catch (SQLiteTableReaderException ex) {
logger.log(Level.WARNING, String.format( logger.log(Level.WARNING, String.format(
"Error attempting to read file table: [%s]" //NON-NLS "Error attempting to read file table: [%s]" //NON-NLS
+ " for file: [%s] (id=%d).", currentTableName, //NON-NLS + " for file: [%s] (id=%d).", currentTableName, //NON-NLS
file.getName(), file.getId()), ex.getMessage()); file.getName(), file.getId()), ex.getMessage());
} }
} else { } else {
if (bufIndex == off) { if (bufIndex == off) {
@ -290,8 +270,8 @@ class SqliteTextExtractor extends ContentTextExtractor {
} }
/** /**
* Wrapper that holds the excess bytes that were left over from the previous * Wrapper that holds the excess bytes that were left over from the
* call to read(). * previous call to read().
*/ */
private class ExcessBytes { private class ExcessBytes {

View File

@ -16,19 +16,19 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.keywordsearch; package org.sleuthkit.autopsy.textextractors;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Objects;
import java.util.logging.Level; import org.openide.util.Lookup;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException; import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskException; import org.sleuthkit.datamodel.TskException;
@ -36,24 +36,25 @@ import org.sleuthkit.datamodel.TskException;
/** /**
* Extracts raw strings from content. * Extracts raw strings from content.
*/ */
class StringsTextExtractor extends ContentTextExtractor { final class StringsTextExtractor extends TextExtractor {
static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName()); private boolean extractUTF8;
private boolean extractUTF16;
/** private final Content content;
* Options for this extractor private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
*/
enum ExtractOptions {
EXTRACT_UTF16, ///< extract UTF16 text, true/false
EXTRACT_UTF8, ///< extract UTF8 text, true/false
};
private final List<SCRIPT> extractScripts = new ArrayList<>(); private final List<SCRIPT> extractScripts = new ArrayList<>();
private Map<String, String> extractOptions = new HashMap<>();
public StringsTextExtractor() { /**
* Creates a default StringsTextExtractor instance. The instance will be
* configured to run only LATIN_2 as its default extraction script and UTF-8
* as its default encoding.
*/
public StringsTextExtractor(Content content) {
//LATIN_2 is the default script //LATIN_2 is the default script
extractScripts.add(SCRIPT.LATIN_2); extractScripts.add(SCRIPT.LATIN_2);
extractUTF8 = true;
this.content = content;
} }
/** /**
@ -61,56 +62,29 @@ class StringsTextExtractor extends ContentTextExtractor {
* *
* @param extractScripts scripts to use * @param extractScripts scripts to use
*/ */
public void setScripts(List<SCRIPT> extractScripts) { public final void setScripts(List<SCRIPT> extractScripts) {
if (extractScripts == null) {
return;
}
this.extractScripts.clear(); this.extractScripts.clear();
this.extractScripts.addAll(extractScripts); this.extractScripts.addAll(extractScripts);
} }
/** /**
* Get the currently used scripts for extraction * Returns a reader that will iterate over the text of the content source.
* *
* @return scripts currently used or null if not supported * @param content Content source of any type
*/
public List<SCRIPT> getScripts() {
return new ArrayList<>(extractScripts);
}
/**
* Get current options
* *
* @return currently used, extractor specific options, or null of not * @return A reader instance that content text can be obtained from
* supported
*/
public Map<String, String> getOptions() {
return extractOptions;
}
/**
* Set extractor specific options
* *
* @param options options to use * @throws
* org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
*/ */
public void setOptions(Map<String, String> options) {
this.extractOptions = options;
}
@Override @Override
public void logWarning(final String msg, Exception ex) { public InputStreamReader getReader() {
logger.log(Level.WARNING, msg, ex); //NON-NLS }
}
@Override
public boolean isDisabled() {
boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
return extractUTF8 == false && extractUTF16 == false;
}
@Override
public InputStreamReader getReader(Content content) throws TextExtractorException {
InputStream stringStream = getInputStream(content); InputStream stringStream = getInputStream(content);
return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET); return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
} }
InputStream getInputStream(Content content) { InputStream getInputStream(Content content) {
@ -118,27 +92,55 @@ class StringsTextExtractor extends ContentTextExtractor {
if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) { if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
return new EnglishOnlyStream(content);//optimal for english, english only return new EnglishOnlyStream(content);//optimal for english, english only
} else { } else {
boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16); return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16);
} }
} }
/**
* Determines how the extraction process will proceed given the settings
* stored in this context instance.
*
* See the DefaultExtractionConfig class in the extractionconfigs package
* for available settings.
*
* @param context Lookup instance containing config classes
*/
@Override @Override
public boolean isContentTypeSpecific() { public void setExtractionSettings(Lookup context) {
return false; if (context != null) {
} DefaultExtractionConfig configInstance = context.lookup(DefaultExtractionConfig.class);
if (configInstance == null) {
@Override return;
public boolean isSupported(Content content, String detectedFormat) { }
// strings can be run on anything. if (Objects.nonNull(configInstance.getExtractUTF8())) {
return true; extractUTF8 = configInstance.getExtractUTF8();
}
if (Objects.nonNull(configInstance.getExtractUTF16())) {
extractUTF16 = configInstance.getExtractUTF16();
}
if (Objects.nonNull(configInstance.getExtractScripts())) {
setScripts(configInstance.getExtractScripts());
}
}
} }
/** /**
* Content input string stream reader/converter - given Content, *
* extract strings from it and return encoded bytes via read() * @return
*/
@Override
public boolean isEnabled() {
return extractUTF8 || extractUTF16;
}
@Override
boolean isSupported(Content file, String detectedFormat) {
throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
}
/**
* Content input string stream reader/converter - given Content, extract
* strings from it and return encoded bytes via read()
* *
* Note: the utility supports extraction of only LATIN script and UTF8, * Note: the utility supports extraction of only LATIN script and UTF8,
* UTF16LE, UTF16BE encodings and uses a brute force encoding detection - * UTF16LE, UTF16BE encodings and uses a brute force encoding detection -
@ -150,7 +152,6 @@ class StringsTextExtractor extends ContentTextExtractor {
*/ */
private static class EnglishOnlyStream extends InputStream { private static class EnglishOnlyStream extends InputStream {
private static final Logger logger = Logger.getLogger(EnglishOnlyStream.class.getName());
private static final String NLS = Character.toString((char) 10); //new line private static final String NLS = Character.toString((char) 10); //new line
private static final int READ_BUF_SIZE = 65536; private static final int READ_BUF_SIZE = 65536;
private static final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string private static final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
@ -244,12 +245,7 @@ class StringsTextExtractor extends ContentTextExtractor {
} }
//get char from cur read buf //get char from cur read buf
char c = (char) curReadBuf[readBufOffset++]; char c = (char) curReadBuf[readBufOffset++];
if (c == 0 && singleConsecZero == false) { singleConsecZero = c == 0 && singleConsecZero == false; //preserve the current sequence if max consec. 1 zero char
//preserve the current sequence if max consec. 1 zero char
singleConsecZero = true;
} else {
singleConsecZero = false;
}
if (StringExtract.isPrintableAscii(c)) { if (StringExtract.isPrintableAscii(c)) {
tempString.append(c); tempString.append(c);
++tempStringLen; ++tempStringLen;
@ -328,7 +324,7 @@ class StringsTextExtractor extends ContentTextExtractor {
private int copyToReturn(byte[] b, int off, long len) { private int copyToReturn(byte[] b, int off, long len) {
final String curStringS = curString.toString(); final String curStringS = curString.toString();
//logger.log(Level.INFO, curStringS); //logger.log(Level.INFO, curStringS);
byte[] stringBytes = curStringS.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET); byte[] stringBytes = curStringS.getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len)); System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
//logger.log(Level.INFO, curStringS); //logger.log(Level.INFO, curStringS);
//copied all string, reset //copied all string, reset
@ -370,7 +366,6 @@ class StringsTextExtractor extends ContentTextExtractor {
*/ */
private static class InternationalStream extends InputStream { private static class InternationalStream extends InputStream {
private static final Logger logger = Logger.getLogger(InternationalStream.class.getName());
private static final int FILE_BUF_SIZE = 1024 * 1024; private static final int FILE_BUF_SIZE = 1024 * 1024;
private final Content content; private final Content content;
private final byte[] oneCharBuf = new byte[1]; private final byte[] oneCharBuf = new byte[1];
@ -499,7 +494,7 @@ class StringsTextExtractor extends ContentTextExtractor {
*/ */
private void convert(int numBytes) { private void convert(int numBytes) {
lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0); lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
convertBuff = lastExtractResult.getText().getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET); convertBuff = lastExtractResult.getText().getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
//reset tracking vars //reset tracking vars
if (lastExtractResult.getNumBytes() == 0) { if (lastExtractResult.getNumBytes() == 0) {
bytesInConvertBuff = 0; bytesInConvertBuff = 0;

View File

@ -0,0 +1,103 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-18 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.textextractors;
import java.io.Reader;
import org.openide.util.Lookup;
import org.sleuthkit.datamodel.Content;
/**
 * Extracts the text out of {@link org.sleuthkit.datamodel.Content} instances
 * and exposes them as a {@link java.io.Reader}. Concrete implementations can be
 * obtained from
 * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content)}
 * or
 * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content, org.openide.util.Lookup)}.
 *
 * @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
 */
public abstract class TextExtractor {

    /**
     * Determines if the file content is supported by the extractor.
     *
     * @param file           to test if its content should be supported
     * @param detectedFormat mime-type with detected format (such as text/plain)
     *                       or null if not detected
     *
     * @return true if the file content is supported, false otherwise
     */
    abstract boolean isSupported(Content file, String detectedFormat);

    /**
     * Determines if the TextExtractor instance is enabled to read content.
     *
     * @return true if the extractor is enabled; this base implementation
     *         always returns true
     */
    boolean isEnabled() {
        return true;
    }

    /**
     * Get a {@link java.io.Reader} that will iterate over the text extracted
     * from the {@link org.sleuthkit.datamodel.Content} passed into
     * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory}.
     *
     * @return {@link java.io.Reader} that contains the text of the underlying
     *         {@link org.sleuthkit.datamodel.Content}
     *
     * @throws ExtractionException If the text could not be extracted
     *
     * @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
     */
    public abstract Reader getReader() throws ExtractionException;

    /**
     * Determines how the extraction process will proceed given the settings
     * stored in the context instance.
     *
     * @param context Instance containing file config classes
     */
    void setExtractionSettings(Lookup context) {
        //no-op by default
    }

    /**
     * Exception encountered during
     * {@link org.sleuthkit.autopsy.textextractors.TextExtractor#getReader()}.
     * This indicates that there was an internal parsing error that occurred
     * during the reading of Content text.
     *
     * Declared as a static nested class so that an exception instance does
     * not hold a hidden reference to the extractor that raised it.
     */
    public static class ExtractionException extends Exception {

        private static final long serialVersionUID = 1L;

        /**
         * @param msg Description of the failure
         * @param ex  Underlying cause
         */
        public ExtractionException(String msg, Throwable ex) {
            super(msg, ex);
        }

        /**
         * @param ex Underlying cause
         */
        public ExtractionException(Throwable ex) {
            super(ex);
        }

        /**
         * @param msg Description of the failure
         */
        public ExtractionException(String msg) {
            super(msg);
        }
    }
}

View File

@ -0,0 +1,157 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2018-2018 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.textextractors;
import java.util.Arrays;
import java.util.List;
import org.openide.util.Lookup;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.Report;
/**
* Factory for creating
* {@link org.sleuthkit.autopsy.textextractors.TextExtractor}'s given a
* {@link org.sleuthkit.datamodel.Content} instance
*
* See {@link org.sleuthkit.autopsy.textextractors.extractionconfigs} for
* available {@link org.sleuthkit.autopsy.textextractors.TextExtractor}
* configuration options.
*
* @see org.openide.util.Lookup
*/
public class TextExtractorFactory {
/**
* Auto detects the correct
* {@link org.sleuthkit.autopsy.textextractors.TextExtractor} given the
* {@link org.sleuthkit.datamodel.Content}.
*
* See {@link org.sleuthkit.autopsy.textextractors.extractionconfigs} for
* available {@link org.sleuthkit.autopsy.textextractors.TextExtractor}
* configuration options.
*
* @param content Content source that will be read from
* @param context Contains extraction configurations for certain file types
*
* @return A TextExtractor that supports the given content. File text can be
* obtained from
* {@link org.sleuthkit.autopsy.textextractors.TextExtractor#getReader()}.
*
* @throws NoTextExtractorFound Encountered when there is no TextExtractor
* was found for the given content type. Use {@link
* TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content,
* org.openide.util.Lookup)}
*
* @see org.openide.util.Lookup
*/
public static TextExtractor getExtractor(Content content,
Lookup context) throws NoTextExtractorFound {
if (content instanceof AbstractFile) {
String mimeType = ((AbstractFile) content).getMIMEType();
List<TextExtractor> extractors = Arrays.asList(
new HtmlTextExtractor(content),
new SqliteTextExtractor(content),
new TikaTextExtractor(content));
for (TextExtractor extractor : extractors) {
extractor.setExtractionSettings(context);
if (extractor.isEnabled() && extractor.isSupported(content, mimeType)) {
return extractor;
}
}
} else if (content instanceof BlackboardArtifact) {
TextExtractor artifactExtractor = new ArtifactTextExtractor((BlackboardArtifact) content);
artifactExtractor.setExtractionSettings(context);
return artifactExtractor;
} else if (content instanceof Report) {
TextExtractor reportExtractor = new TikaTextExtractor(content);
reportExtractor.setExtractionSettings(context);
return reportExtractor;
}
throw new NoTextExtractorFound(
String.format("Could not find a suitable extractor for "
+ "content with name [%s] and id=[%d]. Try using the default, "
+ "non content specific extractor as an alternative.",
content.getName(), content.getId())
);
}
/**
 * Auto detects the correct
 * {@link org.sleuthkit.autopsy.textextractors.TextExtractor} given the
 * {@link org.sleuthkit.datamodel.Content}.
 *
 * Convenience overload that delegates to the two-argument
 * {@code getExtractor} with a {@code null} context.
 *
 * @param content Content instance that will be read from
 *
 * @return A TextExtractor that supports the given content. File text can be
 * obtained from {@link TextExtractor#getReader()}.
 *
 * @throws NoTextExtractorFound Thrown when no TextExtractor was found for
 * the given content type. Use {@link
 * TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content,
 * org.openide.util.Lookup)} as a fallback.
 */
public static TextExtractor getExtractor(Content content)
throws NoTextExtractorFound {
// A null context is passed through to the selected extractor's settings call.
return getExtractor(content, null);
}
/**
 * Returns the default extractor that can be run on any content type. This
 * extractor should be used as a backup in the event that no extractor was
 * found using
 * {@link TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content, org.openide.util.Lookup)}
 * or
 * {@link TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content)}.
 *
 * @param content Content source to read from
 * @param context Contains extraction configurations for certain file types
 *
 * @return A strings-based extractor with the given context applied. File
 *         text can be obtained from {@link TextExtractor#getReader()}.
 *
 * @see org.openide.util.Lookup
 */
public static TextExtractor getDefaultExtractor(Content content, Lookup context) {
    final TextExtractor fallbackExtractor = new StringsTextExtractor(content);
    fallbackExtractor.setExtractionSettings(context);
    return fallbackExtractor;
}
/**
 * System level exception for handling content types that have no specific
 * strategy defined for extracting their text.
 *
 * @see
 * org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content)
 * @see
 * org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content,
 * org.openide.util.Lookup)
 */
public static class NoTextExtractorFound extends Exception {
/**
 * Creates the exception with a descriptive message.
 *
 * @param msg Explanation of why no extractor could be selected
 */
public NoTextExtractorFound(String msg) {
super(msg);
}
/**
 * Creates the exception wrapping an underlying cause.
 *
 * @param ex The throwable recorded as this exception's cause
 */
public NoTextExtractorFound(Throwable ex) {
super(ex);
}
}
}

View File

@ -16,15 +16,19 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.keywordsearch; package org.sleuthkit.autopsy.textextractors;
import com.google.common.collect.ImmutableList;
import com.google.common.io.CharSource; import com.google.common.io.CharSource;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.PushbackReader; import java.io.PushbackReader;
import java.io.Reader; import java.io.Reader;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.concurrent.Future; import java.util.concurrent.Future;
@ -33,6 +37,7 @@ import java.util.concurrent.TimeoutException;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.io.FilenameUtils;
import org.apache.tika.Tika; import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.AutoDetectParser;
@ -44,26 +49,78 @@ import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.parser.pdf.PDFParserConfig;
import org.openide.util.NbBundle; import org.openide.util.NbBundle;
import org.openide.modules.InstalledFileLocator; import org.openide.modules.InstalledFileLocator;
import org.sleuthkit.autopsy.coreutils.Logger; import org.openide.util.Lookup;
import org.sleuthkit.autopsy.coreutils.PlatformUtil; import org.sleuthkit.autopsy.coreutils.PlatformUtil;
import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ReadContentInputStream; import org.sleuthkit.datamodel.ReadContentInputStream;
/** /**
* Extracts text from Tika supported content. Protects against Tika * Extracts text from Tika supported content. Protects against Tika parser hangs
* parser hangs (for unexpected/corrupt content) using a timeout mechanism. * (for unexpected/corrupt content) using a timeout mechanism.
*/ */
class TikaTextExtractor extends ContentTextExtractor { final class TikaTextExtractor extends TextExtractor {
//Mimetype groups to assist extractor implementations in ignoring binary and
//archive files.
private static final List<String> BINARY_MIME_TYPES
= ImmutableList.of(
//ignore binary blob data, for which string extraction will be used
"application/octet-stream", //NON-NLS
"application/x-msdownload"); //NON-NLS
/**
* generally text extractors should ignore archives and let unpacking
* modules take care of them
*/
private static final List<String> ARCHIVE_MIME_TYPES
= ImmutableList.of(
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
"application/x-7z-compressed", //NON-NLS
"application/x-ace-compressed", //NON-NLS
"application/x-alz-compressed", //NON-NLS
"application/x-arj", //NON-NLS
"application/vnd.ms-cab-compressed", //NON-NLS
"application/x-cfs-compressed", //NON-NLS
"application/x-dgc-compressed", //NON-NLS
"application/x-apple-diskimage", //NON-NLS
"application/x-gca-compressed", //NON-NLS
"application/x-dar", //NON-NLS
"application/x-lzx", //NON-NLS
"application/x-lzh", //NON-NLS
"application/x-rar-compressed", //NON-NLS
"application/x-stuffit", //NON-NLS
"application/x-stuffitx", //NON-NLS
"application/x-gtar", //NON-NLS
"application/x-archive", //NON-NLS
"application/x-executable", //NON-NLS
"application/x-gzip", //NON-NLS
"application/zip", //NON-NLS
"application/x-zoo", //NON-NLS
"application/x-cpio", //NON-NLS
"application/x-shar", //NON-NLS
"application/x-tar", //NON-NLS
"application/x-bzip", //NON-NLS
"application/x-bzip2", //NON-NLS
"application/x-lzip", //NON-NLS
"application/x-lzma", //NON-NLS
"application/x-lzop", //NON-NLS
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor(); private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
private static final String SQLITE_MIMETYPE = "application/x-sqlite3"; private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
private final AutoDetectParser parser = new AutoDetectParser(); private final AutoDetectParser parser = new AutoDetectParser();
private final Content content;
private boolean tesseractOCREnabled;
private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
private static final File TESSERACT_PATH = locateTesseractExecutable(); private static final File TESSERACT_PATH = locateTesseractExecutable();
private static final String LANGUAGE_PACKS = getLanguagePacks();
private static final List<String> TIKA_SUPPORTED_TYPES private static final List<String> TIKA_SUPPORTED_TYPES
= new Tika().getParser().getSupportedTypes(new ParseContext()) = new Tika().getParser().getSupportedTypes(new ParseContext())
@ -71,13 +128,23 @@ class TikaTextExtractor extends ContentTextExtractor {
.map(mt -> mt.getType() + "/" + mt.getSubtype()) .map(mt -> mt.getType() + "/" + mt.getSubtype())
.collect(Collectors.toList()); .collect(Collectors.toList());
@Override public TikaTextExtractor(Content content) {
public void logWarning(final String msg, Exception ex) { this.content = content;
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
} }
/**
* Returns a reader that will iterate over the text extracted from Apache
* Tika.
*
* The content that is read is the instance supplied to the constructor;
* this method takes no arguments.
*
* @return Reader that contains Apache Tika extracted text
*
* @throws ExtractionException If Tika times out, fails while parsing the
* content, or returns an empty reader
*/
@Override @Override
public Reader getReader(Content content) throws TextExtractorException { public Reader getReader() throws ExtractionException {
ReadContentInputStream stream = new ReadContentInputStream(content); ReadContentInputStream stream = new ReadContentInputStream(content);
Metadata metadata = new Metadata(); Metadata metadata = new Metadata();
@ -90,28 +157,28 @@ class TikaTextExtractor extends ContentTextExtractor {
officeParserConfig.setUseSAXPptxExtractor(true); officeParserConfig.setUseSAXPptxExtractor(true);
officeParserConfig.setUseSAXDocxExtractor(true); officeParserConfig.setUseSAXDocxExtractor(true);
parseContext.set(OfficeParserConfig.class, officeParserConfig); parseContext.set(OfficeParserConfig.class, officeParserConfig);
// configure OCR if it is enabled in KWS settings and installed on the machine // configure OCR if it is enabled in KWS settings and installed on the machine
if (TESSERACT_PATH != null && KeywordSearchSettings.getOcrOption() && PlatformUtil.isWindowsOS() == true) { if (TESSERACT_PATH != null && tesseractOCREnabled && PlatformUtil.isWindowsOS() == true) {
// configure PDFParser. // configure PDFParser.
PDFParserConfig pdfConfig = new PDFParserConfig(); PDFParserConfig pdfConfig = new PDFParserConfig();
// Extracting the inline images and letting Tesseract run on each inline image. // Extracting the inline images and letting Tesseract run on each inline image.
// https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29 // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
// https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
pdfConfig.setExtractInlineImages(true); pdfConfig.setExtractInlineImages(true);
// Multiple pages within a PDF file might refer to the same underlying image. // Multiple pages within a PDF file might refer to the same underlying image.
pdfConfig.setExtractUniqueInlineImagesOnly(true); pdfConfig.setExtractUniqueInlineImagesOnly(true);
parseContext.set(PDFParserConfig.class, pdfConfig); parseContext.set(PDFParserConfig.class, pdfConfig);
// Configure Tesseract parser to perform OCR // Configure Tesseract parser to perform OCR
TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
String tesseractFolder = TESSERACT_PATH.getParent(); String tesseractFolder = TESSERACT_PATH.getParent();
ocrConfig.setTesseractPath(tesseractFolder); ocrConfig.setTesseractPath(tesseractFolder);
// Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata". // Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata".
// If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them // If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them
ocrConfig.setLanguage("eng"); ocrConfig.setLanguage(LANGUAGE_PACKS);
parseContext.set(TesseractOCRConfig.class, ocrConfig); parseContext.set(TesseractOCRConfig.class, ocrConfig);
} }
@ -124,7 +191,7 @@ class TikaTextExtractor extends ContentTextExtractor {
PushbackReader pushbackReader = new PushbackReader(tikaReader); PushbackReader pushbackReader = new PushbackReader(tikaReader);
int read = pushbackReader.read(); int read = pushbackReader.read();
if (read == -1) { if (read == -1) {
throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + content); throw new ExtractionException("Unable to extract text: Tika returned empty reader for " + content);
} }
pushbackReader.unread(read); pushbackReader.unread(read);
@ -133,15 +200,13 @@ class TikaTextExtractor extends ContentTextExtractor {
return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream(); return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
} catch (TimeoutException te) { } catch (TimeoutException te) {
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName()); final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
logWarning(msg, te); throw new ExtractionException(msg, te);
throw new TextExtractorException(msg, te); } catch (ExtractionException ex) {
} catch (TextExtractorException ex) {
throw ex; throw ex;
} catch (Exception ex) { } catch (Exception ex) {
KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName()); final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
logWarning(msg, ex); throw new ExtractionException(msg, ex);
throw new TextExtractorException(msg, ex);
} finally { } finally {
future.cancel(true); future.cancel(true);
} }
@ -187,16 +252,19 @@ class TikaTextExtractor extends ContentTextExtractor {
)); ));
} }
@Override /**
public boolean isContentTypeSpecific() { * Determines if Tika is supported for this content type and mimetype.
return true; *
} * @param content Source content to read
* @param detectedFormat Mimetype of content
*
* @return Flag indicating support for reading content type
*/
@Override @Override
public boolean isSupported(Content content, String detectedFormat) { public boolean isSupported(Content content, String detectedFormat) {
if (detectedFormat == null if (detectedFormat == null
|| ContentTextExtractor.BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used) || BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
|| ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat) || ARCHIVE_MIME_TYPES.contains(detectedFormat)
|| (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|| detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS || detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
) { ) {
@ -205,9 +273,34 @@ class TikaTextExtractor extends ContentTextExtractor {
return TIKA_SUPPORTED_TYPES.contains(detectedFormat); return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
} }
@Override /**
public boolean isDisabled() { * Retrieves all of the installed language packs from their designated
return false; * directory location to be used to configure Tesseract OCR.
*
* @return String of all language packs available for Tesseract to use
*/
private static String getLanguagePacks() {
File languagePackRootDir = new File(TESSERACT_PATH.getParent(), "tessdata");
//Acceptable extensions for Tesseract-OCR version 3.05 language packs.
//All extensions other than traineddata are associated with cube files that
//have been made obsolete since version 4.0.
List<String> acceptableExtensions = Arrays.asList("traineddata", "params",
"lm", "fold", "bigrams", "nn", "word-freq", "size",
"user-patterns", "user-words");
//Pull out only unique languagePacks
HashSet<String> languagePacks = new HashSet<>();
if (languagePackRootDir.exists()) {
for (File languagePack : languagePackRootDir.listFiles()) {
if (languagePack.isDirectory() || !acceptableExtensions.contains(
FilenameUtils.getExtension(languagePack.getName()))) {
continue;
}
String threeLetterPackageName = languagePack.getName().substring(0, 3);
//Ignore the eng language pack if accidentally added
languagePacks.add(threeLetterPackageName);
}
}
return String.join("+", languagePacks);
} }
/** /**
@ -233,6 +326,28 @@ class TikaTextExtractor extends ContentTextExtractor {
} }
/**
 * Applies the settings found in the given context to this extractor.
 *
 * Only an ImageFileExtractionConfig instance is consulted; when the context
 * is null or holds no such instance, the current settings are left as they
 * are. See the ImageFileExtractionConfig class in the extractionconfigs
 * package for available settings.
 *
 * @param context Instance containing config classes
 */
@Override
public void setExtractionSettings(Lookup context) {
    if (context == null) {
        return;
    }
    final ImageFileExtractionConfig ocrSettings = context.lookup(ImageFileExtractionConfig.class);
    if (ocrSettings != null && Objects.nonNull(ocrSettings.getOCREnabled())) {
        this.tesseractOCREnabled = ocrSettings.getOCREnabled();
    }
}
/** /**
* An implementation of CharSource that just wraps an existing reader and * An implementation of CharSource that just wraps an existing reader and
* returns it in openStream(). * returns it in openStream().

View File

@ -0,0 +1,100 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2018-2018 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.textextractors.extractionconfigs;
import java.util.List;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
/**
 * Allows for configuration of the
 * {@link org.sleuthkit.autopsy.textextractors.TextExtractor} obtained from
 * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content, org.openide.util.Lookup)}.
 *
 * The default extractor will read strings from the Content instance. This class
 * allows for the configuration of the encodings and language scripts to use
 * during extraction. Every setting starts out as {@code null} until its setter
 * has been called.
 *
 * @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
 * @see
 * org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT
 * @see org.openide.util.Lookup
 */
public class DefaultExtractionConfig {

    // Boxed so that "never configured" (null) stays distinguishable from an
    // explicit true/false.
    private Boolean extractUTF8;
    private Boolean extractUTF16;
    private List<SCRIPT> extractScripts;

    /**
     * Turns extraction with UTF-8 encoding on or off.
     *
     * @param enabled Flag indicating if UTF-8 should be turned on
     */
    public void setExtractUTF8(boolean enabled) {
        extractUTF8 = enabled;
    }

    /**
     * Reports whether extraction with UTF-8 encoding was requested.
     *
     * @return The flag last passed to the setter, or null if it was never set
     */
    public Boolean getExtractUTF8() {
        return this.extractUTF8;
    }

    /**
     * Turns extraction with UTF-16 encoding on or off.
     *
     * @param enabled Flag indicating if UTF-16 should be turned on
     */
    public void setExtractUTF16(boolean enabled) {
        extractUTF16 = enabled;
    }

    /**
     * Reports whether extraction with UTF-16 encoding was requested.
     *
     * @return The flag last passed to the setter, or null if it was never set
     */
    public Boolean getExtractUTF16() {
        return this.extractUTF16;
    }

    /**
     * Sets the language scripts that will be used during this extraction. See
     * {@link org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT}
     * for more information about available scripts. The list is stored by
     * reference, not copied.
     *
     * @param scripts Desired set of scripts to be used during extraction
     */
    public void setExtractScripts(List<SCRIPT> scripts) {
        extractScripts = scripts;
    }

    /**
     * Gets the desired set of scripts to be used during extraction.
     *
     * @return The list last passed to the setter, or null if it was never set
     */
    public List<SCRIPT> getExtractScripts() {
        return extractScripts;
    }
}

View File

@ -0,0 +1,54 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2018-2018 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.textextractors.extractionconfigs;
/**
 * Allows for configuration of OCR on image files.
 * {@link org.sleuthkit.autopsy.textextractors.TextExtractor}'s that use
 * ImageFileExtractionConfig can be obtained through
 * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content, org.openide.util.Lookup)}
 * by placing an instance of this class in the supplied Lookup.
 *
 * @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
 * @see org.openide.util.Lookup
 */
public class ImageFileExtractionConfig {

    // Primitive on purpose: the getter returns a primitive boolean, so a boxed
    // field that was never assigned would throw a NullPointerException on
    // unboxing. A primitive field defaults to false, which is the documented
    // "off by default" behavior.
    private boolean ocrEnabled;

    /**
     * Enables OCR to be run on the text extractor responsible for handling
     * image files.
     *
     * @param enabled Flag indicating if OCR is enabled.
     */
    public void setOCREnabled(boolean enabled) {
        this.ocrEnabled = enabled;
    }

    /**
     * Gets the OCR flag that has been set. By default this flag is turned off.
     *
     * @return Flag indicating if OCR is enabled; false if the setter was never
     *         called.
     */
    public boolean getOCREnabled() {
        return this.ocrEnabled;
    }
}

View File

@ -19,7 +19,7 @@
package org.sleuthkit.autopsy.texttranslation; package org.sleuthkit.autopsy.texttranslation;
/** /**
* Provides a system exception for the Text Translation errors * Provides a system exception for Text Translation errors
*/ */
public class TranslationException extends Exception { public class TranslationException extends Exception {

View File

@ -1,150 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2018 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.logging.Level;
import org.apache.commons.io.IOUtils;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;
/**
 * Extracts text from artifacts by concatenating the values of all of the
 * artifact's attributes.
 */
class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {

    static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());

    /**
     * Looks up the Content used as the data source for the given artifact.
     *
     * If the artifact's object id resolves to a file, that file's data source
     * is returned; otherwise the content with that object id is returned
     * directly.
     *
     * @param artifact Artifact whose source is wanted
     *
     * @return The resolved Content, or null if no case is open, the case has
     *         no database, or the lookup found nothing.
     *
     * @throws TskCoreException if there is a problem accessing the case db.
     */
    static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
        final Case openCase;
        try {
            openCase = Case.getCurrentCaseThrows();
        } catch (NoCurrentCaseException ignored) {
            // No case is currently open, so there is nothing to look up.
            return null;
        }

        final SleuthkitCase caseDb = openCase.getSleuthkitCase();
        if (caseDb == null) {
            return null;
        }

        final AbstractFile sourceFile = caseDb.getAbstractFileById(artifact.getObjectID());
        // Either lookup may yield null; that is handed back to the caller as is.
        return (sourceFile != null)
                ? sourceFile.getDataSource()
                : caseDb.getContentById(artifact.getObjectID());
    }

    @Override
    public boolean isDisabled() {
        return false;
    }

    @Override
    public void logWarning(final String msg, Exception ex) {
        logger.log(Level.WARNING, msg, ex); //NON-NLS
    }

    /**
     * Renders every attribute of the artifact as a "name : value" line and
     * exposes the result as a UTF-8 stream.
     *
     * @param artifact Artifact to render
     *
     * @return Stream over the concatenated attribute text
     *
     * @throws TextExtractorException if the data source or the attributes
     *                                cannot be obtained
     */
    private InputStream getInputStream(BlackboardArtifact artifact) throws TextExtractorException {
        final Content dataSource;
        try {
            dataSource = getDataSource(artifact);
        } catch (TskCoreException tskCoreException) {
            throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
        }
        if (dataSource == null) {
            throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString());
        }

        // All attribute values are concatenated into one "content" string to
        // be indexed.
        final StringBuilder indexableText = new StringBuilder();
        try {
            for (BlackboardAttribute attribute : artifact.getAttributes()) {
                indexableText.append(attribute.getAttributeType().getDisplayName()).append(" : ");
                // Datetime values are formatted here rather than through
                // getDisplayString() because the formatting helper lives in
                // ContentUtils and takes the data source as an argument.
                switch (attribute.getValueType()) {
                    case DATETIME:
                        indexableText.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
                        break;
                    default:
                        indexableText.append(attribute.getDisplayString());
                }
                indexableText.append(System.lineSeparator());
            }
        } catch (TskCoreException tskCoreException) {
            throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
        }

        return IOUtils.toInputStream(indexableText, StandardCharsets.UTF_8);
    }

    @Override
    public Reader getReader(BlackboardArtifact source) throws TextExtractorException {
        final InputStream attributeStream = getInputStream(source);
        return new InputStreamReader(attributeStream, StandardCharsets.UTF_8);
    }

    @Override
    public long getID(BlackboardArtifact source) {
        return source.getArtifactID();
    }

    @Override
    public String getName(BlackboardArtifact source) {
        return source.getDisplayName() + "_" + source.getArtifactID();
    }
}

View File

@ -1,110 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2018 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;
import org.sleuthkit.datamodel.Content;
/**
 * Common base for text extractors that operate on Content: holds shared lists
 * of MIME types that extractors may want to skip, declares the support
 * checks, and supplies id/name lookups taken directly from the Content.
 */
abstract class ContentTextExtractor implements TextExtractor<Content> {
/**
 * MIME types of unstructured binary data.
 */
static final List<String> BINARY_MIME_TYPES
= Arrays.asList(
//ignore binary blob data, for which string extraction will be used
"application/octet-stream", //NON-NLS
"application/x-msdownload"); //NON-NLS
/**
 * Generally text extractors should ignore archives and let unpacking
 * modules take care of them.
 */
static final List<String> ARCHIVE_MIME_TYPES
= Arrays.asList(
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
"application/x-7z-compressed", //NON-NLS
"application/x-ace-compressed", //NON-NLS
"application/x-alz-compressed", //NON-NLS
"application/x-arj", //NON-NLS
"application/vnd.ms-cab-compressed", //NON-NLS
"application/x-cfs-compressed", //NON-NLS
"application/x-dgc-compressed", //NON-NLS
"application/x-apple-diskimage", //NON-NLS
"application/x-gca-compressed", //NON-NLS
"application/x-dar", //NON-NLS
"application/x-lzx", //NON-NLS
"application/x-lzh", //NON-NLS
"application/x-rar-compressed", //NON-NLS
"application/x-stuffit", //NON-NLS
"application/x-stuffitx", //NON-NLS
"application/x-gtar", //NON-NLS
"application/x-archive", //NON-NLS
"application/x-executable", //NON-NLS
"application/x-gzip", //NON-NLS
"application/zip", //NON-NLS
"application/x-zoo", //NON-NLS
"application/x-cpio", //NON-NLS
"application/x-shar", //NON-NLS
"application/x-tar", //NON-NLS
"application/x-bzip", //NON-NLS
"application/x-bzip2", //NON-NLS
"application/x-lzip", //NON-NLS
"application/x-lzma", //NON-NLS
"application/x-lzop", //NON-NLS
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
/**
 * Determines whether the extractor works only for specific content types
 * (those accepted by isSupported()) or whether it is a generic content
 * extractor (such as a string extractor).
 *
 * @return true if the extractor is content type specific, false if it is
 * generic
 */
abstract boolean isContentTypeSpecific();
/**
 * Determines if the file content is supported by the extractor if
 * isContentTypeSpecific() returns true.
 *
 * @param file content to test for support
 * @param detectedFormat mime-type with detected format (such as text/plain)
 * or null if not detected
 *
 * @return true if the file content is supported, false otherwise
 */
abstract boolean isSupported(Content file, String detectedFormat);
/**
 * Gets a reader over the text extracted from the given content.
 *
 * @param source content to extract text from
 *
 * @return reader over the extracted text
 *
 * @throws TextExtractorException declared for implementations to signal
 * extraction failures
 */
@Override
public abstract Reader getReader(Content source) throws TextExtractorException;
/**
 * Gets the id of the given content.
 *
 * @param source content to identify
 *
 * @return the value of source.getId()
 */
@Override
public long getID(Content source) {
return source.getId();
}
/**
 * Gets the name of the given content.
 *
 * @param source content to name
 *
 * @return the value of source.getName()
 */
@Override
public String getName(Content source) {
return source.getName();
}
}

View File

@ -19,6 +19,7 @@
package org.sleuthkit.autopsy.keywordsearch; package org.sleuthkit.autopsy.keywordsearch;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.Reader;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.logging.Level; import java.util.logging.Level;
@ -58,7 +59,6 @@ class Ingester {
private final Server solrServer = KeywordSearch.getServer(); private final Server solrServer = KeywordSearch.getServer();
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor(); private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
private static Ingester instance; private static Ingester instance;
private static final int SINGLE_READ_CHARS = 512;
private Ingester() { private Ingester() {
} }
@ -106,8 +106,8 @@ class Ingester {
* @throws IngesterException if there was an error processing a specific * @throws IngesterException if there was an error processing a specific
* artifact, but the Solr server is probably fine. * artifact, but the Solr server is probably fine.
*/ */
void indexMetaDataOnly(BlackboardArtifact artifact) throws IngesterException { void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
indexChunk("", new ArtifactTextExtractor().getName(artifact), getContentFields(artifact)); indexChunk("", sourceName, getContentFields(artifact));
} }
/** /**
@ -142,23 +142,12 @@ class Ingester {
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/ */
// TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException { < T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
final long sourceID = extractor.getID(source);
final String sourceName = extractor.getName(source);
int numChunks = 0; //unknown until chunking is done int numChunks = 0; //unknown until chunking is done
if (extractor.isDisabled()) {
/*
* some Extractors, notable the strings extractor, have options
* which can be configured such that no extraction should be done
*/
return true;
}
Map<String, String> fields = getContentFields(source); Map<String, String> fields = getContentFields(source);
//Get a reader for the content of the given source //Get a reader for the content of the given source
try (BufferedReader reader = new BufferedReader(extractor.getReader(source));) { try (BufferedReader reader = new BufferedReader(sourceReader)) {
Chunker chunker = new Chunker(reader); Chunker chunker = new Chunker(reader);
for (Chunk chunk : chunker) { for (Chunk chunk : chunker) {
if (context != null && context.fileIngestIsCancelled()) { if (context != null && context.fileIngestIsCancelled()) {
@ -173,18 +162,18 @@ class Ingester {
indexChunk(chunk.toString(), sourceName, fields); indexChunk(chunk.toString(), sourceName, fields);
numChunks++; numChunks++;
} catch (Ingester.IngesterException ingEx) { } catch (Ingester.IngesterException ingEx) {
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
throw ingEx; //need to rethrow to signal error and move on throw ingEx; //need to rethrow to signal error and move on
} }
} }
if (chunker.hasException()) { if (chunker.hasException()) {
extractor.logWarning("Error chunking content from " + sourceID + ": " + sourceName, chunker.getException()); logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
return false; return false;
} }
} catch (Exception ex) { } catch (Exception ex) {
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false; return false;
} finally { } finally {
if (context != null && context.fileIngestIsCancelled()) { if (context != null && context.fileIngestIsCancelled()) {
@ -371,7 +360,7 @@ class Ingester {
Map<String, String> params = new HashMap<>(); Map<String, String> params = new HashMap<>();
params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID())); params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
try { try {
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(ArtifactTextExtractor.getDataSource(artifact).getId())); params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(artifact.getDataSource().getId()));
} catch (TskCoreException ex) { } catch (TskCoreException ex) {
logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1)); params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));

View File

@ -35,6 +35,7 @@ import org.sleuthkit.autopsy.coreutils.PlatformUtil;
import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestManager; import org.sleuthkit.autopsy.ingest.IngestManager;
import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.StringsExtractOptions;
/** /**
* Child panel of the global settings panel (Languages tab). * Child panel of the global settings panel (Languages tab).
@ -45,7 +46,7 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
private final Map<String, StringExtract.StringExtractUnicodeTable.SCRIPT> scripts = new HashMap<>(); private final Map<String, StringExtract.StringExtractUnicodeTable.SCRIPT> scripts = new HashMap<>();
private ActionListener updateLanguagesAction; private ActionListener updateLanguagesAction;
private List<SCRIPT> toUpdate; private List<SCRIPT> toUpdate;
KeywordSearchGlobalLanguageSettingsPanel() { KeywordSearchGlobalLanguageSettingsPanel() {
initComponents(); initComponents();
customizeComponents(); customizeComponents();
@ -125,12 +126,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
private void reloadScriptsCheckBoxes() { private void reloadScriptsCheckBoxes() {
boolean utf16 boolean utf16
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString())); = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString()));
enableUTF16Checkbox.setSelected(utf16); enableUTF16Checkbox.setSelected(utf16);
boolean utf8 boolean utf8
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())); = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString()));
enableUTF8Checkbox.setSelected(utf8); enableUTF8Checkbox.setSelected(utf8);
boolean ocr = KeywordSearchSettings.getOcrOption(); boolean ocr = KeywordSearchSettings.getOcrOption();
@ -152,12 +153,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
reloadScriptsCheckBoxes(); reloadScriptsCheckBoxes();
boolean utf16 boolean utf16
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString())); = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString()));
enableUTF16Checkbox.setSelected(utf16); enableUTF16Checkbox.setSelected(utf16);
boolean utf8 boolean utf8
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())); = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString()));
enableUTF8Checkbox.setSelected(utf8); enableUTF8Checkbox.setSelected(utf8);
final boolean extractEnabled = utf16 || utf8; final boolean extractEnabled = utf16 || utf8;
@ -316,9 +317,9 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
@Override @Override
public void store() { public void store() {
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), KeywordSearchSettings.setStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString(),
Boolean.toString(enableUTF8Checkbox.isSelected())); Boolean.toString(enableUTF8Checkbox.isSelected()));
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), KeywordSearchSettings.setStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString(),
Boolean.toString(enableUTF16Checkbox.isSelected())); Boolean.toString(enableUTF16Checkbox.isSelected()));
KeywordSearchSettings.setOcrOption(enableOcrCheckbox.isSelected()); KeywordSearchSettings.setOcrOption(enableOcrCheckbox.isSelected());

View File

@ -18,14 +18,18 @@
*/ */
package org.sleuthkit.autopsy.keywordsearch; package org.sleuthkit.autopsy.keywordsearch;
import java.util.ArrayList; import com.google.common.collect.ImmutableList;
import java.io.Reader;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level; import java.util.logging.Level;
import org.openide.util.Exceptions;
import org.openide.util.Lookup;
import org.openide.util.NbBundle; import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages; import org.openide.util.NbBundle.Messages;
import org.openide.util.lookup.Lookups;
import org.sleuthkit.autopsy.casemodule.Case; import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException; import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Logger;
@ -37,9 +41,15 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter; import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.ingest.IngestServices; import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector; import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskData; import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown; import org.sleuthkit.datamodel.TskData.FileKnown;
@ -61,6 +71,52 @@ import org.sleuthkit.datamodel.TskData.FileKnown;
"CannotRunFileTypeDetection=Unable to run file type detection." "CannotRunFileTypeDetection=Unable to run file type detection."
}) })
public final class KeywordSearchIngestModule implements FileIngestModule { public final class KeywordSearchIngestModule implements FileIngestModule {
/**
* MIME types of archive and compressed formats. Generally, text extractors
* should ignore archives and let the unpacking modules take care of them.
* Note that the list also contains application/x-executable, which is not an
* archive format.
*/
public static final List<String> ARCHIVE_MIME_TYPES
= ImmutableList.of(
// Ignore unstructured binary and compressed data, for which string extraction or an unpacking module works better.
"application/x-7z-compressed", //NON-NLS
"application/x-ace-compressed", //NON-NLS
"application/x-alz-compressed", //NON-NLS
"application/x-arj", //NON-NLS
"application/vnd.ms-cab-compressed", //NON-NLS
"application/x-cfs-compressed", //NON-NLS
"application/x-dgc-compressed", //NON-NLS
"application/x-apple-diskimage", //NON-NLS
"application/x-gca-compressed", //NON-NLS
"application/x-dar", //NON-NLS
"application/x-lzx", //NON-NLS
"application/x-lzh", //NON-NLS
"application/x-rar-compressed", //NON-NLS
"application/x-stuffit", //NON-NLS
"application/x-stuffitx", //NON-NLS
"application/x-gtar", //NON-NLS
"application/x-archive", //NON-NLS
"application/x-executable", //NON-NLS
"application/x-gzip", //NON-NLS
"application/zip", //NON-NLS
"application/x-zoo", //NON-NLS
"application/x-cpio", //NON-NLS
"application/x-shar", //NON-NLS
"application/x-tar", //NON-NLS
"application/x-bzip", //NON-NLS
"application/x-bzip2", //NON-NLS
"application/x-lzip", //NON-NLS
"application/x-lzma", //NON-NLS
"application/x-lzop", //NON-NLS
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
/**
 * Keys for the strings-extraction encoding options.
 *
 * The constant names double as the setting keys: callers pass
 * {@code toString()} of a constant to KeywordSearchSettings, so renaming a
 * constant changes the key that is looked up.
 */
enum StringsExtractOptions {
    /** Extract UTF-16 text; the stored setting value is "true" or "false". */
    EXTRACT_UTF16,
    /** Extract UTF-8 text; the stored setting value is "true" or "false". */
    EXTRACT_UTF8
}
enum UpdateFrequency { enum UpdateFrequency {
@ -89,13 +145,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
//accessed read-only by searcher thread //accessed read-only by searcher thread
private boolean startedSearching = false; private boolean startedSearching = false;
private List<ContentTextExtractor> textExtractors; private Lookup stringsExtractionContext;
private StringsTextExtractor stringExtractor;
private TextFileExtractor txtFileExtractor;
private final KeywordSearchJobSettings settings; private final KeywordSearchJobSettings settings;
private boolean initialized = false; private boolean initialized = false;
private long jobId; private long jobId;
private long dataSourceId;
private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
private int instanceNum = 0; private int instanceNum = 0;
private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter(); private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
@ -152,7 +205,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
public void startUp(IngestJobContext context) throws IngestModuleException { public void startUp(IngestJobContext context) throws IngestModuleException {
initialized = false; initialized = false;
jobId = context.getJobId(); jobId = context.getJobId();
dataSourceId = context.getDataSource().getId();
Server server = KeywordSearch.getServer(); Server server = KeywordSearch.getServer();
if (server.coreIsOpen() == false) { if (server.coreIsOpen() == false) {
@ -238,22 +290,15 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
} }
} }
} }
//initialize extractors DefaultExtractionConfig stringsConfig = new DefaultExtractionConfig();
stringExtractor = new StringsTextExtractor(); Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts()); stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions()); stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts());
txtFileExtractor = new TextFileExtractor();
stringsExtractionContext = Lookups.fixed(stringsConfig);
textExtractors = new ArrayList<>();
//order matters, more specific extractors first
textExtractors.add(new HtmlTextExtractor());
//Add sqlite text extractor to be default for sqlite files, since tika stuggles
//with them. See SqliteTextExtractor class for specifics
textExtractors.add(new SqliteTextExtractor());
textExtractors.add(new TikaTextExtractor());
indexer = new Indexer(); indexer = new Indexer();
initialized = true; initialized = true;
} }
@ -345,10 +390,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
* Common cleanup code when module stops or final searcher completes * Common cleanup code when module stops or final searcher completes
*/ */
private void cleanup() { private void cleanup() {
textExtractors.clear(); stringsExtractionContext = null;
textExtractors = null;
stringExtractor = null;
txtFileExtractor = null;
initialized = false; initialized = false;
} }
@ -436,24 +478,18 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
* @throws IngesterException exception thrown if indexing failed * @throws IngesterException exception thrown if indexing failed
*/ */
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException { private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
ContentTextExtractor extractor = null; ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig();
imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
//go over available text extractors in order, and pick the first one (most specific one) Lookup extractionContext = Lookups.fixed(imageConfig);
for (ContentTextExtractor fe : textExtractors) {
if (fe.isSupported(aFile, detectedFormat)) { try {
extractor = fe; Reader specializedReader = TextExtractorFactory.getExtractor(aFile,extractionContext).getReader();
break; //divide into chunks and index
} return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
} } catch (TextExtractorFactory.NoTextExtractorFound | ExtractionException ex) {
//No text extractor found... run the default instead
if (extractor == null) {
// No text extractor found.
return false; return false;
} }
//logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
//divide into chunks and index
return Ingester.getDefault().indexText(extractor, aFile, context);
} }
/** /**
@ -469,7 +505,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if (context.fileIngestIsCancelled()) { if (context.fileIngestIsCancelled()) {
return true; return true;
} }
if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) { Reader stringsReader = TextExtractorFactory.getDefaultExtractor(aFile, stringsExtractionContext).getReader();
if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED); putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
return true; return true;
} else { } else {
@ -477,7 +514,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT); putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
return false; return false;
} }
} catch (IngesterException ex) { } catch (IngesterException | ExtractionException ex) {
logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING); putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
return false; return false;
@ -529,7 +566,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
// we skip archive formats that are opened by the archive module. // we skip archive formats that are opened by the archive module.
// @@@ We could have a check here to see if the archive module was enabled though... // @@@ We could have a check here to see if the archive module was enabled though...
if (ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) { if (ARCHIVE_MIME_TYPES.contains(fileType)) {
try { try {
if (context.fileIngestIsCancelled()) { if (context.fileIngestIsCancelled()) {
return; return;
@ -577,11 +614,13 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
//Carved Files should be the only type of unallocated files capable of a txt extension and //Carved Files should be the only type of unallocated files capable of a txt extension and
//should be ignored by the TextFileExtractor because they may contain more than one text encoding //should be ignored by the TextFileExtractor because they may contain more than one text encoding
try { try {
if (Ingester.getDefault().indexText(txtFileExtractor, aFile, context)) { TextFileExtractor textFileExtractor = new TextFileExtractor();
Reader textReader = textFileExtractor.getReader(aFile);
if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED); putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
wasTextAdded = true; wasTextAdded = true;
} }
} catch (IngesterException ex) { } catch (IngesterException | TextFileExtractorException ex) {
logger.log(Level.WARNING, "Unable to index as unicode", ex); logger.log(Level.WARNING, "Unable to index as unicode", ex);
} }
} }

View File

@ -31,6 +31,7 @@ import javax.swing.table.TableColumn;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestModuleIngestJobSettings; import org.sleuthkit.autopsy.ingest.IngestModuleIngestJobSettings;
import org.sleuthkit.autopsy.ingest.IngestModuleIngestJobSettingsPanel; import org.sleuthkit.autopsy.ingest.IngestModuleIngestJobSettingsPanel;
import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.StringsExtractOptions;
/** /**
* Ingest job settings panel for keyword search file ingest modules. * Ingest job settings panel for keyword search file ingest modules.
@ -102,8 +103,8 @@ public final class KeywordSearchJobSettingsPanel extends IngestModuleIngestJobSe
} }
private void displayEncodings() { private void displayEncodings() {
String utf8 = KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()); String utf8 = KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString());
String utf16 = KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()); String utf16 = KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString());
ArrayList<String> encodingsList = new ArrayList<>(); ArrayList<String> encodingsList = new ArrayList<>();
if (utf8 == null || Boolean.parseBoolean(utf8)) { if (utf8 == null || Boolean.parseBoolean(utf8)) {
encodingsList.add("UTF8"); encodingsList.add("UTF8");

View File

@ -28,6 +28,7 @@ import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.ModuleSettings; import org.sleuthkit.autopsy.coreutils.ModuleSettings;
import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.StringsExtractOptions;
import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.UpdateFrequency; import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.UpdateFrequency;
//This file contains constants and settings for KeywordSearch //This file contains constants and settings for KeywordSearch
@ -234,14 +235,14 @@ class KeywordSearchSettings {
KeywordSearchSettings.setUpdateFrequency(UpdateFrequency.DEFAULT); KeywordSearchSettings.setUpdateFrequency(UpdateFrequency.DEFAULT);
} }
//setting default Extract UTF8 //setting default Extract UTF8
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) { if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsExtractOptions.EXTRACT_UTF8.toString())) {
logger.log(Level.INFO, "No configuration for UTF8 found, generating default..."); //NON-NLS logger.log(Level.INFO, "No configuration for UTF8 found, generating default..."); //NON-NLS
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString()); KeywordSearchSettings.setStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
} }
//setting default Extract UTF16 //setting default Extract UTF16
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) { if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsExtractOptions.EXTRACT_UTF16.toString())) {
logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString()); KeywordSearchSettings.setStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
} }
//setting OCR default (disabled by default) //setting OCR default (disabled by default)
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, OCR_ENABLED)) { if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, OCR_ENABLED)) {

View File

@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException; import java.lang.reflect.InvocationTargetException;
import java.net.InetAddress; import java.net.InetAddress;
import java.util.ArrayList; import java.util.ArrayList;
@ -45,6 +46,9 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
import org.sleuthkit.autopsy.progress.ProgressIndicator; import org.sleuthkit.autopsy.progress.ProgressIndicator;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException; import org.sleuthkit.datamodel.TskCoreException;
@ -112,19 +116,24 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
return; return;
} }
try { try {
ingester.indexMetaDataOnly(artifact); Reader blackboardReader = TextExtractorFactory
ingester.indexText(new ArtifactTextExtractor(), artifact, null); .getExtractor(content, null).getReader();
} catch (Ingester.IngesterException ex) { String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
ingester.indexMetaDataOnly(artifact, sourceName);
ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
} catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | ExtractionException ex) {
throw new TskCoreException(ex.getCause().getMessage(), ex); throw new TskCoreException(ex.getCause().getMessage(), ex);
} }
} else { } else {
try { try {
ingester.indexText(new TikaTextExtractor(), content, null); Reader contentReader = TextExtractorFactory
} catch (Ingester.IngesterException ex) { .getExtractor(content, null).getReader();
ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
} catch (TextExtractorFactory.NoTextExtractorFound | ExtractionException | Ingester.IngesterException ex) {
try { try {
// Try the StringsTextExtractor if Tika extractions fails. // Try the StringsTextExtractor if Tika extractions fails.
ingester.indexText(new StringsTextExtractor(), content, null); ingester.indexText(TextExtractorFactory.getDefaultExtractor(content, null).getReader(),content.getId(),content.getName(), content, null);
} catch (Ingester.IngesterException ex1) { } catch (Ingester.IngesterException | ExtractionException ex1) {
throw new TskCoreException(ex.getCause().getMessage(), ex1); throw new TskCoreException(ex.getCause().getMessage(), ex1);
} }
} }
@ -437,9 +446,12 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
final Ingester ingester = Ingester.getDefault(); final Ingester ingester = Ingester.getDefault();
try { try {
ingester.indexMetaDataOnly(artifact); String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
ingester.indexText(new ArtifactTextExtractor(), artifact, null); Reader contentSpecificReader =
} catch (Ingester.IngesterException ex) { TextExtractorFactory.getExtractor((Content) artifact, null).getReader();
ingester.indexMetaDataOnly(artifact, sourceName);
ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
} catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | ExtractionException ex) {
throw new TskCoreException(ex.getCause().getMessage(), ex); throw new TskCoreException(ex.getCause().getMessage(), ex);
} }
} }

View File

@ -1,88 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-16 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.Reader;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
/**
* Extracts text out of a SleuthkitVisitableItem and exposes it as a Reader.
* This Reader is given to the Ingester to chunk and index in Solr.
*
* @param <TextSource> The subtype of SleuthkitVisitableItem an implementation
* is able to process.
*/
interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
/**
* Is this extractor configured such that no extraction will/should be done?
*
* @return True if this extractor will/should not perform any extraction.
*/
abstract boolean isDisabled();
/**
* Log the given message and exception as a warning.
*
* @param msg The warning message to log.
* @param ex The exception associated with the warning.
*/
abstract void logWarning(String msg, Exception ex);
/**
* Get a reader over the text extracted from the given source.
*
* @param source The item to extract text from.
*
* @return A Reader over the extracted text.
*
* @throws TextExtractorException NOTE(review): the failure conditions are
* defined by the implementations, which are not visible here.
*/
abstract Reader getReader(TextSource source) throws TextExtractorException;
/**
* Get the 'object' id of the given source.
*
* @param source The item whose id is requested.
*
* @return The id of the given source.
*/
abstract long getID(TextSource source);
/**
* Get a human readable name for the given source.
*
* @param source The item whose name is requested.
*
* @return A human readable name for the given source.
*/
abstract String getName(TextSource source);
/**
* Checked exception declared by getReader().
*/
class TextExtractorException extends Exception {
/**
* Constructs an exception with the given message.
*
* @param message The exception message.
*/
public TextExtractorException(String message) {
super(message);
}
/**
* Constructs an exception with the given message and cause.
*
* @param message The exception message.
* @param cause The underlying cause of this exception.
*/
public TextExtractorException(String message, Throwable cause) {
super(message, cause);
}
}
}

View File

@ -21,17 +21,15 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.Reader; import java.io.Reader;
import java.util.logging.Level;
import org.apache.tika.parser.txt.CharsetDetector; import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch; import org.apache.tika.parser.txt.CharsetMatch;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ReadContentInputStream; import org.sleuthkit.datamodel.ReadContentInputStream;
/** /**
* Extract text from .txt files * Extract text from .txt files
*/ */
final class TextFileExtractor extends ContentTextExtractor { final class TextFileExtractor {
//Set a Minimum confidence value to reject matches that may not have a valid text encoding //Set a Minimum confidence value to reject matches that may not have a valid text encoding
//Values of valid text encodings were generally 100, xml code sometimes had a value around 50, //Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
@ -39,44 +37,30 @@ final class TextFileExtractor extends ContentTextExtractor {
//This limited information was used to select the current value as one that would filter out clearly non-text //This limited information was used to select the current value as one that would filter out clearly non-text
//files while hopefully working on all files with a valid text encoding //files while hopefully working on all files with a valid text encoding
static final private int MIN_MATCH_CONFIDENCE = 20; static final private int MIN_MATCH_CONFIDENCE = 20;
static final private Logger logger = Logger.getLogger(TextFileExtractor.class.getName());
@Override public Reader getReader(AbstractFile source) throws TextFileExtractorException {
boolean isContentTypeSpecific() {
return true;
}
@Override
boolean isSupported(Content file, String detectedFormat) {
return true;
}
@Override
public Reader getReader(Content source) throws TextExtractorException {
CharsetDetector detector = new CharsetDetector(); CharsetDetector detector = new CharsetDetector();
//wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector //wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
InputStream stream = new BufferedInputStream(new ReadContentInputStream(source)); InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
try { try {
detector.setText(stream); detector.setText(stream);
} catch (IOException ex) { } catch (IOException ex) {
throw new TextExtractorException("Unable to get string from detected text in TextFileExtractor", ex); throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
} }
CharsetMatch match = detector.detect(); CharsetMatch match = detector.detect();
if (match.getConfidence() < MIN_MATCH_CONFIDENCE) { if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
throw new TextExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor"); throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
} }
return match.getReader(); return match.getReader();
} }
@Override public class TextFileExtractorException extends Exception {
public boolean isDisabled() { public TextFileExtractorException(String msg, Throwable ex) {
return false; super(msg, ex);
}
public TextFileExtractorException(String msg) {
super(msg);
}
} }
@Override
public void logWarning(String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex);
}
} }