mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-14 17:06:16 +00:00
Merge pull request #4330 from dannysmyda/4425-text-abstraction-impl
4425 - Move TextExtractors out of KWS and into Core.
This commit is contained in:
commit
14d6c52e09
@ -35,6 +35,11 @@
|
|||||||
<dependency conf="core->default" org="com.fasterxml.jackson.core" name="jackson-core" rev="2.9.7"/>
|
<dependency conf="core->default" org="com.fasterxml.jackson.core" name="jackson-core" rev="2.9.7"/>
|
||||||
|
|
||||||
<dependency conf="core->default" org="commons-validator" name="commons-validator" rev="1.6"/>
|
<dependency conf="core->default" org="commons-validator" name="commons-validator" rev="1.6"/>
|
||||||
|
<dependency conf="core->default" org="net.htmlparser.jericho" name="jericho-html" rev="3.3"/>
|
||||||
|
|
||||||
|
<!-- Tika 1.14 seems to declare a (transitive?) dependency on cleartk-util 3.2.2, but the most recent
|
||||||
|
version available is 2.0.0. Overriding the version worked. -->
|
||||||
|
<override org="org.cleartk" module="cleartk-util" rev="2.0.0"/>
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</ivy-module>
|
</ivy-module>
|
||||||
|
@ -1,26 +1,59 @@
|
|||||||
file.reference.activemq-all-5.11.1.jar=release/modules/ext/activemq-all-5.11.1.jar
|
file.reference.activemq-all-5.11.1.jar=release/modules/ext/activemq-all-5.11.1.jar
|
||||||
|
file.reference.apache-mime4j-core-0.8.1.jar=release/modules/ext/apache-mime4j-core-0.8.1.jar
|
||||||
|
file.reference.apache-mime4j-dom-0.8.1.jar=release/modules/ext/apache-mime4j-dom-0.8.1.jar
|
||||||
|
file.reference.asm-5.0.4.jar=release/modules/ext/asm-5.0.4.jar
|
||||||
|
file.reference.bcmail-jdk15on-1.54.jar=release/modules/ext/bcmail-jdk15on-1.54.jar
|
||||||
|
file.reference.bcprov-jdk15on-1.54.jar=release/modules/ext/bcprov-jdk15on-1.54.jar
|
||||||
|
file.reference.boilerpipe-1.1.0.jar=release/modules/ext/boilerpipe-1.1.0.jar
|
||||||
file.reference.c3p0-0.9.5.jar=release/modules/ext/c3p0-0.9.5.jar
|
file.reference.c3p0-0.9.5.jar=release/modules/ext/c3p0-0.9.5.jar
|
||||||
|
file.reference.cdm-4.5.5.jar=release/modules/ext/cdm-4.5.5.jar
|
||||||
|
file.reference.commons-codec-1.6.jar=release/modules/ext/commons-codec-1.6.jar
|
||||||
file.reference.commons-compress-1.14.jar=release/modules/ext/commons-compress-1.14.jar
|
file.reference.commons-compress-1.14.jar=release/modules/ext/commons-compress-1.14.jar
|
||||||
file.reference.commons-dbcp2-2.1.1.jar=release\\modules\\ext\\commons-dbcp2-2.1.1.jar
|
file.reference.commons-dbcp2-2.1.1.jar=release/modules/ext/commons-dbcp2-2.1.1.jar
|
||||||
file.reference.commons-pool2-2.4.2.jar=release\\modules\\ext\\commons-pool2-2.4.2.jar
|
file.reference.commons-io-2.5.jar=release/modules/ext/commons-io-2.5.jar
|
||||||
|
file.reference.commons-pool2-2.4.2.jar=release/modules/ext/commons-pool2-2.4.2.jar
|
||||||
file.reference.dd-plist-1.20.jar=release/modules/ext/dd-plist-1.20.jar
|
file.reference.dd-plist-1.20.jar=release/modules/ext/dd-plist-1.20.jar
|
||||||
|
file.reference.geoapi-3.0.0.jar=release/modules/ext/geoapi-3.0.0.jar
|
||||||
|
file.reference.grib-4.5.5.jar=release/modules/ext/grib-4.5.5.jar
|
||||||
|
file.reference.gson-2.8.1.jar=release/modules/ext/gson-2.8.1.jar
|
||||||
|
file.reference.httpservices-4.5.5.jar=release/modules/ext/httpservices-4.5.5.jar
|
||||||
|
file.reference.isoparser-1.1.18.jar=release/modules/ext/isoparser-1.1.18.jar
|
||||||
|
file.reference.jackcess-2.2.0.jar=release/modules/ext/jackcess-2.2.0.jar
|
||||||
|
file.reference.jackcess-encrypt-2.1.4.jar=release/modules/ext/jackcess-encrypt-2.1.4.jar
|
||||||
|
file.reference.java-libpst-0.8.1.jar=release/modules/ext/java-libpst-0.8.1.jar
|
||||||
|
file.reference.jcl-over-slf4j-1.7.24.jar=release/modules/ext/jcl-over-slf4j-1.7.24.jar
|
||||||
file.reference.jackson-core-2.9.7.jar=release/modules/ext/jackson-core-2.9.7.jar
|
file.reference.jackson-core-2.9.7.jar=release/modules/ext/jackson-core-2.9.7.jar
|
||||||
file.reference.jdom-2.0.5-contrib.jar=release/modules/ext/jdom-2.0.5-contrib.jar
|
file.reference.jdom-2.0.5-contrib.jar=release/modules/ext/jdom-2.0.5-contrib.jar
|
||||||
file.reference.jdom-2.0.5.jar=release/modules/ext/jdom-2.0.5.jar
|
file.reference.jdom-2.0.5.jar=release/modules/ext/jdom-2.0.5.jar
|
||||||
|
file.reference.jericho-html-3.3.jar=release/modules/ext/jericho-html-3.3.jar
|
||||||
file.reference.jgraphx-v3.8.0.jar=release/modules/ext/jgraphx-v3.8.0.jar
|
file.reference.jgraphx-v3.8.0.jar=release/modules/ext/jgraphx-v3.8.0.jar
|
||||||
|
file.reference.jhighlight-1.0.2.jar=release/modules/ext/jhighlight-1.0.2.jar
|
||||||
|
file.reference.jmatio-1.2.jar=release/modules/ext/jmatio-1.2.jar
|
||||||
|
file.reference.json-1.8.jar=release/modules/ext/json-1.8.jar
|
||||||
|
file.reference.json-simple-1.1.1.jar=release/modules/ext/json-simple-1.1.1.jar
|
||||||
file.reference.jsoup-1.10.3.jar=release/modules/ext/jsoup-1.10.3.jar
|
file.reference.jsoup-1.10.3.jar=release/modules/ext/jsoup-1.10.3.jar
|
||||||
|
file.reference.jul-to-slf4j-1.7.24.jar=release/modules/ext/jul-to-slf4j-1.7.24.jar
|
||||||
|
file.reference.juniversalchardet-1.0.3.jar=release/modules/ext/juniversalchardet-1.0.3.jar
|
||||||
|
file.reference.junrar-0.7.jar=release/modules/ext/junrar-0.7.jar
|
||||||
file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
|
file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
|
||||||
file.reference.mchange-commons-java-0.2.9.jar=release/modules/ext/mchange-commons-java-0.2.9.jar
|
file.reference.mchange-commons-java-0.2.9.jar=release/modules/ext/mchange-commons-java-0.2.9.jar
|
||||||
file.reference.metadata-extractor-2.10.1.jar=release/modules/ext/metadata-extractor-2.10.1.jar
|
file.reference.metadata-extractor-2.10.1.jar=release/modules/ext/metadata-extractor-2.10.1.jar
|
||||||
|
file.reference.netcdf4-4.5.5.jar=release/modules/ext/netcdf4-4.5.5.jar
|
||||||
|
file.reference.opennlp-tools-1.8.3.jar=release/modules/ext/opennlp-tools-1.8.3.jar
|
||||||
|
file.reference.poi-3.17.jar=release/modules/ext/poi-3.17.jar
|
||||||
|
file.reference.poi-ooxml-3.17.jar=release/modules/ext/poi-ooxml-3.17.jar
|
||||||
|
file.reference.poi-scratchpad-3.17.jar=release/modules/ext/poi-scratchpad-3.17.jar
|
||||||
file.reference.postgresql-9.4.1211.jre7.jar=release/modules/ext/postgresql-9.4.1211.jre7.jar
|
file.reference.postgresql-9.4.1211.jre7.jar=release/modules/ext/postgresql-9.4.1211.jre7.jar
|
||||||
file.reference.Rejistry-1.0-SNAPSHOT.jar=release/modules/ext/Rejistry-1.0-SNAPSHOT.jar
|
file.reference.Rejistry-1.0-SNAPSHOT.jar=release/modules/ext/Rejistry-1.0-SNAPSHOT.jar
|
||||||
|
file.reference.rome-1.5.1.jar=release/modules/ext/rome-1.5.1.jar
|
||||||
file.reference.sevenzipjbinding-AllPlatforms.jar=release/modules/ext/sevenzipjbinding-AllPlatforms.jar
|
file.reference.sevenzipjbinding-AllPlatforms.jar=release/modules/ext/sevenzipjbinding-AllPlatforms.jar
|
||||||
file.reference.sevenzipjbinding.jar=release/modules/ext/sevenzipjbinding.jar
|
file.reference.sevenzipjbinding.jar=release/modules/ext/sevenzipjbinding.jar
|
||||||
file.reference.sqlite-jdbc-3.8.11.jar=release\\modules\\ext\\sqlite-jdbc-3.8.11.jar
|
file.reference.sis-metadata-0.6.jar=release/modules/ext/sis-metadata-0.6.jar
|
||||||
|
file.reference.sis-netcdf-0.6.jar=release/modules/ext/sis-netcdf-0.6.jar
|
||||||
|
file.reference.sis-utility-0.6.jar=release/modules/ext/sis-utility-0.6.jar
|
||||||
|
file.reference.slf4j-api-1.7.24.jar=release/modules/ext/slf4j-api-1.7.24.jar
|
||||||
|
file.reference.sqlite-jdbc-3.8.11.jar=release/modules/ext/sqlite-jdbc-3.8.11.jar
|
||||||
file.reference.StixLib.jar=release/modules/ext/StixLib.jar
|
file.reference.StixLib.jar=release/modules/ext/StixLib.jar
|
||||||
file.reference.bcprov-jdk15on-1.54.jar=release/modules/ext/bcprov-jdk15on-1.54.jar
|
|
||||||
file.reference.jackcess-2.2.0.jar=release/modules/ext/jackcess-2.2.0.jar
|
|
||||||
file.reference.jackcess-encrypt-2.1.4.jar=release/modules/ext/jackcess-encrypt-2.1.4.jar
|
|
||||||
file.reference.jempbox-1.8.13.jar=release/modules/ext/jempbox-1.8.13.jar
|
file.reference.jempbox-1.8.13.jar=release/modules/ext/jempbox-1.8.13.jar
|
||||||
file.reference.javax.ws.rs-api-2.0.1.jar=release/modules/ext/javax.ws.rs-api-2.0.1.jar
|
file.reference.javax.ws.rs-api-2.0.1.jar=release/modules/ext/javax.ws.rs-api-2.0.1.jar
|
||||||
file.reference.cxf-core-3.0.16.jar=release/modules/ext/cxf-core-3.0.16.jar
|
file.reference.cxf-core-3.0.16.jar=release/modules/ext/cxf-core-3.0.16.jar
|
||||||
@ -31,11 +64,14 @@ file.reference.fontbox-2.0.8.jar=release/modules/ext/fontbox-2.0.8.jar
|
|||||||
file.reference.pdfbox-2.0.8.jar=release/modules/ext/pdfbox-2.0.8.jar
|
file.reference.pdfbox-2.0.8.jar=release/modules/ext/pdfbox-2.0.8.jar
|
||||||
file.reference.pdfbox-tools-2.0.8.jar=release/modules/ext/pdfbox-tools-2.0.8.jar
|
file.reference.pdfbox-tools-2.0.8.jar=release/modules/ext/pdfbox-tools-2.0.8.jar
|
||||||
file.reference.sleuthkit-postgresql-4.6.4.jar=release/modules/ext/sleuthkit-postgresql-4.6.4.jar
|
file.reference.sleuthkit-postgresql-4.6.4.jar=release/modules/ext/sleuthkit-postgresql-4.6.4.jar
|
||||||
|
file.reference.tagsoup-1.2.1.jar=release/modules/ext/tagsoup-1.2.1.jar
|
||||||
file.reference.tika-core-1.17.jar=release/modules/ext/tika-core-1.17.jar
|
file.reference.tika-core-1.17.jar=release/modules/ext/tika-core-1.17.jar
|
||||||
file.reference.tika-parsers-1.17.jar=release/modules/ext/tika-parsers-1.17.jar
|
file.reference.tika-parsers-1.17.jar=release/modules/ext/tika-parsers-1.17.jar
|
||||||
file.reference.curator-client-2.8.0.jar=release/modules/ext/curator-client-2.8.0.jar
|
file.reference.curator-client-2.8.0.jar=release/modules/ext/curator-client-2.8.0.jar
|
||||||
file.reference.curator-framework-2.8.0.jar=release/modules/ext/curator-framework-2.8.0.jar
|
file.reference.curator-framework-2.8.0.jar=release/modules/ext/curator-framework-2.8.0.jar
|
||||||
file.reference.curator-recipes-2.8.0.jar=release/modules/ext/curator-recipes-2.8.0.jar
|
file.reference.curator-recipes-2.8.0.jar=release/modules/ext/curator-recipes-2.8.0.jar
|
||||||
|
file.reference.vorbis-java-core-0.8.jar=release/modules/ext/vorbis-java-core-0.8.jar
|
||||||
|
file.reference.vorbis-java-tika-0.8.jar=release/modules/ext/vorbis-java-tika-0.8.jar
|
||||||
file.reference.xmpcore-5.1.3.jar=release/modules/ext/xmpcore-5.1.3.jar
|
file.reference.xmpcore-5.1.3.jar=release/modules/ext/xmpcore-5.1.3.jar
|
||||||
file.reference.xz-1.6.jar=release/modules/ext/xz-1.6.jar
|
file.reference.xz-1.6.jar=release/modules/ext/xz-1.6.jar
|
||||||
file.reference.zookeeper-3.4.6.jar=release/modules/ext/zookeeper-3.4.6.jar
|
file.reference.zookeeper-3.4.6.jar=release/modules/ext/zookeeper-3.4.6.jar
|
||||||
|
@ -338,81 +338,59 @@
|
|||||||
<package>org.sleuthkit.autopsy.modules.vmextractor</package>
|
<package>org.sleuthkit.autopsy.modules.vmextractor</package>
|
||||||
<package>org.sleuthkit.autopsy.progress</package>
|
<package>org.sleuthkit.autopsy.progress</package>
|
||||||
<package>org.sleuthkit.autopsy.report</package>
|
<package>org.sleuthkit.autopsy.report</package>
|
||||||
|
<package>org.sleuthkit.autopsy.textextractors</package>
|
||||||
|
<package>org.sleuthkit.autopsy.textextractors.extractionconfigs</package>
|
||||||
<package>org.sleuthkit.autopsy.texttranslation</package>
|
<package>org.sleuthkit.autopsy.texttranslation</package>
|
||||||
<package>org.sleuthkit.datamodel</package>
|
<package>org.sleuthkit.datamodel</package>
|
||||||
</public-packages>
|
</public-packages>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/apache-mime4j-dom-0.8.1.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/apache-mime4j-dom-0.8.1.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/jackcess-2.2.0.jar</runtime-relative-path>
|
<runtime-relative-path>ext/jackcess-2.2.0.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/jackcess-2.2.0.jar</binary-origin>
|
<binary-origin>release/modules/ext/jackcess-2.2.0.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/zookeeper-3.4.6.jar</runtime-relative-path>
|
<runtime-relative-path>ext/jericho-html-3.3.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/zookeeper-3.4.6.jar</binary-origin>
|
<binary-origin>release/modules/ext/jericho-html-3.3.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/jdom-2.0.5.jar</runtime-relative-path>
|
<runtime-relative-path>ext/cdm-4.5.5.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/jdom-2.0.5.jar</binary-origin>
|
<binary-origin>release/modules/ext/cdm-4.5.5.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/cxf-rt-transports-http-3.0.16.jar</runtime-relative-path>
|
<runtime-relative-path>ext/httpservices-4.5.5.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/cxf-rt-transports-http-3.0.16.jar</binary-origin>
|
<binary-origin>release/modules/ext/httpservices-4.5.5.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/commons-validator-1.6.jar</runtime-relative-path>
|
<runtime-relative-path>ext/commons-validator-1.6.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/commons-validator-1.6.jar</binary-origin>
|
<binary-origin>release/modules/ext/commons-validator-1.6.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/curator-framework-2.8.0.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/curator-framework-2.8.0.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/bcprov-jdk15on-1.54.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/bcprov-jdk15on-1.54.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/commons-compress-1.14.jar</runtime-relative-path>
|
<runtime-relative-path>ext/commons-compress-1.14.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/commons-compress-1.14.jar</binary-origin>
|
<binary-origin>release/modules/ext/commons-compress-1.14.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/fontbox-2.0.8.jar</runtime-relative-path>
|
<runtime-relative-path>ext/geoapi-3.0.0.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/fontbox-2.0.8.jar</binary-origin>
|
<binary-origin>release/modules/ext/geoapi-3.0.0.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/commons-dbcp2-2.1.1.jar</runtime-relative-path>
|
<runtime-relative-path>ext/boilerpipe-1.1.0.jar</runtime-relative-path>
|
||||||
<binary-origin>release\modules\ext\commons-dbcp2-2.1.1.jar</binary-origin>
|
<binary-origin>release/modules/ext/boilerpipe-1.1.0.jar</binary-origin>
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/jgraphx-v3.8.0.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/jgraphx-v3.8.0.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/jython-standalone-2.7.0.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/jython-standalone-2.7.0.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/sevenzipjbinding.jar</runtime-relative-path>
|
<runtime-relative-path>ext/sevenzipjbinding.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/sevenzipjbinding.jar</binary-origin>
|
<binary-origin>release/modules/ext/sevenzipjbinding.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/sleuthkit-postgresql-4.6.4.jar</runtime-relative-path>
|
<runtime-relative-path>ext/bcmail-jdk15on-1.54.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/sleuthkit-postgresql-4.6.4.jar</binary-origin>
|
<binary-origin>release/modules/ext/bcmail-jdk15on-1.54.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/mchange-commons-java-0.2.9.jar</runtime-relative-path>
|
<runtime-relative-path>ext/mchange-commons-java-0.2.9.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/mchange-commons-java-0.2.9.jar</binary-origin>
|
<binary-origin>release/modules/ext/mchange-commons-java-0.2.9.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/cxf-core-3.0.16.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/cxf-core-3.0.16.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/javax.ws.rs-api-2.0.1.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/javax.ws.rs-api-2.0.1.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/postgresql-9.4.1211.jre7.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/postgresql-9.4.1211.jre7.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/curator-recipes-2.8.0.jar</runtime-relative-path>
|
<runtime-relative-path>ext/curator-recipes-2.8.0.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/curator-recipes-2.8.0.jar</binary-origin>
|
<binary-origin>release/modules/ext/curator-recipes-2.8.0.jar</binary-origin>
|
||||||
@ -421,6 +399,14 @@
|
|||||||
<runtime-relative-path>ext/metadata-extractor-2.10.1.jar</runtime-relative-path>
|
<runtime-relative-path>ext/metadata-extractor-2.10.1.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/metadata-extractor-2.10.1.jar</binary-origin>
|
<binary-origin>release/modules/ext/metadata-extractor-2.10.1.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/apache-mime4j-core-0.8.1.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/apache-mime4j-core-0.8.1.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/tagsoup-1.2.1.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/tagsoup-1.2.1.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/tika-core-1.17.jar</runtime-relative-path>
|
<runtime-relative-path>ext/tika-core-1.17.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/tika-core-1.17.jar</binary-origin>
|
<binary-origin>release/modules/ext/tika-core-1.17.jar</binary-origin>
|
||||||
@ -429,45 +415,37 @@
|
|||||||
<runtime-relative-path>ext/StixLib.jar</runtime-relative-path>
|
<runtime-relative-path>ext/StixLib.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/StixLib.jar</binary-origin>
|
<binary-origin>release/modules/ext/StixLib.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/curator-client-2.8.0.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/curator-client-2.8.0.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/jackson-core-2.9.7.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/jackson-core-2.9.7.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/cxf-rt-frontend-jaxrs-3.0.16.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/cxf-rt-frontend-jaxrs-3.0.16.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/pdfbox-tools-2.0.8.jar</runtime-relative-path>
|
<runtime-relative-path>ext/pdfbox-tools-2.0.8.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/pdfbox-tools-2.0.8.jar</binary-origin>
|
<binary-origin>release/modules/ext/pdfbox-tools-2.0.8.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/asm-5.0.4.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/asm-5.0.4.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/jcl-over-slf4j-1.7.24.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/jcl-over-slf4j-1.7.24.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/tika-parsers-1.17.jar</runtime-relative-path>
|
<runtime-relative-path>ext/tika-parsers-1.17.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/tika-parsers-1.17.jar</binary-origin>
|
<binary-origin>release/modules/ext/tika-parsers-1.17.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/sqlite-jdbc-3.8.11.jar</runtime-relative-path>
|
<runtime-relative-path>ext/sqlite-jdbc-3.8.11.jar</runtime-relative-path>
|
||||||
<binary-origin>release\modules\ext\sqlite-jdbc-3.8.11.jar</binary-origin>
|
<binary-origin>release/modules/ext/sqlite-jdbc-3.8.11.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/activemq-all-5.11.1.jar</runtime-relative-path>
|
<runtime-relative-path>ext/json-simple-1.1.1.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/activemq-all-5.11.1.jar</binary-origin>
|
<binary-origin>release/modules/ext/json-simple-1.1.1.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/xz-1.6.jar</runtime-relative-path>
|
<runtime-relative-path>ext/sis-utility-0.6.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/xz-1.6.jar</binary-origin>
|
<binary-origin>release/modules/ext/sis-utility-0.6.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/Rejistry-1.0-SNAPSHOT.jar</runtime-relative-path>
|
<runtime-relative-path>ext/jhighlight-1.0.2.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/Rejistry-1.0-SNAPSHOT.jar</binary-origin>
|
<binary-origin>release/modules/ext/jhighlight-1.0.2.jar</binary-origin>
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/dd-plist-1.20.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/dd-plist-1.20.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/jempbox-1.8.13.jar</runtime-relative-path>
|
<runtime-relative-path>ext/jempbox-1.8.13.jar</runtime-relative-path>
|
||||||
@ -477,21 +455,9 @@
|
|||||||
<runtime-relative-path>ext/cxf-rt-rs-client-3.0.16.jar</runtime-relative-path>
|
<runtime-relative-path>ext/cxf-rt-rs-client-3.0.16.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/cxf-rt-rs-client-3.0.16.jar</binary-origin>
|
<binary-origin>release/modules/ext/cxf-rt-rs-client-3.0.16.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/sevenzipjbinding-AllPlatforms.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/sevenzipjbinding-AllPlatforms.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/commons-pool2-2.4.2.jar</runtime-relative-path>
|
<runtime-relative-path>ext/commons-pool2-2.4.2.jar</runtime-relative-path>
|
||||||
<binary-origin>release\modules\ext\commons-pool2-2.4.2.jar</binary-origin>
|
<binary-origin>release/modules/ext/commons-pool2-2.4.2.jar</binary-origin>
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/jackcess-encrypt-2.1.4.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/jackcess-encrypt-2.1.4.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
|
||||||
<class-path-extension>
|
|
||||||
<runtime-relative-path>ext/jsoup-1.10.3.jar</runtime-relative-path>
|
|
||||||
<binary-origin>release/modules/ext/jsoup-1.10.3.jar</binary-origin>
|
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
<class-path-extension>
|
<class-path-extension>
|
||||||
<runtime-relative-path>ext/jdom-2.0.5-contrib.jar</runtime-relative-path>
|
<runtime-relative-path>ext/jdom-2.0.5-contrib.jar</runtime-relative-path>
|
||||||
@ -513,6 +479,190 @@
|
|||||||
<runtime-relative-path>ext/xmpcore-5.1.3.jar</runtime-relative-path>
|
<runtime-relative-path>ext/xmpcore-5.1.3.jar</runtime-relative-path>
|
||||||
<binary-origin>release/modules/ext/xmpcore-5.1.3.jar</binary-origin>
|
<binary-origin>release/modules/ext/xmpcore-5.1.3.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/zookeeper-3.4.6.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/zookeeper-3.4.6.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/jdom-2.0.5.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/jdom-2.0.5.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/cxf-rt-transports-http-3.0.16.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/cxf-rt-transports-http-3.0.16.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/sis-metadata-0.6.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/sis-metadata-0.6.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/isoparser-1.1.18.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/isoparser-1.1.18.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/sleuthkit-postgresql-4.6.4.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/sleuthkit-postgresql-4.6.4.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/vorbis-java-core-0.8.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/vorbis-java-core-0.8.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/commons-codec-1.6.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/commons-codec-1.6.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/netcdf4-4.5.5.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/netcdf4-4.5.5.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/slf4j-api-1.7.24.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/slf4j-api-1.7.24.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/java-libpst-0.8.1.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/java-libpst-0.8.1.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/jul-to-slf4j-1.7.24.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/jul-to-slf4j-1.7.24.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/gson-2.8.1.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/gson-2.8.1.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/poi-3.17.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/poi-3.17.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/poi-scratchpad-3.17.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/poi-scratchpad-3.17.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/sis-netcdf-0.6.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/sis-netcdf-0.6.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/commons-io-2.5.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/commons-io-2.5.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/curator-framework-2.8.0.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/curator-framework-2.8.0.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/bcprov-jdk15on-1.54.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/bcprov-jdk15on-1.54.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/fontbox-2.0.8.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/fontbox-2.0.8.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/commons-dbcp2-2.1.1.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/commons-dbcp2-2.1.1.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/jgraphx-v3.8.0.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/jgraphx-v3.8.0.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/juniversalchardet-1.0.3.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/juniversalchardet-1.0.3.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/jython-standalone-2.7.0.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/jython-standalone-2.7.0.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/jackcess-encrypt-2.1.4.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/jackcess-encrypt-2.1.4.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/cxf-core-3.0.16.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/cxf-core-3.0.16.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/javax.ws.rs-api-2.0.1.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/javax.ws.rs-api-2.0.1.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/opennlp-tools-1.8.3.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/opennlp-tools-1.8.3.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/junrar-0.7.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/junrar-0.7.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/postgresql-9.4.1211.jre7.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/postgresql-9.4.1211.jre7.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/poi-ooxml-3.17.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/poi-ooxml-3.17.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/curator-client-2.8.0.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/curator-client-2.8.0.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/jackson-core-2.9.7.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/jackson-core-2.9.7.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/cxf-rt-frontend-jaxrs-3.0.16.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/cxf-rt-frontend-jaxrs-3.0.16.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/grib-4.5.5.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/grib-4.5.5.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/jackson-core-2.9.2.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/jackson-core-2.9.2.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/activemq-all-5.11.1.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/activemq-all-5.11.1.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/xz-1.6.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/xz-1.6.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/Rejistry-1.0-SNAPSHOT.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/Rejistry-1.0-SNAPSHOT.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/dd-plist-1.20.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/dd-plist-1.20.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/rome-1.5.1.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/rome-1.5.1.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/sevenzipjbinding-AllPlatforms.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/sevenzipjbinding-AllPlatforms.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/jmatio-1.2.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/jmatio-1.2.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/jsoup-1.10.3.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/jsoup-1.10.3.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/vorbis-java-tika-0.8.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/vorbis-java-tika-0.8.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/json-1.8.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/json-1.8.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
</data>
|
</data>
|
||||||
</configuration>
|
</configuration>
|
||||||
</project>
|
</project>
|
||||||
|
@ -0,0 +1,89 @@
|
|||||||
|
/*
|
||||||
|
* Autopsy Forensic Browser
|
||||||
|
*
|
||||||
|
* Copyright 2011-2018 Basis Technology Corp.
|
||||||
|
* Contact: carrier <at> sleuthkit <dot> org
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.sleuthkit.autopsy.textextractors;
|
||||||
|
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.sleuthkit.autopsy.datamodel.ContentUtils;
|
||||||
|
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||||
|
import org.sleuthkit.datamodel.BlackboardAttribute;
|
||||||
|
import org.sleuthkit.datamodel.Content;
|
||||||
|
import org.sleuthkit.datamodel.TskCoreException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts text from artifacts by concatenating the values of all of the
|
||||||
|
* artifact's attributes.
|
||||||
|
*/
|
||||||
|
class ArtifactTextExtractor extends TextExtractor {
|
||||||
|
|
||||||
|
private final BlackboardArtifact artifact;
|
||||||
|
|
||||||
|
public ArtifactTextExtractor(Content artifact) {
|
||||||
|
this.artifact = (BlackboardArtifact) artifact;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Reader getReader() throws ExtractionException {
|
||||||
|
// Concatenate the string values of all attributes into a single
|
||||||
|
// "content" string to be indexed.
|
||||||
|
StringBuilder artifactContents = new StringBuilder();
|
||||||
|
|
||||||
|
Content dataSource = null;
|
||||||
|
try {
|
||||||
|
dataSource = artifact.getDataSource();
|
||||||
|
} catch (TskCoreException tskCoreException) {
|
||||||
|
throw new ExtractionException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
|
||||||
|
}
|
||||||
|
if (dataSource == null) {
|
||||||
|
throw new ExtractionException("Datasource was null for artifact: " + artifact.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
for (BlackboardAttribute attribute : artifact.getAttributes()) {
|
||||||
|
artifactContents.append(attribute.getAttributeType().getDisplayName());
|
||||||
|
artifactContents.append(" : ");
|
||||||
|
// We have also discussed modifying BlackboardAttribute.getDisplayString()
|
||||||
|
// to magically format datetime attributes but that is complicated by
|
||||||
|
// the fact that BlackboardAttribute exists in Sleuthkit data model
|
||||||
|
// while the utility to determine the timezone to use is in ContentUtils
|
||||||
|
// in the Autopsy datamodel.
|
||||||
|
switch (attribute.getValueType()) {
|
||||||
|
case DATETIME:
|
||||||
|
artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
artifactContents.append(attribute.getDisplayString());
|
||||||
|
}
|
||||||
|
artifactContents.append(System.lineSeparator());
|
||||||
|
}
|
||||||
|
} catch (TskCoreException tskCoreException) {
|
||||||
|
throw new ExtractionException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new InputStreamReader(IOUtils.toInputStream(artifactContents,
|
||||||
|
StandardCharsets.UTF_8), StandardCharsets.UTF_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isSupported(Content file, String detectedFormat) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
@ -16,7 +16,7 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.sleuthkit.autopsy.keywordsearch;
|
package org.sleuthkit.autopsy.textextractors;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
@ -38,10 +38,11 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
|
|||||||
/**
|
/**
|
||||||
* Extracts text from HTML content.
|
* Extracts text from HTML content.
|
||||||
*/
|
*/
|
||||||
class HtmlTextExtractor extends ContentTextExtractor {
|
final class HtmlTextExtractor extends TextExtractor {
|
||||||
|
|
||||||
static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
|
static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
|
||||||
private static final int MAX_SIZE = 50_000_000; //50MB
|
private final int MAX_SIZE;
|
||||||
|
private final Content file;
|
||||||
|
|
||||||
static final List<String> WEB_MIME_TYPES = Arrays.asList(
|
static final List<String> WEB_MIME_TYPES = Arrays.asList(
|
||||||
"application/javascript", //NON-NLS
|
"application/javascript", //NON-NLS
|
||||||
@ -51,27 +52,51 @@ class HtmlTextExtractor extends ContentTextExtractor {
|
|||||||
"text/html", //NON-NLS NON-NLS
|
"text/html", //NON-NLS NON-NLS
|
||||||
"text/javascript" //NON-NLS
|
"text/javascript" //NON-NLS
|
||||||
);
|
);
|
||||||
|
|
||||||
static {
|
static {
|
||||||
// Disable Jericho HTML Parser log messages.
|
// Disable Jericho HTML Parser log messages.
|
||||||
Config.LoggerProvider = LoggerProvider.DISABLED;
|
Config.LoggerProvider = LoggerProvider.DISABLED;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
/**
|
||||||
boolean isContentTypeSpecific() {
|
* Creates a default instance of the HtmlTextExtractor. Supported file size
|
||||||
return true;
|
* is 50MB.
|
||||||
|
*/
|
||||||
|
public HtmlTextExtractor(Content file) {
|
||||||
|
//Set default to be 50 MB.
|
||||||
|
MAX_SIZE = 50_000_000;
|
||||||
|
this.file = file;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines if this content type is supported by this extractor.
|
||||||
|
*
|
||||||
|
* @param content Content instance to be analyzed
|
||||||
|
* @param detectedFormat Mimetype of content instance
|
||||||
|
*
|
||||||
|
* @return flag indicating support
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
boolean isSupported(Content content, String detectedFormat) {
|
public boolean isSupported(Content content, String detectedFormat) {
|
||||||
return detectedFormat != null
|
return detectedFormat != null
|
||||||
&& WEB_MIME_TYPES.contains(detectedFormat)
|
&& WEB_MIME_TYPES.contains(detectedFormat)
|
||||||
&& content.getSize() <= MAX_SIZE;
|
&& content.getSize() <= MAX_SIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a reader that will iterate over the text of an HTML document.
|
||||||
|
*
|
||||||
|
* @param content Html document source
|
||||||
|
*
|
||||||
|
* @return A reader instance containing the document source text
|
||||||
|
*
|
||||||
|
* @throws ExtractionException
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public Reader getReader(Content content) throws TextExtractorException {
|
public Reader getReader() throws ExtractionException {
|
||||||
ReadContentInputStream stream = new ReadContentInputStream(content);
|
//TODO JIRA-4467, there is only harm in excluding HTML documents greater
|
||||||
|
//than 50MB due to our troubled approach of extraction.
|
||||||
|
ReadContentInputStream stream = new ReadContentInputStream(file);
|
||||||
|
|
||||||
//Parse the stream with Jericho and put the results in a Reader
|
//Parse the stream with Jericho and put the results in a Reader
|
||||||
try {
|
try {
|
||||||
@ -164,17 +189,8 @@ class HtmlTextExtractor extends ContentTextExtractor {
|
|||||||
// All done, now make it a reader
|
// All done, now make it a reader
|
||||||
return new StringReader(stringBuilder.toString());
|
return new StringReader(stringBuilder.toString());
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
throw new TextExtractorException("Error extracting HTML from content.", ex);
|
logger.log(Level.WARNING, "Error extracting HTML from content.", ex);
|
||||||
|
throw new ExtractionException("Error extracting HTML from content.", ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean isDisabled() {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void logWarning(final String msg, Exception ex) {
|
|
||||||
logger.log(Level.WARNING, msg, ex); //NON-NLS }
|
|
||||||
}
|
|
||||||
}
|
}
|
@ -1,24 +1,23 @@
|
|||||||
/*
|
/*
|
||||||
* Autopsy Forensic Browser
|
* Autopsy Forensic Browser
|
||||||
*
|
*
|
||||||
* Copyright 2018-2018 Basis Technology Corp.
|
* Copyright 2018-2018 Basis Technology Corp.
|
||||||
* Contact: carrier <at> sleuthkit <dot> org
|
* Contact: carrier <at> sleuthkit <dot> org
|
||||||
*
|
*
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
* you may not use this file except in compliance with the License.
|
* you may not use this file except in compliance with the License.
|
||||||
* You may obtain a copy of the License at
|
* You may obtain a copy of the License at
|
||||||
*
|
*
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
*
|
*
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.sleuthkit.autopsy.keywordsearch;
|
package org.sleuthkit.autopsy.textextractors;
|
||||||
|
|
||||||
import com.google.common.io.CharSource;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
@ -28,37 +27,27 @@ import java.util.logging.Level;
|
|||||||
import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
|
import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
|
import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
|
||||||
import org.sleuthkit.datamodel.Content;
|
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
|
import org.sleuthkit.datamodel.Content;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Dedicated SqliteTextExtractor to solve the problems associated with Tika's
|
* Extracts text from SQLite database files.
|
||||||
* Sqlite parser.
|
|
||||||
*
|
*
|
||||||
* Tika problems: 1) Tika fails to open virtual tables 2) Tika fails to open
|
* This is a dedicated solution to address the problems associated with
|
||||||
* tables with spaces in table name 3) Tika fails to include the table names in
|
* Tika's sqlite parser (version 1.17), which include the following:
|
||||||
* output (except for the first table it parses)
|
* 1) Virtual tables cause the parser to bail
|
||||||
|
* 2) Tables that contain spaces in their name are not extracted
|
||||||
|
* 3) Table names are not included in its output text
|
||||||
*/
|
*/
|
||||||
class SqliteTextExtractor extends ContentTextExtractor {
|
final class SqliteTextExtractor extends TextExtractor {
|
||||||
|
|
||||||
private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
|
private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
|
||||||
private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
|
private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
|
||||||
|
private final AbstractFile file;
|
||||||
|
|
||||||
@Override
|
public SqliteTextExtractor(Content file) {
|
||||||
boolean isContentTypeSpecific() {
|
this.file = (AbstractFile) file;
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean isDisabled() {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void logWarning(String msg, Exception exception) {
|
|
||||||
logger.log(Level.WARNING, msg, exception); //NON-NLS
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Supports only the sqlite mimetypes
|
* Supports only the sqlite mimetypes
|
||||||
*
|
*
|
||||||
@ -68,44 +57,34 @@ class SqliteTextExtractor extends ContentTextExtractor {
|
|||||||
* @return true if x-sqlite3
|
* @return true if x-sqlite3
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
boolean isSupported(Content file, String detectedFormat) {
|
public boolean isSupported(Content file, String detectedFormat) {
|
||||||
return SQLITE_MIMETYPE.equals(detectedFormat);
|
return SQLITE_MIMETYPE.equals(detectedFormat);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a stream that will read from a sqlite database.
|
* Returns a reader that will iterate over the text of a sqlite database.
|
||||||
*
|
*
|
||||||
* @param source Content file
|
* @param source Content file
|
||||||
*
|
*
|
||||||
* @return An InputStream that reads from a Sqlite database.
|
* @return A Reader that reads from a Sqlite database
|
||||||
*
|
*
|
||||||
* @throws
|
* @throws ExtractionException
|
||||||
* org.sleuthkit.autopsy.keywordsearch.TextExtractor.TextExtractorException
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public Reader getReader(Content source) throws TextExtractorException {
|
public Reader getReader() throws ExtractionException {
|
||||||
//Firewall for any content that is not an AbstractFile
|
return new SQLiteStreamReader(file);
|
||||||
if (!AbstractFile.class.isInstance(source)) {
|
|
||||||
try {
|
|
||||||
return CharSource.wrap("").openStream();
|
|
||||||
} catch (IOException ex) {
|
|
||||||
throw new TextExtractorException("", ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return new SQLiteStreamReader((AbstractFile) source);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Produces a continuous stream of characters from a database file. To
|
* Produces a continuous stream of characters from a database file. To
|
||||||
* achieve this, all table names are queued up and a SQLiteTableReader is
|
* achieve this, all table names are queued up and a SQLiteTableReader is
|
||||||
* used to do the actual queries and table iteration.
|
* used to do the actual queries and table iteration.
|
||||||
*/
|
*/
|
||||||
public class SQLiteStreamReader extends Reader {
|
private class SQLiteStreamReader extends Reader {
|
||||||
|
|
||||||
private final SQLiteTableReader reader;
|
private final SQLiteTableReader reader;
|
||||||
private final AbstractFile file;
|
private final AbstractFile file;
|
||||||
|
|
||||||
private Iterator<String> tableNames;
|
private Iterator<String> tableNames;
|
||||||
private String currentTableName;
|
private String currentTableName;
|
||||||
|
|
||||||
@ -217,9 +196,10 @@ class SqliteTextExtractor extends ContentTextExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads database values into the buffer. This function is responsible for
|
* Reads database values into the buffer. This function is responsible
|
||||||
* getting the next table in the queue, initiating calls to the SQLiteTableReader,
|
* for getting the next table in the queue, initiating calls to the
|
||||||
* and filling in any excess bytes that are lingering from the previous call.
|
* SQLiteTableReader, and filling in any excess bytes that are lingering
|
||||||
|
* from the previous call.
|
||||||
*
|
*
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
@ -255,9 +235,9 @@ class SqliteTextExtractor extends ContentTextExtractor {
|
|||||||
reader.read(currentTableName, () -> bufIndex == len);
|
reader.read(currentTableName, () -> bufIndex == len);
|
||||||
} catch (SQLiteTableReaderException ex) {
|
} catch (SQLiteTableReaderException ex) {
|
||||||
logger.log(Level.WARNING, String.format(
|
logger.log(Level.WARNING, String.format(
|
||||||
"Error attempting to read file table: [%s]" //NON-NLS
|
"Error attempting to read file table: [%s]" //NON-NLS
|
||||||
+ " for file: [%s] (id=%d).", currentTableName, //NON-NLS
|
+ " for file: [%s] (id=%d).", currentTableName, //NON-NLS
|
||||||
file.getName(), file.getId()), ex.getMessage());
|
file.getName(), file.getId()), ex.getMessage());
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (bufIndex == off) {
|
if (bufIndex == off) {
|
||||||
@ -290,8 +270,8 @@ class SqliteTextExtractor extends ContentTextExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wrapper that holds the excess bytes that were left over from the previous
|
* Wrapper that holds the excess bytes that were left over from the
|
||||||
* call to read().
|
* previous call to read().
|
||||||
*/
|
*/
|
||||||
private class ExcessBytes {
|
private class ExcessBytes {
|
||||||
|
|
@ -16,19 +16,19 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.sleuthkit.autopsy.keywordsearch;
|
package org.sleuthkit.autopsy.textextractors;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Objects;
|
||||||
import java.util.logging.Level;
|
import org.openide.util.Lookup;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
|
||||||
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
||||||
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
|
||||||
import org.sleuthkit.datamodel.Content;
|
import org.sleuthkit.datamodel.Content;
|
||||||
import org.sleuthkit.datamodel.TskCoreException;
|
import org.sleuthkit.datamodel.TskCoreException;
|
||||||
import org.sleuthkit.datamodel.TskException;
|
import org.sleuthkit.datamodel.TskException;
|
||||||
@ -36,24 +36,25 @@ import org.sleuthkit.datamodel.TskException;
|
|||||||
/**
|
/**
|
||||||
* Extracts raw strings from content.
|
* Extracts raw strings from content.
|
||||||
*/
|
*/
|
||||||
class StringsTextExtractor extends ContentTextExtractor {
|
final class StringsTextExtractor extends TextExtractor {
|
||||||
|
|
||||||
static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
|
private boolean extractUTF8;
|
||||||
|
private boolean extractUTF16;
|
||||||
/**
|
private final Content content;
|
||||||
* Options for this extractor
|
private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
|
||||||
*/
|
|
||||||
enum ExtractOptions {
|
|
||||||
EXTRACT_UTF16, ///< extract UTF16 text, true/false
|
|
||||||
EXTRACT_UTF8, ///< extract UTF8 text, true/false
|
|
||||||
};
|
|
||||||
|
|
||||||
private final List<SCRIPT> extractScripts = new ArrayList<>();
|
private final List<SCRIPT> extractScripts = new ArrayList<>();
|
||||||
private Map<String, String> extractOptions = new HashMap<>();
|
|
||||||
|
|
||||||
public StringsTextExtractor() {
|
/**
|
||||||
|
* Creates a default StringsTextExtractor instance. The instance will be
|
||||||
|
* configured to run only LATIN_2 as its default extraction script and UTF-8
|
||||||
|
* as its default encoding.
|
||||||
|
*/
|
||||||
|
public StringsTextExtractor(Content content) {
|
||||||
//LATIN_2 is the default script
|
//LATIN_2 is the default script
|
||||||
extractScripts.add(SCRIPT.LATIN_2);
|
extractScripts.add(SCRIPT.LATIN_2);
|
||||||
|
extractUTF8 = true;
|
||||||
|
this.content = content;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -61,56 +62,29 @@ class StringsTextExtractor extends ContentTextExtractor {
|
|||||||
*
|
*
|
||||||
* @param extractScripts scripts to use
|
* @param extractScripts scripts to use
|
||||||
*/
|
*/
|
||||||
public void setScripts(List<SCRIPT> extractScripts) {
|
public final void setScripts(List<SCRIPT> extractScripts) {
|
||||||
|
if (extractScripts == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
this.extractScripts.clear();
|
this.extractScripts.clear();
|
||||||
this.extractScripts.addAll(extractScripts);
|
this.extractScripts.addAll(extractScripts);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the currently used scripts for extraction
|
* Returns a reader that will iterate over the text of the content source.
|
||||||
*
|
*
|
||||||
* @return scripts currently used or null if not supported
|
* @param content Content source of any type
|
||||||
*/
|
|
||||||
public List<SCRIPT> getScripts() {
|
|
||||||
return new ArrayList<>(extractScripts);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get current options
|
|
||||||
*
|
*
|
||||||
* @return currently used, extractor specific options, or null of not
|
* @return A reader instance that content text can be obtained from
|
||||||
* supported
|
|
||||||
*/
|
|
||||||
public Map<String, String> getOptions() {
|
|
||||||
return extractOptions;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Set extractor specific options
|
|
||||||
*
|
*
|
||||||
* @param options options to use
|
* @throws
|
||||||
|
* org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
|
||||||
*/
|
*/
|
||||||
public void setOptions(Map<String, String> options) {
|
|
||||||
this.extractOptions = options;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void logWarning(final String msg, Exception ex) {
|
public InputStreamReader getReader() {
|
||||||
logger.log(Level.WARNING, msg, ex); //NON-NLS }
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean isDisabled() {
|
|
||||||
boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
|
|
||||||
boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
|
|
||||||
|
|
||||||
return extractUTF8 == false && extractUTF16 == false;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public InputStreamReader getReader(Content content) throws TextExtractorException {
|
|
||||||
InputStream stringStream = getInputStream(content);
|
InputStream stringStream = getInputStream(content);
|
||||||
return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
|
return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
|
||||||
}
|
}
|
||||||
|
|
||||||
InputStream getInputStream(Content content) {
|
InputStream getInputStream(Content content) {
|
||||||
@ -118,27 +92,55 @@ class StringsTextExtractor extends ContentTextExtractor {
|
|||||||
if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
|
if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
|
||||||
return new EnglishOnlyStream(content);//optimal for english, english only
|
return new EnglishOnlyStream(content);//optimal for english, english only
|
||||||
} else {
|
} else {
|
||||||
boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
|
|
||||||
boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
|
|
||||||
|
|
||||||
return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16);
|
return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines how the extraction process will proceed given the settings
|
||||||
|
* stored in this context instance.
|
||||||
|
*
|
||||||
|
* See the DefaultExtractionConfig class in the extractionconfigs package
|
||||||
|
* for available settings.
|
||||||
|
*
|
||||||
|
* @param context Lookup instance containing config classes
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isContentTypeSpecific() {
|
public void setExtractionSettings(Lookup context) {
|
||||||
return false;
|
if (context != null) {
|
||||||
}
|
DefaultExtractionConfig configInstance = context.lookup(DefaultExtractionConfig.class);
|
||||||
|
if (configInstance == null) {
|
||||||
@Override
|
return;
|
||||||
public boolean isSupported(Content content, String detectedFormat) {
|
}
|
||||||
// strings can be run on anything.
|
if (Objects.nonNull(configInstance.getExtractUTF8())) {
|
||||||
return true;
|
extractUTF8 = configInstance.getExtractUTF8();
|
||||||
|
}
|
||||||
|
if (Objects.nonNull(configInstance.getExtractUTF16())) {
|
||||||
|
extractUTF16 = configInstance.getExtractUTF16();
|
||||||
|
}
|
||||||
|
if (Objects.nonNull(configInstance.getExtractScripts())) {
|
||||||
|
setScripts(configInstance.getExtractScripts());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Content input string stream reader/converter - given Content,
|
*
|
||||||
* extract strings from it and return encoded bytes via read()
|
* @return
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public boolean isEnabled() {
|
||||||
|
return extractUTF8 || extractUTF16;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
boolean isSupported(Content file, String detectedFormat) {
|
||||||
|
throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Content input string stream reader/converter - given Content, extract
|
||||||
|
* strings from it and return encoded bytes via read()
|
||||||
*
|
*
|
||||||
* Note: the utility supports extraction of only LATIN script and UTF8,
|
* Note: the utility supports extraction of only LATIN script and UTF8,
|
||||||
* UTF16LE, UTF16BE encodings and uses a brute force encoding detection -
|
* UTF16LE, UTF16BE encodings and uses a brute force encoding detection -
|
||||||
@ -150,7 +152,6 @@ class StringsTextExtractor extends ContentTextExtractor {
|
|||||||
*/
|
*/
|
||||||
private static class EnglishOnlyStream extends InputStream {
|
private static class EnglishOnlyStream extends InputStream {
|
||||||
|
|
||||||
private static final Logger logger = Logger.getLogger(EnglishOnlyStream.class.getName());
|
|
||||||
private static final String NLS = Character.toString((char) 10); //new line
|
private static final String NLS = Character.toString((char) 10); //new line
|
||||||
private static final int READ_BUF_SIZE = 65536;
|
private static final int READ_BUF_SIZE = 65536;
|
||||||
private static final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
|
private static final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
|
||||||
@ -244,12 +245,7 @@ class StringsTextExtractor extends ContentTextExtractor {
|
|||||||
}
|
}
|
||||||
//get char from cur read buf
|
//get char from cur read buf
|
||||||
char c = (char) curReadBuf[readBufOffset++];
|
char c = (char) curReadBuf[readBufOffset++];
|
||||||
if (c == 0 && singleConsecZero == false) {
|
singleConsecZero = c == 0 && singleConsecZero == false; //preserve the current sequence if max consec. 1 zero char
|
||||||
//preserve the current sequence if max consec. 1 zero char
|
|
||||||
singleConsecZero = true;
|
|
||||||
} else {
|
|
||||||
singleConsecZero = false;
|
|
||||||
}
|
|
||||||
if (StringExtract.isPrintableAscii(c)) {
|
if (StringExtract.isPrintableAscii(c)) {
|
||||||
tempString.append(c);
|
tempString.append(c);
|
||||||
++tempStringLen;
|
++tempStringLen;
|
||||||
@ -328,7 +324,7 @@ class StringsTextExtractor extends ContentTextExtractor {
|
|||||||
private int copyToReturn(byte[] b, int off, long len) {
|
private int copyToReturn(byte[] b, int off, long len) {
|
||||||
final String curStringS = curString.toString();
|
final String curStringS = curString.toString();
|
||||||
//logger.log(Level.INFO, curStringS);
|
//logger.log(Level.INFO, curStringS);
|
||||||
byte[] stringBytes = curStringS.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
|
byte[] stringBytes = curStringS.getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
|
||||||
System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
|
System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
|
||||||
//logger.log(Level.INFO, curStringS);
|
//logger.log(Level.INFO, curStringS);
|
||||||
//copied all string, reset
|
//copied all string, reset
|
||||||
@ -370,7 +366,6 @@ class StringsTextExtractor extends ContentTextExtractor {
|
|||||||
*/
|
*/
|
||||||
private static class InternationalStream extends InputStream {
|
private static class InternationalStream extends InputStream {
|
||||||
|
|
||||||
private static final Logger logger = Logger.getLogger(InternationalStream.class.getName());
|
|
||||||
private static final int FILE_BUF_SIZE = 1024 * 1024;
|
private static final int FILE_BUF_SIZE = 1024 * 1024;
|
||||||
private final Content content;
|
private final Content content;
|
||||||
private final byte[] oneCharBuf = new byte[1];
|
private final byte[] oneCharBuf = new byte[1];
|
||||||
@ -499,7 +494,7 @@ class StringsTextExtractor extends ContentTextExtractor {
|
|||||||
*/
|
*/
|
||||||
private void convert(int numBytes) {
|
private void convert(int numBytes) {
|
||||||
lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
|
lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
|
||||||
convertBuff = lastExtractResult.getText().getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
|
convertBuff = lastExtractResult.getText().getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
|
||||||
//reset tracking vars
|
//reset tracking vars
|
||||||
if (lastExtractResult.getNumBytes() == 0) {
|
if (lastExtractResult.getNumBytes() == 0) {
|
||||||
bytesInConvertBuff = 0;
|
bytesInConvertBuff = 0;
|
103
Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
Normal file
103
Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
/*
|
||||||
|
* Autopsy Forensic Browser
|
||||||
|
*
|
||||||
|
* Copyright 2011-18 Basis Technology Corp.
|
||||||
|
* Contact: carrier <at> sleuthkit <dot> org
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.sleuthkit.autopsy.textextractors;
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import org.openide.util.Lookup;
|
||||||
|
import org.sleuthkit.datamodel.Content;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts the text out of {@link org.sleuthkit.datamodel.Content} instances
|
||||||
|
* and exposes them as a {@link java.io.Reader}. Concrete implementations can be
|
||||||
|
* obtained from
|
||||||
|
* {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content)}
|
||||||
|
* or
|
||||||
|
* {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content, org.openide.util.Lookup)}.
|
||||||
|
*
|
||||||
|
* @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
|
||||||
|
*/
|
||||||
|
public abstract class TextExtractor {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines if the file content is supported by the extractor.
|
||||||
|
*
|
||||||
|
* @param file to test if its content should be supported
|
||||||
|
* @param detectedFormat mime-type with detected format (such as text/plain)
|
||||||
|
* or null if not detected
|
||||||
|
*
|
||||||
|
* @return true if the file content is supported, false otherwise
|
||||||
|
*/
|
||||||
|
abstract boolean isSupported(Content file, String detectedFormat);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines if the TextExtractor instance is enabled to read content.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
boolean isEnabled() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a {@link java.io.Reader} that will iterate over the text extracted
|
||||||
|
* from the {@link org.sleuthkit.datamodel.Content} passed into
|
||||||
|
* {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory}.
|
||||||
|
*
|
||||||
|
* @return {@link java.io.Reader} that contains the text of the underlying
|
||||||
|
* {@link org.sleuthkit.datamodel.Content}
|
||||||
|
*
|
||||||
|
* @throws
|
||||||
|
* org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException
|
||||||
|
*
|
||||||
|
* @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public abstract Reader getReader() throws ExtractionException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines how the extraction process will proceed given the settings
|
||||||
|
* stored in the context instance.
|
||||||
|
*
|
||||||
|
* @param context Instance containing file config classes
|
||||||
|
*/
|
||||||
|
void setExtractionSettings(Lookup context) {
|
||||||
|
//no-op by default
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Exception encountered during
|
||||||
|
* {@link org.sleuthkit.autopsy.textextractors.TextExtractor#getReader()}.
|
||||||
|
* This indicates that there was an internal parsing error that occurred
|
||||||
|
* during the reading of Content text.
|
||||||
|
*/
|
||||||
|
public class ExtractionException extends Exception {
|
||||||
|
|
||||||
|
public ExtractionException(String msg, Throwable ex) {
|
||||||
|
super(msg, ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ExtractionException(Throwable ex) {
|
||||||
|
super(ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ExtractionException(String msg) {
|
||||||
|
super(msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
157
Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
Executable file
157
Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
Executable file
@ -0,0 +1,157 @@
|
|||||||
|
/*
|
||||||
|
* Autopsy Forensic Browser
|
||||||
|
*
|
||||||
|
* Copyright 2018-2018 Basis Technology Corp.
|
||||||
|
* Contact: carrier <at> sleuthkit <dot> org
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.sleuthkit.autopsy.textextractors;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import org.openide.util.Lookup;
|
||||||
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
|
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||||
|
import org.sleuthkit.datamodel.Content;
|
||||||
|
import org.sleuthkit.datamodel.Report;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for creating
|
||||||
|
* {@link org.sleuthkit.autopsy.textextractors.TextExtractor}'s given a
|
||||||
|
* {@link org.sleuthkit.datamodel.Content} instance
|
||||||
|
*
|
||||||
|
* See {@link org.sleuthkit.autopsy.textextractors.extractionconfigs} for
|
||||||
|
* available {@link org.sleuthkit.autopsy.textextractors.TextExtractor}
|
||||||
|
* configuration options.
|
||||||
|
*
|
||||||
|
* @see org.openide.util.Lookup
|
||||||
|
*/
|
||||||
|
public class TextExtractorFactory {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Auto detects the correct
|
||||||
|
* {@link org.sleuthkit.autopsy.textextractors.TextExtractor} given the
|
||||||
|
* {@link org.sleuthkit.datamodel.Content}.
|
||||||
|
*
|
||||||
|
* See {@link org.sleuthkit.autopsy.textextractors.extractionconfigs} for
|
||||||
|
* available {@link org.sleuthkit.autopsy.textextractors.TextExtractor}
|
||||||
|
* configuration options.
|
||||||
|
*
|
||||||
|
* @param content Content source that will be read from
|
||||||
|
* @param context Contains extraction configurations for certain file types
|
||||||
|
*
|
||||||
|
* @return A TextExtractor that supports the given content. File text can be
|
||||||
|
* obtained from
|
||||||
|
* {@link org.sleuthkit.autopsy.textextractors.TextExtractor#getReader()}.
|
||||||
|
*
|
||||||
|
* @throws NoTextExtractorFound Encountered when there is no TextExtractor
|
||||||
|
* was found for the given content type. Use {@link
|
||||||
|
* TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content,
|
||||||
|
* org.openide.util.Lookup)}
|
||||||
|
*
|
||||||
|
* @see org.openide.util.Lookup
|
||||||
|
*/
|
||||||
|
public static TextExtractor getExtractor(Content content,
|
||||||
|
Lookup context) throws NoTextExtractorFound {
|
||||||
|
if (content instanceof AbstractFile) {
|
||||||
|
String mimeType = ((AbstractFile) content).getMIMEType();
|
||||||
|
List<TextExtractor> extractors = Arrays.asList(
|
||||||
|
new HtmlTextExtractor(content),
|
||||||
|
new SqliteTextExtractor(content),
|
||||||
|
new TikaTextExtractor(content));
|
||||||
|
for (TextExtractor extractor : extractors) {
|
||||||
|
extractor.setExtractionSettings(context);
|
||||||
|
if (extractor.isEnabled() && extractor.isSupported(content, mimeType)) {
|
||||||
|
return extractor;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (content instanceof BlackboardArtifact) {
|
||||||
|
TextExtractor artifactExtractor = new ArtifactTextExtractor((BlackboardArtifact) content);
|
||||||
|
artifactExtractor.setExtractionSettings(context);
|
||||||
|
return artifactExtractor;
|
||||||
|
} else if (content instanceof Report) {
|
||||||
|
TextExtractor reportExtractor = new TikaTextExtractor(content);
|
||||||
|
reportExtractor.setExtractionSettings(context);
|
||||||
|
return reportExtractor;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new NoTextExtractorFound(
|
||||||
|
String.format("Could not find a suitable extractor for "
|
||||||
|
+ "content with name [%s] and id=[%d]. Try using the default, "
|
||||||
|
+ "non content specific extractor as an alternative.",
|
||||||
|
content.getName(), content.getId())
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Auto detects the correct
|
||||||
|
* {@link org.sleuthkit.autopsy.textextractors.TextExtractor} given the
|
||||||
|
* {@link org.sleuthkit.datamodel.Content}.
|
||||||
|
*
|
||||||
|
* @param content Content instance that will be read from
|
||||||
|
*
|
||||||
|
* @return A TextExtractor that supports the given content. File text can be
|
||||||
|
* obtained from {@link TextExtractor#getReader()}.
|
||||||
|
*
|
||||||
|
* @throws NoTextExtractorFound Encountered when there is no TextExtractor
|
||||||
|
* was found for the given content type. Use {@link
|
||||||
|
* TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content,
|
||||||
|
* org.openide.util.Lookup)}
|
||||||
|
*/
|
||||||
|
public static TextExtractor getExtractor(Content content)
|
||||||
|
throws NoTextExtractorFound {
|
||||||
|
return getExtractor(content, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the default extractor that can be run on any content type. This
|
||||||
|
* extractor should be used as a backup in the event that no extractor was
|
||||||
|
* found using or {@link TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content, org.openide.util.Lookup)}
|
||||||
|
* {@link TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content)}.
|
||||||
|
*
|
||||||
|
* @param content Content source to read from
|
||||||
|
* @param context Contains extraction configurations for certain file types
|
||||||
|
*
|
||||||
|
* @return A DefaultExtractor instance. File text can be obtained from
|
||||||
|
* {@link TextExtractor#getReader()}.
|
||||||
|
*
|
||||||
|
* @see org.openide.util.Lookup
|
||||||
|
*/
|
||||||
|
public static TextExtractor getDefaultExtractor(Content content, Lookup context) {
|
||||||
|
TextExtractor stringsInstance = new StringsTextExtractor(content);
|
||||||
|
stringsInstance.setExtractionSettings(context);
|
||||||
|
return stringsInstance;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* System level exception for handling content types that have no specific
|
||||||
|
* strategy defined for extracting their text.
|
||||||
|
*
|
||||||
|
* @see
|
||||||
|
* org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content)
|
||||||
|
* @see
|
||||||
|
* org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content,
|
||||||
|
* org.openide.util.Lookup)}
|
||||||
|
*/
|
||||||
|
public static class NoTextExtractorFound extends Exception {
|
||||||
|
|
||||||
|
public NoTextExtractorFound(String msg) {
|
||||||
|
super(msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
public NoTextExtractorFound(Throwable ex) {
|
||||||
|
super(ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -16,15 +16,19 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.sleuthkit.autopsy.keywordsearch;
|
package org.sleuthkit.autopsy.textextractors;
|
||||||
|
|
||||||
|
import com.google.common.collect.ImmutableList;
|
||||||
import com.google.common.io.CharSource;
|
import com.google.common.io.CharSource;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.PushbackReader;
|
import java.io.PushbackReader;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
@ -33,6 +37,7 @@ import java.util.concurrent.TimeoutException;
|
|||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
import org.apache.commons.io.FilenameUtils;
|
||||||
import org.apache.tika.Tika;
|
import org.apache.tika.Tika;
|
||||||
import org.apache.tika.metadata.Metadata;
|
import org.apache.tika.metadata.Metadata;
|
||||||
import org.apache.tika.parser.AutoDetectParser;
|
import org.apache.tika.parser.AutoDetectParser;
|
||||||
@ -44,26 +49,78 @@ import org.apache.tika.parser.ocr.TesseractOCRConfig;
|
|||||||
import org.apache.tika.parser.pdf.PDFParserConfig;
|
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||||
import org.openide.util.NbBundle;
|
import org.openide.util.NbBundle;
|
||||||
import org.openide.modules.InstalledFileLocator;
|
import org.openide.modules.InstalledFileLocator;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.openide.util.Lookup;
|
||||||
import org.sleuthkit.autopsy.coreutils.PlatformUtil;
|
import org.sleuthkit.autopsy.coreutils.PlatformUtil;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
|
||||||
import org.sleuthkit.datamodel.Content;
|
import org.sleuthkit.datamodel.Content;
|
||||||
import org.sleuthkit.datamodel.ReadContentInputStream;
|
import org.sleuthkit.datamodel.ReadContentInputStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extracts text from Tika supported content. Protects against Tika
|
* Extracts text from Tika supported content. Protects against Tika parser hangs
|
||||||
* parser hangs (for unexpected/corrupt content) using a timeout mechanism.
|
* (for unexpected/corrupt content) using a timeout mechanism.
|
||||||
*/
|
*/
|
||||||
class TikaTextExtractor extends ContentTextExtractor {
|
final class TikaTextExtractor extends TextExtractor {
|
||||||
|
|
||||||
|
//Mimetype groups to assist extractor implementations in ignoring binary and
|
||||||
|
//archive files.
|
||||||
|
private static final List<String> BINARY_MIME_TYPES
|
||||||
|
= ImmutableList.of(
|
||||||
|
//ignore binary blob data, for which string extraction will be used
|
||||||
|
"application/octet-stream", //NON-NLS
|
||||||
|
"application/x-msdownload"); //NON-NLS
|
||||||
|
|
||||||
|
/**
|
||||||
|
* generally text extractors should ignore archives and let unpacking
|
||||||
|
* modules take care of them
|
||||||
|
*/
|
||||||
|
private static final List<String> ARCHIVE_MIME_TYPES
|
||||||
|
= ImmutableList.of(
|
||||||
|
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
|
||||||
|
"application/x-7z-compressed", //NON-NLS
|
||||||
|
"application/x-ace-compressed", //NON-NLS
|
||||||
|
"application/x-alz-compressed", //NON-NLS
|
||||||
|
"application/x-arj", //NON-NLS
|
||||||
|
"application/vnd.ms-cab-compressed", //NON-NLS
|
||||||
|
"application/x-cfs-compressed", //NON-NLS
|
||||||
|
"application/x-dgc-compressed", //NON-NLS
|
||||||
|
"application/x-apple-diskimage", //NON-NLS
|
||||||
|
"application/x-gca-compressed", //NON-NLS
|
||||||
|
"application/x-dar", //NON-NLS
|
||||||
|
"application/x-lzx", //NON-NLS
|
||||||
|
"application/x-lzh", //NON-NLS
|
||||||
|
"application/x-rar-compressed", //NON-NLS
|
||||||
|
"application/x-stuffit", //NON-NLS
|
||||||
|
"application/x-stuffitx", //NON-NLS
|
||||||
|
"application/x-gtar", //NON-NLS
|
||||||
|
"application/x-archive", //NON-NLS
|
||||||
|
"application/x-executable", //NON-NLS
|
||||||
|
"application/x-gzip", //NON-NLS
|
||||||
|
"application/zip", //NON-NLS
|
||||||
|
"application/x-zoo", //NON-NLS
|
||||||
|
"application/x-cpio", //NON-NLS
|
||||||
|
"application/x-shar", //NON-NLS
|
||||||
|
"application/x-tar", //NON-NLS
|
||||||
|
"application/x-bzip", //NON-NLS
|
||||||
|
"application/x-bzip2", //NON-NLS
|
||||||
|
"application/x-lzip", //NON-NLS
|
||||||
|
"application/x-lzma", //NON-NLS
|
||||||
|
"application/x-lzop", //NON-NLS
|
||||||
|
"application/x-z", //NON-NLS
|
||||||
|
"application/x-compress"); //NON-NLS
|
||||||
|
|
||||||
|
private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
|
||||||
|
|
||||||
static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
|
|
||||||
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
|
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
|
||||||
private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
|
private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
|
||||||
|
|
||||||
private final AutoDetectParser parser = new AutoDetectParser();
|
private final AutoDetectParser parser = new AutoDetectParser();
|
||||||
|
private final Content content;
|
||||||
|
|
||||||
|
private boolean tesseractOCREnabled;
|
||||||
private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
|
private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
|
||||||
private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
|
private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
|
||||||
private static final File TESSERACT_PATH = locateTesseractExecutable();
|
private static final File TESSERACT_PATH = locateTesseractExecutable();
|
||||||
|
private static final String LANGUAGE_PACKS = getLanguagePacks();
|
||||||
|
|
||||||
private static final List<String> TIKA_SUPPORTED_TYPES
|
private static final List<String> TIKA_SUPPORTED_TYPES
|
||||||
= new Tika().getParser().getSupportedTypes(new ParseContext())
|
= new Tika().getParser().getSupportedTypes(new ParseContext())
|
||||||
@ -71,13 +128,23 @@ class TikaTextExtractor extends ContentTextExtractor {
|
|||||||
.map(mt -> mt.getType() + "/" + mt.getSubtype())
|
.map(mt -> mt.getType() + "/" + mt.getSubtype())
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
@Override
|
public TikaTextExtractor(Content content) {
|
||||||
public void logWarning(final String msg, Exception ex) {
|
this.content = content;
|
||||||
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a reader that will iterate over the text extracted from Apache
|
||||||
|
* Tika.
|
||||||
|
*
|
||||||
|
* @param content Supported source content to extract
|
||||||
|
*
|
||||||
|
* @return Reader that contains Apache Tika extracted text
|
||||||
|
*
|
||||||
|
* @throws
|
||||||
|
* org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public Reader getReader(Content content) throws TextExtractorException {
|
public Reader getReader() throws ExtractionException {
|
||||||
ReadContentInputStream stream = new ReadContentInputStream(content);
|
ReadContentInputStream stream = new ReadContentInputStream(content);
|
||||||
|
|
||||||
Metadata metadata = new Metadata();
|
Metadata metadata = new Metadata();
|
||||||
@ -90,28 +157,28 @@ class TikaTextExtractor extends ContentTextExtractor {
|
|||||||
officeParserConfig.setUseSAXPptxExtractor(true);
|
officeParserConfig.setUseSAXPptxExtractor(true);
|
||||||
officeParserConfig.setUseSAXDocxExtractor(true);
|
officeParserConfig.setUseSAXDocxExtractor(true);
|
||||||
parseContext.set(OfficeParserConfig.class, officeParserConfig);
|
parseContext.set(OfficeParserConfig.class, officeParserConfig);
|
||||||
|
|
||||||
// configure OCR if it is enabled in KWS settings and installed on the machine
|
// configure OCR if it is enabled in KWS settings and installed on the machine
|
||||||
if (TESSERACT_PATH != null && KeywordSearchSettings.getOcrOption() && PlatformUtil.isWindowsOS() == true) {
|
if (TESSERACT_PATH != null && tesseractOCREnabled && PlatformUtil.isWindowsOS() == true) {
|
||||||
|
|
||||||
// configure PDFParser.
|
// configure PDFParser.
|
||||||
PDFParserConfig pdfConfig = new PDFParserConfig();
|
PDFParserConfig pdfConfig = new PDFParserConfig();
|
||||||
|
|
||||||
// Extracting the inline images and letting Tesseract run on each inline image.
|
// Extracting the inline images and letting Tesseract run on each inline image.
|
||||||
// https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
|
// https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
|
||||||
// https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
|
// https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
|
||||||
pdfConfig.setExtractInlineImages(true);
|
pdfConfig.setExtractInlineImages(true);
|
||||||
// Multiple pages within a PDF file might refer to the same underlying image.
|
// Multiple pages within a PDF file might refer to the same underlying image.
|
||||||
pdfConfig.setExtractUniqueInlineImagesOnly(true);
|
pdfConfig.setExtractUniqueInlineImagesOnly(true);
|
||||||
parseContext.set(PDFParserConfig.class, pdfConfig);
|
parseContext.set(PDFParserConfig.class, pdfConfig);
|
||||||
|
|
||||||
// Configure Tesseract parser to perform OCR
|
// Configure Tesseract parser to perform OCR
|
||||||
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
|
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
|
||||||
String tesseractFolder = TESSERACT_PATH.getParent();
|
String tesseractFolder = TESSERACT_PATH.getParent();
|
||||||
ocrConfig.setTesseractPath(tesseractFolder);
|
ocrConfig.setTesseractPath(tesseractFolder);
|
||||||
// Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata".
|
// Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata".
|
||||||
// If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them
|
// If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them
|
||||||
ocrConfig.setLanguage("eng");
|
ocrConfig.setLanguage(LANGUAGE_PACKS);
|
||||||
parseContext.set(TesseractOCRConfig.class, ocrConfig);
|
parseContext.set(TesseractOCRConfig.class, ocrConfig);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -124,7 +191,7 @@ class TikaTextExtractor extends ContentTextExtractor {
|
|||||||
PushbackReader pushbackReader = new PushbackReader(tikaReader);
|
PushbackReader pushbackReader = new PushbackReader(tikaReader);
|
||||||
int read = pushbackReader.read();
|
int read = pushbackReader.read();
|
||||||
if (read == -1) {
|
if (read == -1) {
|
||||||
throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + content);
|
throw new ExtractionException("Unable to extract text: Tika returned empty reader for " + content);
|
||||||
}
|
}
|
||||||
pushbackReader.unread(read);
|
pushbackReader.unread(read);
|
||||||
|
|
||||||
@ -133,15 +200,13 @@ class TikaTextExtractor extends ContentTextExtractor {
|
|||||||
return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
|
return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
|
||||||
} catch (TimeoutException te) {
|
} catch (TimeoutException te) {
|
||||||
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
|
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
|
||||||
logWarning(msg, te);
|
throw new ExtractionException(msg, te);
|
||||||
throw new TextExtractorException(msg, te);
|
} catch (ExtractionException ex) {
|
||||||
} catch (TextExtractorException ex) {
|
|
||||||
throw ex;
|
throw ex;
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
|
tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
|
||||||
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
|
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
|
||||||
logWarning(msg, ex);
|
throw new ExtractionException(msg, ex);
|
||||||
throw new TextExtractorException(msg, ex);
|
|
||||||
} finally {
|
} finally {
|
||||||
future.cancel(true);
|
future.cancel(true);
|
||||||
}
|
}
|
||||||
@ -187,16 +252,19 @@ class TikaTextExtractor extends ContentTextExtractor {
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
/**
|
||||||
public boolean isContentTypeSpecific() {
|
* Determines if Tika is supported for this content type and mimetype.
|
||||||
return true;
|
*
|
||||||
}
|
* @param content Source content to read
|
||||||
|
* @param detectedFormat Mimetype of content
|
||||||
|
*
|
||||||
|
* @return Flag indicating support for reading content type
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isSupported(Content content, String detectedFormat) {
|
public boolean isSupported(Content content, String detectedFormat) {
|
||||||
if (detectedFormat == null
|
if (detectedFormat == null
|
||||||
|| ContentTextExtractor.BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
|
|| BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
|
||||||
|| ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
|
|| ARCHIVE_MIME_TYPES.contains(detectedFormat)
|
||||||
|| (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|
|| (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|
||||||
|| detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
|
|| detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
|
||||||
) {
|
) {
|
||||||
@ -205,9 +273,34 @@ class TikaTextExtractor extends ContentTextExtractor {
|
|||||||
return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
|
return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
/**
|
||||||
public boolean isDisabled() {
|
* Retrieves all of the installed language packs from their designated
|
||||||
return false;
|
* directory location to be used to configure Tesseract OCR.
|
||||||
|
*
|
||||||
|
* @return String of all language packs available for Tesseract to use
|
||||||
|
*/
|
||||||
|
private static String getLanguagePacks() {
|
||||||
|
File languagePackRootDir = new File(TESSERACT_PATH.getParent(), "tessdata");
|
||||||
|
//Acceptable extensions for Tesseract-OCR version 3.05 language packs.
|
||||||
|
//All extensions other than traineddata are associated with cube files that
|
||||||
|
//have been made obsolete since version 4.0.
|
||||||
|
List<String> acceptableExtensions = Arrays.asList("traineddata", "params",
|
||||||
|
"lm", "fold", "bigrams", "nn", "word-freq", "size",
|
||||||
|
"user-patterns", "user-words");
|
||||||
|
//Pull out only unique languagePacks
|
||||||
|
HashSet<String> languagePacks = new HashSet<>();
|
||||||
|
if (languagePackRootDir.exists()) {
|
||||||
|
for (File languagePack : languagePackRootDir.listFiles()) {
|
||||||
|
if (languagePack.isDirectory() || !acceptableExtensions.contains(
|
||||||
|
FilenameUtils.getExtension(languagePack.getName()))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
String threeLetterPackageName = languagePack.getName().substring(0, 3);
|
||||||
|
//Ignore the eng language pack if accidentally added
|
||||||
|
languagePacks.add(threeLetterPackageName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return String.join("+", languagePacks);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -233,6 +326,28 @@ class TikaTextExtractor extends ContentTextExtractor {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines how the extraction process will proceed given the settings
|
||||||
|
* stored in this context instance.
|
||||||
|
*
|
||||||
|
* See the ImageFileExtractionConfig class in the extractionconfigs package
|
||||||
|
* for available settings.
|
||||||
|
*
|
||||||
|
* @param context Instance containing config classes
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void setExtractionSettings(Lookup context) {
|
||||||
|
if (context != null) {
|
||||||
|
ImageFileExtractionConfig configInstance = context.lookup(ImageFileExtractionConfig.class);
|
||||||
|
if (configInstance == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (Objects.nonNull(configInstance.getOCREnabled())) {
|
||||||
|
this.tesseractOCREnabled = configInstance.getOCREnabled();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An implementation of CharSource that just wraps an existing reader and
|
* An implementation of CharSource that just wraps an existing reader and
|
||||||
* returns it in openStream().
|
* returns it in openStream().
|
@ -0,0 +1,100 @@
|
|||||||
|
/*
|
||||||
|
* Autopsy Forensic Browser
|
||||||
|
*
|
||||||
|
* Copyright 2018-2018 Basis Technology Corp.
|
||||||
|
* Contact: carrier <at> sleuthkit <dot> org
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.sleuthkit.autopsy.textextractors.extractionconfigs;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allows for configuration of the
|
||||||
|
* {@link org.sleuthkit.autopsy.textextractors.TextExtractor} obtained from
|
||||||
|
* {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content, org.openide.util.Lookup)}.
|
||||||
|
*
|
||||||
|
* The default extractor will read strings from the Content instance. This class
|
||||||
|
* allows for the configuration of the encoding language script to use during
|
||||||
|
* extraction.
|
||||||
|
*
|
||||||
|
* @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
|
||||||
|
* @see
|
||||||
|
* org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT
|
||||||
|
* @see org.openide.util.Lookup
|
||||||
|
*/
|
||||||
|
public class DefaultExtractionConfig {
|
||||||
|
|
||||||
|
private Boolean extractUTF8;
|
||||||
|
private Boolean extractUTF16;
|
||||||
|
private List<SCRIPT> extractScripts;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enables UTF-8 encoding to be used during extraction.
|
||||||
|
*
|
||||||
|
* @param enabled Flag indicating if UTF-8 should be turned on
|
||||||
|
*/
|
||||||
|
public void setExtractUTF8(boolean enabled) {
|
||||||
|
this.extractUTF8 = enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enables UTF-16 encoding to be used during extraction.
|
||||||
|
*
|
||||||
|
* @param enabled Flag indicating if UTF-16 should be turned on
|
||||||
|
*/
|
||||||
|
public void setExtractUTF16(boolean enabled) {
|
||||||
|
this.extractUTF16 = enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns whether extracting with UTF-8 encoding should be done.
|
||||||
|
*
|
||||||
|
* @return Flag indicating if UTF-8 has been turned on/off
|
||||||
|
*/
|
||||||
|
public Boolean getExtractUTF8() {
|
||||||
|
return extractUTF8;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return whether extracting with UTF-16 encoding should be done.
|
||||||
|
*
|
||||||
|
* @return Flag indicating if UTF-16 has been turned on/off
|
||||||
|
*/
|
||||||
|
public Boolean getExtractUTF16() {
|
||||||
|
return extractUTF16;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the type of extraction scripts that will be used during this
|
||||||
|
* extraction. See
|
||||||
|
* {@link org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT}
|
||||||
|
* for more information about available scripts.
|
||||||
|
*
|
||||||
|
* @param scripts Desired set of scripts to be used during extraction
|
||||||
|
*/
|
||||||
|
public void setExtractScripts(List<SCRIPT> scripts) {
|
||||||
|
this.extractScripts = scripts;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the desired set of scripts to be used during extraction.
|
||||||
|
*
|
||||||
|
* @return Set of extraction scripts to be used
|
||||||
|
*/
|
||||||
|
public List<SCRIPT> getExtractScripts() {
|
||||||
|
return this.extractScripts;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,54 @@
|
|||||||
|
/*
|
||||||
|
* Autopsy Forensic Browser
|
||||||
|
*
|
||||||
|
* Copyright 2018-2018 Basis Technology Corp.
|
||||||
|
* Contact: carrier <at> sleuthkit <dot> org
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.sleuthkit.autopsy.textextractors.extractionconfigs;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allows for configuration of OCR on image files.
|
||||||
|
* {@link org.sleuthkit.autopsy.textextractors.TextExtractor}'s that use
|
||||||
|
* ImageFileExtractionConfig can be obtained through
|
||||||
|
* {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getExtractor(org.sleuthkit.datamodel.Content)}
|
||||||
|
* or
|
||||||
|
* {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory#getDefaultExtractor(org.sleuthkit.datamodel.Content, org.openide.util.Lookup)}.
|
||||||
|
*
|
||||||
|
* @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
|
||||||
|
* @see org.openide.util.Lookup
|
||||||
|
*/
|
||||||
|
public class ImageFileExtractionConfig {

    // Primitive on purpose: the flag is false ("off") until a caller turns it
    // on. The previous boxed Boolean started out null, which made
    // getOCREnabled() throw a NullPointerException on unboxing whenever it
    // was called before setOCREnabled().
    private boolean ocrEnabled;

    /**
     * Enables OCR to be run on the text extractor responsible for handling
     * image files.
     *
     * @param enabled Flag indicating if OCR is enabled.
     */
    public void setOCREnabled(boolean enabled) {
        this.ocrEnabled = enabled;
    }

    /**
     * Gets the OCR flag that has been set. By default this flag is turned
     * off, so false is returned when setOCREnabled has never been called.
     *
     * @return Flag indicating if OCR is enabled.
     */
    public boolean getOCREnabled() {
        return this.ocrEnabled;
    }
}
|
@ -19,7 +19,7 @@
|
|||||||
package org.sleuthkit.autopsy.texttranslation;
|
package org.sleuthkit.autopsy.texttranslation;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides a system exception for the Text Translation errors
|
* Provides a system exception for Text Translation errors
|
||||||
*/
|
*/
|
||||||
public class TranslationException extends Exception {
|
public class TranslationException extends Exception {
|
||||||
|
|
||||||
|
@ -1,150 +0,0 @@
|
|||||||
/*
|
|
||||||
* Autopsy Forensic Browser
|
|
||||||
*
|
|
||||||
* Copyright 2011-2018 Basis Technology Corp.
|
|
||||||
* Contact: carrier <at> sleuthkit <dot> org
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.sleuthkit.autopsy.keywordsearch;
|
|
||||||
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.Reader;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.util.logging.Level;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.sleuthkit.autopsy.casemodule.Case;
|
|
||||||
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
|
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
|
||||||
import org.sleuthkit.autopsy.datamodel.ContentUtils;
|
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
|
||||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
|
||||||
import org.sleuthkit.datamodel.BlackboardAttribute;
|
|
||||||
import org.sleuthkit.datamodel.Content;
|
|
||||||
import org.sleuthkit.datamodel.SleuthkitCase;
|
|
||||||
import org.sleuthkit.datamodel.TskCoreException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Extracts text from artifacts by concatenating the values of all of the
|
|
||||||
* artifact's attributes.
|
|
||||||
*/
|
|
||||||
class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
|
|
||||||
|
|
||||||
static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the Content that is the data source for the given artifact. //JMTODO:
|
|
||||||
* is there a prexisting method to do this?
|
|
||||||
*
|
|
||||||
* @param artifact
|
|
||||||
*
|
|
||||||
* @return The data source for the given artifact as a Content object, or
|
|
||||||
* null if it could not be found.
|
|
||||||
*
|
|
||||||
* @throws TskCoreException if there is a problem accessing the case db.
|
|
||||||
*/
|
|
||||||
static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
|
|
||||||
|
|
||||||
Case currentCase;
|
|
||||||
try {
|
|
||||||
currentCase = Case.getCurrentCaseThrows();
|
|
||||||
} catch (NoCurrentCaseException ignore) {
|
|
||||||
// thorown by Case.getCurrentOpenCase() if currentCase is null
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
|
|
||||||
if (sleuthkitCase == null) {
|
|
||||||
return null;
|
|
||||||
|
|
||||||
}
|
|
||||||
Content dataSource;
|
|
||||||
AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
|
|
||||||
if (abstractFile != null) {
|
|
||||||
dataSource = abstractFile.getDataSource();
|
|
||||||
} else {
|
|
||||||
dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (dataSource == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
return dataSource;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean isDisabled() {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void logWarning(final String msg, Exception ex) {
|
|
||||||
logger.log(Level.WARNING, msg, ex); //NON-NLS }
|
|
||||||
}
|
|
||||||
|
|
||||||
private InputStream getInputStream(BlackboardArtifact artifact) throws TextExtractorException {
|
|
||||||
// Concatenate the string values of all attributes into a single
|
|
||||||
// "content" string to be indexed.
|
|
||||||
StringBuilder artifactContents = new StringBuilder();
|
|
||||||
|
|
||||||
Content dataSource = null;
|
|
||||||
try {
|
|
||||||
dataSource = getDataSource(artifact);
|
|
||||||
} catch (TskCoreException tskCoreException) {
|
|
||||||
throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
|
|
||||||
}
|
|
||||||
if (dataSource == null) {
|
|
||||||
throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
for (BlackboardAttribute attribute : artifact.getAttributes()) {
|
|
||||||
artifactContents.append(attribute.getAttributeType().getDisplayName());
|
|
||||||
artifactContents.append(" : ");
|
|
||||||
// We have also discussed modifying BlackboardAttribute.getDisplayString()
|
|
||||||
// to magically format datetime attributes but that is complicated by
|
|
||||||
// the fact that BlackboardAttribute exists in Sleuthkit data model
|
|
||||||
// while the utility to determine the timezone to use is in ContentUtils
|
|
||||||
// in the Autopsy datamodel.
|
|
||||||
switch (attribute.getValueType()) {
|
|
||||||
case DATETIME:
|
|
||||||
artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
artifactContents.append(attribute.getDisplayString());
|
|
||||||
}
|
|
||||||
artifactContents.append(System.lineSeparator());
|
|
||||||
}
|
|
||||||
} catch (TskCoreException tskCoreException) {
|
|
||||||
throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
|
|
||||||
}
|
|
||||||
|
|
||||||
return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Reader getReader(BlackboardArtifact source) throws TextExtractorException {
|
|
||||||
return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public long getID(BlackboardArtifact source) {
|
|
||||||
return source.getArtifactID();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getName(BlackboardArtifact source) {
|
|
||||||
return source.getDisplayName() + "_" + source.getArtifactID();
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,110 +0,0 @@
|
|||||||
/*
|
|
||||||
* Autopsy Forensic Browser
|
|
||||||
*
|
|
||||||
* Copyright 2011-2018 Basis Technology Corp.
|
|
||||||
* Contact: carrier <at> sleuthkit <dot> org
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.sleuthkit.autopsy.keywordsearch;
|
|
||||||
|
|
||||||
import java.io.Reader;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
import org.sleuthkit.datamodel.Content;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Common methods for utilities that extract text and content and divide into
|
|
||||||
* chunks
|
|
||||||
*/
|
|
||||||
abstract class ContentTextExtractor implements TextExtractor<Content> {
|
|
||||||
|
|
||||||
|
|
||||||
static final List<String> BINARY_MIME_TYPES
|
|
||||||
= Arrays.asList(
|
|
||||||
//ignore binary blob data, for which string extraction will be used
|
|
||||||
"application/octet-stream", //NON-NLS
|
|
||||||
"application/x-msdownload"); //NON-NLS
|
|
||||||
|
|
||||||
/** generally text extractors should ignore archives and let unpacking
|
|
||||||
* modules take care of them */
|
|
||||||
static final List<String> ARCHIVE_MIME_TYPES
|
|
||||||
= Arrays.asList(
|
|
||||||
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
|
|
||||||
"application/x-7z-compressed", //NON-NLS
|
|
||||||
"application/x-ace-compressed", //NON-NLS
|
|
||||||
"application/x-alz-compressed", //NON-NLS
|
|
||||||
"application/x-arj", //NON-NLS
|
|
||||||
"application/vnd.ms-cab-compressed", //NON-NLS
|
|
||||||
"application/x-cfs-compressed", //NON-NLS
|
|
||||||
"application/x-dgc-compressed", //NON-NLS
|
|
||||||
"application/x-apple-diskimage", //NON-NLS
|
|
||||||
"application/x-gca-compressed", //NON-NLS
|
|
||||||
"application/x-dar", //NON-NLS
|
|
||||||
"application/x-lzx", //NON-NLS
|
|
||||||
"application/x-lzh", //NON-NLS
|
|
||||||
"application/x-rar-compressed", //NON-NLS
|
|
||||||
"application/x-stuffit", //NON-NLS
|
|
||||||
"application/x-stuffitx", //NON-NLS
|
|
||||||
"application/x-gtar", //NON-NLS
|
|
||||||
"application/x-archive", //NON-NLS
|
|
||||||
"application/x-executable", //NON-NLS
|
|
||||||
"application/x-gzip", //NON-NLS
|
|
||||||
"application/zip", //NON-NLS
|
|
||||||
"application/x-zoo", //NON-NLS
|
|
||||||
"application/x-cpio", //NON-NLS
|
|
||||||
"application/x-shar", //NON-NLS
|
|
||||||
"application/x-tar", //NON-NLS
|
|
||||||
"application/x-bzip", //NON-NLS
|
|
||||||
"application/x-bzip2", //NON-NLS
|
|
||||||
"application/x-lzip", //NON-NLS
|
|
||||||
"application/x-lzma", //NON-NLS
|
|
||||||
"application/x-lzop", //NON-NLS
|
|
||||||
"application/x-z", //NON-NLS
|
|
||||||
"application/x-compress"); //NON-NLS
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines if the extractor works only for specified types is
|
|
||||||
* supportedTypes() or whether is a generic content extractor (such as
|
|
||||||
* string extractor)
|
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
abstract boolean isContentTypeSpecific();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines if the file content is supported by the extractor if
|
|
||||||
* isContentTypeSpecific() returns true.
|
|
||||||
*
|
|
||||||
* @param content to test if its content should be supported
|
|
||||||
* @param detectedFormat mime-type with detected format (such as text/plain)
|
|
||||||
* or null if not detected
|
|
||||||
*
|
|
||||||
* @return true if the file content is supported, false otherwise
|
|
||||||
*/
|
|
||||||
abstract boolean isSupported(Content file, String detectedFormat);
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public abstract Reader getReader(Content source) throws TextExtractorException;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public long getID(Content source) {
|
|
||||||
return source.getId();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getName(Content source) {
|
|
||||||
return source.getName();
|
|
||||||
}
|
|
||||||
}
|
|
@ -19,6 +19,7 @@
|
|||||||
package org.sleuthkit.autopsy.keywordsearch;
|
package org.sleuthkit.autopsy.keywordsearch;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
|
import java.io.Reader;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
@ -58,7 +59,6 @@ class Ingester {
|
|||||||
private final Server solrServer = KeywordSearch.getServer();
|
private final Server solrServer = KeywordSearch.getServer();
|
||||||
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
|
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
|
||||||
private static Ingester instance;
|
private static Ingester instance;
|
||||||
private static final int SINGLE_READ_CHARS = 512;
|
|
||||||
|
|
||||||
private Ingester() {
|
private Ingester() {
|
||||||
}
|
}
|
||||||
@ -106,8 +106,8 @@ class Ingester {
|
|||||||
* @throws IngesterException if there was an error processing a specific
|
* @throws IngesterException if there was an error processing a specific
|
||||||
* artifact, but the Solr server is probably fine.
|
* artifact, but the Solr server is probably fine.
|
||||||
*/
|
*/
|
||||||
void indexMetaDataOnly(BlackboardArtifact artifact) throws IngesterException {
|
void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
|
||||||
indexChunk("", new ArtifactTextExtractor().getName(artifact), getContentFields(artifact));
|
indexChunk("", sourceName, getContentFields(artifact));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -142,23 +142,12 @@ class Ingester {
|
|||||||
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
|
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
|
||||||
*/
|
*/
|
||||||
// TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
|
// TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
|
||||||
< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
|
< T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
|
||||||
final long sourceID = extractor.getID(source);
|
|
||||||
final String sourceName = extractor.getName(source);
|
|
||||||
|
|
||||||
int numChunks = 0; //unknown until chunking is done
|
int numChunks = 0; //unknown until chunking is done
|
||||||
|
|
||||||
if (extractor.isDisabled()) {
|
|
||||||
/*
|
|
||||||
* some Extractors, notable the strings extractor, have options
|
|
||||||
* which can be configured such that no extraction should be done
|
|
||||||
*/
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
Map<String, String> fields = getContentFields(source);
|
Map<String, String> fields = getContentFields(source);
|
||||||
//Get a reader for the content of the given source
|
//Get a reader for the content of the given source
|
||||||
try (BufferedReader reader = new BufferedReader(extractor.getReader(source));) {
|
try (BufferedReader reader = new BufferedReader(sourceReader)) {
|
||||||
Chunker chunker = new Chunker(reader);
|
Chunker chunker = new Chunker(reader);
|
||||||
for (Chunk chunk : chunker) {
|
for (Chunk chunk : chunker) {
|
||||||
if (context != null && context.fileIngestIsCancelled()) {
|
if (context != null && context.fileIngestIsCancelled()) {
|
||||||
@ -173,18 +162,18 @@ class Ingester {
|
|||||||
indexChunk(chunk.toString(), sourceName, fields);
|
indexChunk(chunk.toString(), sourceName, fields);
|
||||||
numChunks++;
|
numChunks++;
|
||||||
} catch (Ingester.IngesterException ingEx) {
|
} catch (Ingester.IngesterException ingEx) {
|
||||||
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
|
logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
|
||||||
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
|
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
|
||||||
|
|
||||||
throw ingEx; //need to rethrow to signal error and move on
|
throw ingEx; //need to rethrow to signal error and move on
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (chunker.hasException()) {
|
if (chunker.hasException()) {
|
||||||
extractor.logWarning("Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
|
logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
||||||
return false;
|
return false;
|
||||||
} finally {
|
} finally {
|
||||||
if (context != null && context.fileIngestIsCancelled()) {
|
if (context != null && context.fileIngestIsCancelled()) {
|
||||||
@ -371,7 +360,7 @@ class Ingester {
|
|||||||
Map<String, String> params = new HashMap<>();
|
Map<String, String> params = new HashMap<>();
|
||||||
params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
|
params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
|
||||||
try {
|
try {
|
||||||
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(ArtifactTextExtractor.getDataSource(artifact).getId()));
|
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(artifact.getDataSource().getId()));
|
||||||
} catch (TskCoreException ex) {
|
} catch (TskCoreException ex) {
|
||||||
logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
|
logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
|
||||||
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
|
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
|
||||||
|
@ -35,6 +35,7 @@ import org.sleuthkit.autopsy.coreutils.PlatformUtil;
|
|||||||
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
||||||
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
||||||
import org.sleuthkit.autopsy.ingest.IngestManager;
|
import org.sleuthkit.autopsy.ingest.IngestManager;
|
||||||
|
import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.StringsExtractOptions;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Child panel of the global settings panel (Languages tab).
|
* Child panel of the global settings panel (Languages tab).
|
||||||
@ -45,7 +46,7 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
|
|||||||
private final Map<String, StringExtract.StringExtractUnicodeTable.SCRIPT> scripts = new HashMap<>();
|
private final Map<String, StringExtract.StringExtractUnicodeTable.SCRIPT> scripts = new HashMap<>();
|
||||||
private ActionListener updateLanguagesAction;
|
private ActionListener updateLanguagesAction;
|
||||||
private List<SCRIPT> toUpdate;
|
private List<SCRIPT> toUpdate;
|
||||||
|
|
||||||
KeywordSearchGlobalLanguageSettingsPanel() {
|
KeywordSearchGlobalLanguageSettingsPanel() {
|
||||||
initComponents();
|
initComponents();
|
||||||
customizeComponents();
|
customizeComponents();
|
||||||
@ -125,12 +126,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
|
|||||||
|
|
||||||
private void reloadScriptsCheckBoxes() {
|
private void reloadScriptsCheckBoxes() {
|
||||||
boolean utf16
|
boolean utf16
|
||||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
|
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString()));
|
||||||
|
|
||||||
enableUTF16Checkbox.setSelected(utf16);
|
enableUTF16Checkbox.setSelected(utf16);
|
||||||
|
|
||||||
boolean utf8
|
boolean utf8
|
||||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
|
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString()));
|
||||||
enableUTF8Checkbox.setSelected(utf8);
|
enableUTF8Checkbox.setSelected(utf8);
|
||||||
|
|
||||||
boolean ocr = KeywordSearchSettings.getOcrOption();
|
boolean ocr = KeywordSearchSettings.getOcrOption();
|
||||||
@ -152,12 +153,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
|
|||||||
reloadScriptsCheckBoxes();
|
reloadScriptsCheckBoxes();
|
||||||
|
|
||||||
boolean utf16
|
boolean utf16
|
||||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
|
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString()));
|
||||||
|
|
||||||
enableUTF16Checkbox.setSelected(utf16);
|
enableUTF16Checkbox.setSelected(utf16);
|
||||||
|
|
||||||
boolean utf8
|
boolean utf8
|
||||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
|
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString()));
|
||||||
enableUTF8Checkbox.setSelected(utf8);
|
enableUTF8Checkbox.setSelected(utf8);
|
||||||
final boolean extractEnabled = utf16 || utf8;
|
final boolean extractEnabled = utf16 || utf8;
|
||||||
|
|
||||||
@ -316,9 +317,9 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void store() {
|
public void store() {
|
||||||
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
|
KeywordSearchSettings.setStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString(),
|
||||||
Boolean.toString(enableUTF8Checkbox.isSelected()));
|
Boolean.toString(enableUTF8Checkbox.isSelected()));
|
||||||
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
|
KeywordSearchSettings.setStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString(),
|
||||||
Boolean.toString(enableUTF16Checkbox.isSelected()));
|
Boolean.toString(enableUTF16Checkbox.isSelected()));
|
||||||
KeywordSearchSettings.setOcrOption(enableOcrCheckbox.isSelected());
|
KeywordSearchSettings.setOcrOption(enableOcrCheckbox.isSelected());
|
||||||
|
|
||||||
|
@ -18,14 +18,18 @@
|
|||||||
*/
|
*/
|
||||||
package org.sleuthkit.autopsy.keywordsearch;
|
package org.sleuthkit.autopsy.keywordsearch;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import com.google.common.collect.ImmutableList;
|
||||||
|
import java.io.Reader;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
|
import org.openide.util.Exceptions;
|
||||||
|
import org.openide.util.Lookup;
|
||||||
import org.openide.util.NbBundle;
|
import org.openide.util.NbBundle;
|
||||||
import org.openide.util.NbBundle.Messages;
|
import org.openide.util.NbBundle.Messages;
|
||||||
|
import org.openide.util.lookup.Lookups;
|
||||||
import org.sleuthkit.autopsy.casemodule.Case;
|
import org.sleuthkit.autopsy.casemodule.Case;
|
||||||
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
|
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
@ -37,9 +41,15 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
|
|||||||
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
|
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
|
||||||
import org.sleuthkit.autopsy.ingest.IngestServices;
|
import org.sleuthkit.autopsy.ingest.IngestServices;
|
||||||
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
||||||
|
import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
|
||||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
||||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
||||||
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
|
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.TextExtractor;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
import org.sleuthkit.datamodel.TskData;
|
import org.sleuthkit.datamodel.TskData;
|
||||||
import org.sleuthkit.datamodel.TskData.FileKnown;
|
import org.sleuthkit.datamodel.TskData.FileKnown;
|
||||||
@ -61,6 +71,52 @@ import org.sleuthkit.datamodel.TskData.FileKnown;
|
|||||||
"CannotRunFileTypeDetection=Unable to run file type detection."
|
"CannotRunFileTypeDetection=Unable to run file type detection."
|
||||||
})
|
})
|
||||||
public final class KeywordSearchIngestModule implements FileIngestModule {
|
public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||||
|
|
||||||
|
/** generally text extractors should ignore archives and let unpacking
|
||||||
|
* modules take care of them */
|
||||||
|
public static final List<String> ARCHIVE_MIME_TYPES
|
||||||
|
= ImmutableList.of(
|
||||||
|
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
|
||||||
|
"application/x-7z-compressed", //NON-NLS
|
||||||
|
"application/x-ace-compressed", //NON-NLS
|
||||||
|
"application/x-alz-compressed", //NON-NLS
|
||||||
|
"application/x-arj", //NON-NLS
|
||||||
|
"application/vnd.ms-cab-compressed", //NON-NLS
|
||||||
|
"application/x-cfs-compressed", //NON-NLS
|
||||||
|
"application/x-dgc-compressed", //NON-NLS
|
||||||
|
"application/x-apple-diskimage", //NON-NLS
|
||||||
|
"application/x-gca-compressed", //NON-NLS
|
||||||
|
"application/x-dar", //NON-NLS
|
||||||
|
"application/x-lzx", //NON-NLS
|
||||||
|
"application/x-lzh", //NON-NLS
|
||||||
|
"application/x-rar-compressed", //NON-NLS
|
||||||
|
"application/x-stuffit", //NON-NLS
|
||||||
|
"application/x-stuffitx", //NON-NLS
|
||||||
|
"application/x-gtar", //NON-NLS
|
||||||
|
"application/x-archive", //NON-NLS
|
||||||
|
"application/x-executable", //NON-NLS
|
||||||
|
"application/x-gzip", //NON-NLS
|
||||||
|
"application/zip", //NON-NLS
|
||||||
|
"application/x-zoo", //NON-NLS
|
||||||
|
"application/x-cpio", //NON-NLS
|
||||||
|
"application/x-shar", //NON-NLS
|
||||||
|
"application/x-tar", //NON-NLS
|
||||||
|
"application/x-bzip", //NON-NLS
|
||||||
|
"application/x-bzip2", //NON-NLS
|
||||||
|
"application/x-lzip", //NON-NLS
|
||||||
|
"application/x-lzma", //NON-NLS
|
||||||
|
"application/x-lzop", //NON-NLS
|
||||||
|
"application/x-z", //NON-NLS
|
||||||
|
"application/x-compress"); //NON-NLS
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Options for the strings text extractor
|
||||||
|
*/
|
||||||
|
enum StringsExtractOptions {
|
||||||
|
EXTRACT_UTF16, ///< extract UTF16 text, true/false
|
||||||
|
EXTRACT_UTF8, ///< extract UTF8 text, true/false
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
enum UpdateFrequency {
|
enum UpdateFrequency {
|
||||||
|
|
||||||
@ -89,13 +145,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
//accessed read-only by searcher thread
|
//accessed read-only by searcher thread
|
||||||
|
|
||||||
private boolean startedSearching = false;
|
private boolean startedSearching = false;
|
||||||
private List<ContentTextExtractor> textExtractors;
|
private Lookup stringsExtractionContext;
|
||||||
private StringsTextExtractor stringExtractor;
|
|
||||||
private TextFileExtractor txtFileExtractor;
|
|
||||||
private final KeywordSearchJobSettings settings;
|
private final KeywordSearchJobSettings settings;
|
||||||
private boolean initialized = false;
|
private boolean initialized = false;
|
||||||
private long jobId;
|
private long jobId;
|
||||||
private long dataSourceId;
|
|
||||||
private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
|
private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
|
||||||
private int instanceNum = 0;
|
private int instanceNum = 0;
|
||||||
private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
|
private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
|
||||||
@ -152,7 +205,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
public void startUp(IngestJobContext context) throws IngestModuleException {
|
public void startUp(IngestJobContext context) throws IngestModuleException {
|
||||||
initialized = false;
|
initialized = false;
|
||||||
jobId = context.getJobId();
|
jobId = context.getJobId();
|
||||||
dataSourceId = context.getDataSource().getId();
|
|
||||||
|
|
||||||
Server server = KeywordSearch.getServer();
|
Server server = KeywordSearch.getServer();
|
||||||
if (server.coreIsOpen() == false) {
|
if (server.coreIsOpen() == false) {
|
||||||
@ -238,22 +290,15 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//initialize extractors
|
DefaultExtractionConfig stringsConfig = new DefaultExtractionConfig();
|
||||||
stringExtractor = new StringsTextExtractor();
|
Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
|
||||||
stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
|
stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
|
||||||
stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
|
stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
|
||||||
|
stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts());
|
||||||
txtFileExtractor = new TextFileExtractor();
|
|
||||||
|
stringsExtractionContext = Lookups.fixed(stringsConfig);
|
||||||
textExtractors = new ArrayList<>();
|
|
||||||
//order matters, more specific extractors first
|
|
||||||
textExtractors.add(new HtmlTextExtractor());
|
|
||||||
//Add sqlite text extractor to be default for sqlite files, since tika struggles
|
|
||||||
//with them. See SqliteTextExtractor class for specifics
|
|
||||||
textExtractors.add(new SqliteTextExtractor());
|
|
||||||
textExtractors.add(new TikaTextExtractor());
|
|
||||||
|
|
||||||
indexer = new Indexer();
|
indexer = new Indexer();
|
||||||
initialized = true;
|
initialized = true;
|
||||||
}
|
}
|
||||||
@ -345,10 +390,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
* Common cleanup code when module stops or final searcher completes
|
* Common cleanup code when module stops or final searcher completes
|
||||||
*/
|
*/
|
||||||
private void cleanup() {
|
private void cleanup() {
|
||||||
textExtractors.clear();
|
stringsExtractionContext = null;
|
||||||
textExtractors = null;
|
|
||||||
stringExtractor = null;
|
|
||||||
txtFileExtractor = null;
|
|
||||||
initialized = false;
|
initialized = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -436,24 +478,18 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
* @throws IngesterException exception thrown if indexing failed
|
* @throws IngesterException exception thrown if indexing failed
|
||||||
*/
|
*/
|
||||||
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
|
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
|
||||||
ContentTextExtractor extractor = null;
|
ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig();
|
||||||
|
imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
|
||||||
//go over available text extractors in order, and pick the first one (most specific one)
|
Lookup extractionContext = Lookups.fixed(imageConfig);
|
||||||
for (ContentTextExtractor fe : textExtractors) {
|
|
||||||
if (fe.isSupported(aFile, detectedFormat)) {
|
try {
|
||||||
extractor = fe;
|
Reader specializedReader = TextExtractorFactory.getExtractor(aFile,extractionContext).getReader();
|
||||||
break;
|
//divide into chunks and index
|
||||||
}
|
return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
|
||||||
}
|
} catch (TextExtractorFactory.NoTextExtractorFound | ExtractionException ex) {
|
||||||
|
//No text extractor found... run the default instead
|
||||||
if (extractor == null) {
|
|
||||||
// No text extractor found.
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
//logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
|
|
||||||
//divide into chunks and index
|
|
||||||
return Ingester.getDefault().indexText(extractor, aFile, context);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -469,7 +505,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
if (context.fileIngestIsCancelled()) {
|
if (context.fileIngestIsCancelled()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) {
|
Reader stringsReader = TextExtractorFactory.getDefaultExtractor(aFile, stringsExtractionContext).getReader();
|
||||||
|
if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
|
||||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
|
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
@ -477,7 +514,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
|
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} catch (IngesterException ex) {
|
} catch (IngesterException | ExtractionException ex) {
|
||||||
logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
|
logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
|
||||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
|
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
|
||||||
return false;
|
return false;
|
||||||
@ -529,7 +566,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
|
|
||||||
// we skip archive formats that are opened by the archive module.
|
// we skip archive formats that are opened by the archive module.
|
||||||
// @@@ We could have a check here to see if the archive module was enabled though...
|
// @@@ We could have a check here to see if the archive module was enabled though...
|
||||||
if (ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
|
if (ARCHIVE_MIME_TYPES.contains(fileType)) {
|
||||||
try {
|
try {
|
||||||
if (context.fileIngestIsCancelled()) {
|
if (context.fileIngestIsCancelled()) {
|
||||||
return;
|
return;
|
||||||
@ -577,11 +614,13 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
//Carved Files should be the only type of unallocated files capable of a txt extension and
|
//Carved Files should be the only type of unallocated files capable of a txt extension and
|
||||||
//should be ignored by the TextFileExtractor because they may contain more than one text encoding
|
//should be ignored by the TextFileExtractor because they may contain more than one text encoding
|
||||||
try {
|
try {
|
||||||
if (Ingester.getDefault().indexText(txtFileExtractor, aFile, context)) {
|
TextFileExtractor textFileExtractor = new TextFileExtractor();
|
||||||
|
Reader textReader = textFileExtractor.getReader(aFile);
|
||||||
|
if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
|
||||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
|
putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
|
||||||
wasTextAdded = true;
|
wasTextAdded = true;
|
||||||
}
|
}
|
||||||
} catch (IngesterException ex) {
|
} catch (IngesterException | TextFileExtractorException ex) {
|
||||||
logger.log(Level.WARNING, "Unable to index as unicode", ex);
|
logger.log(Level.WARNING, "Unable to index as unicode", ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -31,6 +31,7 @@ import javax.swing.table.TableColumn;
|
|||||||
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
||||||
import org.sleuthkit.autopsy.ingest.IngestModuleIngestJobSettings;
|
import org.sleuthkit.autopsy.ingest.IngestModuleIngestJobSettings;
|
||||||
import org.sleuthkit.autopsy.ingest.IngestModuleIngestJobSettingsPanel;
|
import org.sleuthkit.autopsy.ingest.IngestModuleIngestJobSettingsPanel;
|
||||||
|
import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.StringsExtractOptions;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Ingest job settings panel for keyword search file ingest modules.
|
* Ingest job settings panel for keyword search file ingest modules.
|
||||||
@ -102,8 +103,8 @@ public final class KeywordSearchJobSettingsPanel extends IngestModuleIngestJobSe
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void displayEncodings() {
|
private void displayEncodings() {
|
||||||
String utf8 = KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
|
String utf8 = KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString());
|
||||||
String utf16 = KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
|
String utf16 = KeywordSearchSettings.getStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString());
|
||||||
ArrayList<String> encodingsList = new ArrayList<>();
|
ArrayList<String> encodingsList = new ArrayList<>();
|
||||||
if (utf8 == null || Boolean.parseBoolean(utf8)) {
|
if (utf8 == null || Boolean.parseBoolean(utf8)) {
|
||||||
encodingsList.add("UTF8");
|
encodingsList.add("UTF8");
|
||||||
|
@ -28,6 +28,7 @@ import org.sleuthkit.autopsy.coreutils.Logger;
|
|||||||
import org.sleuthkit.autopsy.coreutils.ModuleSettings;
|
import org.sleuthkit.autopsy.coreutils.ModuleSettings;
|
||||||
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
||||||
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
||||||
|
import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.StringsExtractOptions;
|
||||||
import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.UpdateFrequency;
|
import org.sleuthkit.autopsy.keywordsearch.KeywordSearchIngestModule.UpdateFrequency;
|
||||||
|
|
||||||
//This file contains constants and settings for KeywordSearch
|
//This file contains constants and settings for KeywordSearch
|
||||||
@ -234,14 +235,14 @@ class KeywordSearchSettings {
|
|||||||
KeywordSearchSettings.setUpdateFrequency(UpdateFrequency.DEFAULT);
|
KeywordSearchSettings.setUpdateFrequency(UpdateFrequency.DEFAULT);
|
||||||
}
|
}
|
||||||
//setting default Extract UTF8
|
//setting default Extract UTF8
|
||||||
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
|
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsExtractOptions.EXTRACT_UTF8.toString())) {
|
||||||
logger.log(Level.INFO, "No configuration for UTF8 found, generating default..."); //NON-NLS
|
logger.log(Level.INFO, "No configuration for UTF8 found, generating default..."); //NON-NLS
|
||||||
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
|
KeywordSearchSettings.setStringExtractOption(StringsExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
|
||||||
}
|
}
|
||||||
//setting default Extract UTF16
|
//setting default Extract UTF16
|
||||||
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
|
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsExtractOptions.EXTRACT_UTF16.toString())) {
|
||||||
logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS
|
logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS
|
||||||
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
|
KeywordSearchSettings.setStringExtractOption(StringsExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
|
||||||
}
|
}
|
||||||
//setting OCR default (disabled by default)
|
//setting OCR default (disabled by default)
|
||||||
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, OCR_ENABLED)) {
|
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, OCR_ENABLED)) {
|
||||||
|
@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.keywordsearch;
|
|||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
import java.lang.reflect.InvocationTargetException;
|
import java.lang.reflect.InvocationTargetException;
|
||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -45,6 +46,9 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
|
|||||||
import org.sleuthkit.autopsy.progress.ProgressIndicator;
|
import org.sleuthkit.autopsy.progress.ProgressIndicator;
|
||||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
||||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.TextExtractor;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
|
||||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||||
import org.sleuthkit.datamodel.Content;
|
import org.sleuthkit.datamodel.Content;
|
||||||
import org.sleuthkit.datamodel.TskCoreException;
|
import org.sleuthkit.datamodel.TskCoreException;
|
||||||
@ -112,19 +116,24 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
ingester.indexMetaDataOnly(artifact);
|
Reader blackboardReader = TextExtractorFactory
|
||||||
ingester.indexText(new ArtifactTextExtractor(), artifact, null);
|
.getExtractor(content, null).getReader();
|
||||||
} catch (Ingester.IngesterException ex) {
|
String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
|
||||||
|
ingester.indexMetaDataOnly(artifact, sourceName);
|
||||||
|
ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
|
||||||
|
} catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | ExtractionException ex) {
|
||||||
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
try {
|
try {
|
||||||
ingester.indexText(new TikaTextExtractor(), content, null);
|
Reader contentReader = TextExtractorFactory
|
||||||
} catch (Ingester.IngesterException ex) {
|
.getExtractor(content, null).getReader();
|
||||||
|
ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
|
||||||
|
} catch (TextExtractorFactory.NoTextExtractorFound | ExtractionException | Ingester.IngesterException ex) {
|
||||||
try {
|
try {
|
||||||
// Try the StringsTextExtractor if Tika extraction fails.
|
// Try the StringsTextExtractor if Tika extraction fails.
|
||||||
ingester.indexText(new StringsTextExtractor(), content, null);
|
ingester.indexText(TextExtractorFactory.getDefaultExtractor(content, null).getReader(),content.getId(),content.getName(), content, null);
|
||||||
} catch (Ingester.IngesterException ex1) {
|
} catch (Ingester.IngesterException | ExtractionException ex1) {
|
||||||
throw new TskCoreException(ex.getCause().getMessage(), ex1);
|
throw new TskCoreException(ex.getCause().getMessage(), ex1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -437,9 +446,12 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
|
|||||||
final Ingester ingester = Ingester.getDefault();
|
final Ingester ingester = Ingester.getDefault();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
ingester.indexMetaDataOnly(artifact);
|
String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
|
||||||
ingester.indexText(new ArtifactTextExtractor(), artifact, null);
|
Reader contentSpecificReader =
|
||||||
} catch (Ingester.IngesterException ex) {
|
TextExtractorFactory.getExtractor((Content) artifact, null).getReader();
|
||||||
|
ingester.indexMetaDataOnly(artifact, sourceName);
|
||||||
|
ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
|
||||||
|
} catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | ExtractionException ex) {
|
||||||
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,88 +0,0 @@
|
|||||||
/*
|
|
||||||
* Autopsy Forensic Browser
|
|
||||||
*
|
|
||||||
* Copyright 2011-16 Basis Technology Corp.
|
|
||||||
* Contact: carrier <at> sleuthkit <dot> org
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.sleuthkit.autopsy.keywordsearch;
|
|
||||||
|
|
||||||
import java.io.Reader;
|
|
||||||
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Extracts text out of a SleuthkitVisitableItem, and exposes it as a Reader.
|
|
||||||
* This Reader is given to the Ingester to chunk and index in Solr.
|
|
||||||
*
|
|
||||||
* @param <TextSource> The subtype of SleuthkitVisitableItem an implementation
|
|
||||||
* is able to process.
|
|
||||||
*/
|
|
||||||
interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Is this extractor configured such that no extraction will/should be done?
|
|
||||||
*
|
|
||||||
* @return True if this extractor will/should not perform any extraction.
|
|
||||||
*/
|
|
||||||
abstract boolean isDisabled();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Log the given message and exception as a warning.
|
|
||||||
*
|
|
||||||
* @param msg
|
|
||||||
* @param ex
|
|
||||||
*/
|
|
||||||
abstract void logWarning(String msg, Exception ex);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get a reader over the text extracted from the given source.
|
|
||||||
*
|
|
||||||
* @param stream
|
|
||||||
* @param source
|
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*
|
|
||||||
* @throws TextExtractorException
|
|
||||||
*/
|
|
||||||
abstract Reader getReader(TextSource source) throws TextExtractorException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the 'object' id of the given source.
|
|
||||||
*
|
|
||||||
* @param source
|
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
abstract long getID(TextSource source);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get a human readable name for the given source.
|
|
||||||
*
|
|
||||||
* @param source
|
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
abstract String getName(TextSource source);
|
|
||||||
|
|
||||||
class TextExtractorException extends Exception {
|
|
||||||
|
|
||||||
public TextExtractorException(String message) {
|
|
||||||
super(message);
|
|
||||||
}
|
|
||||||
|
|
||||||
public TextExtractorException(String message, Throwable cause) {
|
|
||||||
super(message, cause);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -21,17 +21,15 @@ import java.io.IOException;
|
|||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.logging.Level;
|
|
||||||
import org.apache.tika.parser.txt.CharsetDetector;
|
import org.apache.tika.parser.txt.CharsetDetector;
|
||||||
import org.apache.tika.parser.txt.CharsetMatch;
|
import org.apache.tika.parser.txt.CharsetMatch;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
import org.sleuthkit.datamodel.Content;
|
|
||||||
import org.sleuthkit.datamodel.ReadContentInputStream;
|
import org.sleuthkit.datamodel.ReadContentInputStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract text from .txt files
|
* Extract text from .txt files
|
||||||
*/
|
*/
|
||||||
final class TextFileExtractor extends ContentTextExtractor {
|
final class TextFileExtractor {
|
||||||
|
|
||||||
//Set a Minimum confidence value to reject matches that may not have a valid text encoding
|
//Set a Minimum confidence value to reject matches that may not have a valid text encoding
|
||||||
//Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
|
//Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
|
||||||
@ -39,44 +37,30 @@ final class TextFileExtractor extends ContentTextExtractor {
|
|||||||
//This limited information was used to select the current value as one that would filter out clearly non-text
|
//This limited information was used to select the current value as one that would filter out clearly non-text
|
||||||
//files while hopefully working on all files with a valid text encoding
|
//files while hopefully working on all files with a valid text encoding
|
||||||
static final private int MIN_MATCH_CONFIDENCE = 20;
|
static final private int MIN_MATCH_CONFIDENCE = 20;
|
||||||
static final private Logger logger = Logger.getLogger(TextFileExtractor.class.getName());
|
|
||||||
|
|
||||||
@Override
|
public Reader getReader(AbstractFile source) throws TextFileExtractorException {
|
||||||
boolean isContentTypeSpecific() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
boolean isSupported(Content file, String detectedFormat) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Reader getReader(Content source) throws TextExtractorException {
|
|
||||||
CharsetDetector detector = new CharsetDetector();
|
CharsetDetector detector = new CharsetDetector();
|
||||||
//wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
|
//wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
|
||||||
InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
|
InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
|
||||||
try {
|
try {
|
||||||
detector.setText(stream);
|
detector.setText(stream);
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
throw new TextExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
|
throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
|
||||||
}
|
}
|
||||||
CharsetMatch match = detector.detect();
|
CharsetMatch match = detector.detect();
|
||||||
if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
|
if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
|
||||||
throw new TextExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
|
throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
|
||||||
}
|
}
|
||||||
|
|
||||||
return match.getReader();
|
return match.getReader();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
public class TextFileExtractorException extends Exception {
|
||||||
public boolean isDisabled() {
|
public TextFileExtractorException(String msg, Throwable ex) {
|
||||||
return false;
|
super(msg, ex);
|
||||||
|
}
|
||||||
|
public TextFileExtractorException(String msg) {
|
||||||
|
super(msg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void logWarning(String msg, Exception ex) {
|
|
||||||
logger.log(Level.WARNING, msg, ex);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user