From 8b8196c21899a1c90730bdf585fd92b43fc6dc6c Mon Sep 17 00:00:00 2001 From: "U-BASIS\\dsmyda" Date: Thu, 15 Nov 2018 14:40:49 -0500 Subject: [PATCH 01/18] Did the refactoring and fleshing out, seems to be working, but will test further --- Core/ivy.xml | 1 + Core/nbproject/project.properties | 1 + Core/nbproject/project.xml | 31 ++--- .../textextractors}/ContentTextExtractor.java | 13 ++- .../textextractors/ExtractionContext.java | 28 +++++ .../textextractors}/HtmlTextExtractor.java | 27 +++-- .../textextractors}/SqliteTextExtractor.java | 13 ++- .../autopsy/textextractors}/SqliteUtil.java | 2 +- .../textextractors}/StringsTextExtractor.java | 85 ++++++-------- .../textextractors}/TextExtractor.java | 8 +- .../textextractors/TextExtractorFactory.java | 48 ++++++++ .../textextractors}/TikaTextExtractor.java | 25 ++-- .../HTMLExtractionConfig.java | 22 ++++ .../ImageFileExtractionConfig.java | 22 ++++ .../StringsExtractionConfig.java | 43 +++++++ .../TextFileExtractionConfig.java | 22 ++++ .../keywordsearch/ArtifactTextExtractor.java | 1 + .../autopsy/keywordsearch/Ingester.java | 2 +- ...wordSearchGlobalLanguageSettingsPanel.java | 13 ++- .../KeywordSearchIngestModule.java | 108 ++++++++++++------ .../KeywordSearchJobSettingsPanel.java | 5 +- .../keywordsearch/KeywordSearchSettings.java | 8 +- .../keywordsearch/SolrSearchService.java | 2 + .../keywordsearch/TextFileExtractor.java | 10 +- 24 files changed, 398 insertions(+), 142 deletions(-) rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/ContentTextExtractor.java (92%) create mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/HtmlTextExtractor.java (88%) rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/SqliteTextExtractor.java (97%) rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/SqliteUtil.java (99%) rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/StringsTextExtractor.java (92%) rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/TextExtractor.java (91%) create mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => Core/src/org/sleuthkit/autopsy/textextractors}/TikaTextExtractor.java (91%) create mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/HTMLExtractionConfig.java create mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/ImageFileExtractionConfig.java create mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/StringsExtractionConfig.java create mode 100755 Core/src/org/sleuthkit/autopsy/textextractors/extractionconfigs/TextFileExtractionConfig.java diff --git a/Core/ivy.xml b/Core/ivy.xml index cbe45d8c33..dc1896882f 100644 --- a/Core/ivy.xml +++ b/Core/ivy.xml @@ -33,6 +33,7 @@ + diff --git a/Core/nbproject/project.properties b/Core/nbproject/project.properties index 89bf4a93d7..8dcea69608 100644 --- a/Core/nbproject/project.properties +++ b/Core/nbproject/project.properties @@ -6,6 +6,7 @@ file.reference.commons-pool2-2.4.2.jar=release\\modules\\ext\\commons-pool2-2.4. file.reference.dd-plist-1.20.jar=release/modules/ext/dd-plist-1.20.jar file.reference.jdom-2.0.5-contrib.jar=release/modules/ext/jdom-2.0.5-contrib.jar file.reference.jdom-2.0.5.jar=release/modules/ext/jdom-2.0.5.jar +file.reference.jericho-html-3.3.jar=C:\\cygwin64\\home\\dsmyda\\autopsy\\Core\\release\\modules\\ext\\jericho-html-3.3.jar file.reference.jgraphx-v3.8.0.jar=release/modules/ext/jgraphx-v3.8.0.jar file.reference.jsoup-1.10.3.jar=release/modules/ext/jsoup-1.10.3.jar file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar diff --git a/Core/nbproject/project.xml b/Core/nbproject/project.xml index 7a89519e34..2158785ecc 100644 --- a/Core/nbproject/project.xml +++ b/Core/nbproject/project.xml @@ -338,13 +338,10 @@ org.sleuthkit.autopsy.modules.vmextractor org.sleuthkit.autopsy.progress org.sleuthkit.autopsy.report - org.sleuthkit.autopsy.tabulardatareader + org.sleuthkit.autopsy.textextractors + org.sleuthkit.autopsy.textextractors.extractionconfigs org.sleuthkit.datamodel - - ext/jackcess-2.2.0.jar - release/modules/ext/jackcess-2.2.0.jar - ext/zookeeper-3.4.6.jar release/modules/ext/zookeeper-3.4.6.jar @@ -357,6 +354,18 @@ ext/cxf-rt-transports-http-3.0.16.jar release/modules/ext/cxf-rt-transports-http-3.0.16.jar + + ext/jackcess-2.2.0.jar + release/modules/ext/jackcess-2.2.0.jar + + + ext/sleuthkit-postgresql-4.6.4.jar + release/modules/ext/sleuthkit-postgresql-4.6.4.jar + + + ext/jericho-html-3.3.jar + release\modules\ext\jericho-html-3.3.jar + ext/commons-validator-1.6.jar release/modules/ext/commons-validator-1.6.jar @@ -393,14 +402,14 @@ ext/sevenzipjbinding.jar release/modules/ext/sevenzipjbinding.jar - - ext/sleuthkit-postgresql-4.6.4.jar - release/modules/ext/sleuthkit-postgresql-4.6.4.jar - ext/mchange-commons-java-0.2.9.jar release/modules/ext/mchange-commons-java-0.2.9.jar + + ext/jackcess-encrypt-2.1.4.jar + release/modules/ext/jackcess-encrypt-2.1.4.jar + ext/cxf-core-3.0.16.jar release/modules/ext/cxf-core-3.0.16.jar @@ -481,10 +490,6 @@ ext/commons-pool2-2.4.2.jar release\modules\ext\commons-pool2-2.4.2.jar - - ext/jackcess-encrypt-2.1.4.jar - release/modules/ext/jackcess-encrypt-2.1.4.jar - ext/jsoup-1.10.3.jar release/modules/ext/jsoup-1.10.3.jar diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ContentTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java similarity index 92% rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ContentTextExtractor.java rename to Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java index bba2df2ced..b97c1e3ab3 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ContentTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.sleuthkit.autopsy.keywordsearch; +package org.sleuthkit.autopsy.textextractors; import java.io.Reader; import java.util.Arrays; @@ -27,9 +27,8 @@ import org.sleuthkit.datamodel.Content; * Common methods for utilities that extract text and content and divide into * chunks */ -abstract class ContentTextExtractor implements TextExtractor { - - +public abstract class ContentTextExtractor implements TextExtractor { + static final List BINARY_MIME_TYPES = Arrays.asList( //ignore binary blob data, for which string extraction will be used @@ -80,7 +79,7 @@ abstract class ContentTextExtractor implements TextExtractor { * * @return */ - abstract boolean isContentTypeSpecific(); + public abstract boolean isContentTypeSpecific(); /** * Determines if the file content is supported by the extractor if @@ -92,7 +91,9 @@ abstract class ContentTextExtractor implements TextExtractor { * * @return true if the file content is supported, false otherwise */ - abstract boolean isSupported(Content file, String detectedFormat); + public abstract boolean isSupported(Content file, String detectedFormat); + + public abstract void parseContext(ExtractionContext context); @Override public abstract Reader getReader(Content source) throws TextExtractorException; diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java b/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java new file mode 100755 index 0000000000..d291164a2e --- /dev/null +++ b/Core/src/org/sleuthkit/autopsy/textextractors/ExtractionContext.java @@ -0,0 +1,28 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package org.sleuthkit.autopsy.textextractors; + +import com.google.common.collect.ClassToInstanceMap; +import com.google.common.collect.MutableClassToInstanceMap; + +/** + * + * @author dsmyda + */ +public class ExtractionContext { + ClassToInstanceMap extractionConfigs; + + public ExtractionContext() { + extractionConfigs = MutableClassToInstanceMap.create(); + } + + public void set(Class configClass, T configInstance) { + extractionConfigs.put(configClass, configInstance); + } + public T get(Class configClass) { + return (T) extractionConfigs.get(configClass); + } +} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java similarity index 88% rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java rename to Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java index 32842dbc03..f834359514 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java @@ -16,13 +16,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.sleuthkit.autopsy.keywordsearch; +package org.sleuthkit.autopsy.textextractors; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.Arrays; import java.util.List; +import java.util.Objects; import java.util.logging.Level; import net.htmlparser.jericho.Attributes; import net.htmlparser.jericho.Config; @@ -32,16 +33,17 @@ import net.htmlparser.jericho.Source; import net.htmlparser.jericho.StartTag; import net.htmlparser.jericho.StartTagType; import org.sleuthkit.autopsy.coreutils.Logger; +import org.sleuthkit.autopsy.textextractors.extractionconfigs.HTMLExtractionConfig; import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.ReadContentInputStream; /** * Extracts text from HTML content. */ -class HtmlTextExtractor extends ContentTextExtractor { +public class HtmlTextExtractor extends ContentTextExtractor { static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName()); - private static final int MAX_SIZE = 50_000_000; //50MB + private int maxSize = 50_000_000; static final List WEB_MIME_TYPES = Arrays.asList( "application/javascript", //NON-NLS @@ -58,15 +60,16 @@ class HtmlTextExtractor extends ContentTextExtractor { } @Override - boolean isContentTypeSpecific() { + public boolean isContentTypeSpecific() { return true; } @Override - boolean isSupported(Content content, String detectedFormat) { - return detectedFormat != null - && WEB_MIME_TYPES.contains(detectedFormat) - && content.getSize() <= MAX_SIZE; + public boolean isSupported(Content content, String detectedFormat) { + boolean notNull = detectedFormat != null; + boolean supported = WEB_MIME_TYPES.contains(detectedFormat); + boolean size = content.getSize() <= maxSize;; + return notNull && supported && size; } @Override @@ -177,4 +180,12 @@ class HtmlTextExtractor extends ContentTextExtractor { public void logWarning(final String msg, Exception ex) { logger.log(Level.WARNING, msg, ex); //NON-NLS } } + + @Override + public void parseContext(ExtractionContext context) { + HTMLExtractionConfig configInstance = context.get(HTMLExtractionConfig.class); + if(Objects.nonNull(configInstance)) { + this.maxSize = configInstance.getContentSizeLimit(); + } + } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java similarity index 97% rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteTextExtractor.java rename to Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java index c8bbe289e4..37cd2cca87 100755 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.sleuthkit.autopsy.keywordsearch; +package org.sleuthkit.autopsy.textextractors; import com.google.common.io.CharSource; import java.io.IOException; @@ -45,14 +45,14 @@ import org.sleuthkit.datamodel.TskCoreException; * tables with spaces in table name 3) Tika fails to include the table names in * output (except for the first table it parses) */ -class SqliteTextExtractor extends ContentTextExtractor { +public class SqliteTextExtractor extends ContentTextExtractor { private static final String SQLITE_MIMETYPE = "application/x-sqlite3"; private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName()); private static final CharSequence EMPTY_CHARACTER_SEQUENCE = ""; @Override - boolean isContentTypeSpecific() { + public boolean isContentTypeSpecific() { return true; } @@ -75,7 +75,7 @@ class SqliteTextExtractor extends ContentTextExtractor { * @return true if x-sqlite3 */ @Override - boolean isSupported(Content file, String detectedFormat) { + public boolean isSupported(Content file, String detectedFormat) { return SQLITE_MIMETYPE.equals(detectedFormat); } @@ -106,6 +106,11 @@ class SqliteTextExtractor extends ContentTextExtractor { } } + @Override + public void parseContext(ExtractionContext context) { + //No settings. + } + /** * Lazily loads tables from the database during reading to conserve memory. */ diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteUtil.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteUtil.java similarity index 99% rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteUtil.java rename to Core/src/org/sleuthkit/autopsy/textextractors/SqliteUtil.java index 08eefe7232..ae09972acf 100755 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SqliteUtil.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteUtil.java @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.sleuthkit.autopsy.keywordsearch; +package org.sleuthkit.autopsy.textextractors; import java.io.File; import java.io.IOException; diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java similarity index 92% rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java rename to Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java index 391c7d5a7c..869ab8a992 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java @@ -16,19 +16,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.sleuthkit.autopsy.keywordsearch; +package org.sleuthkit.autopsy.textextractors; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.nio.charset.Charset; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; import java.util.logging.Level; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; +import org.sleuthkit.autopsy.textextractors.extractionconfigs.StringsExtractionConfig; import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.TskCoreException; import org.sleuthkit.datamodel.TskException; @@ -36,21 +36,42 @@ import org.sleuthkit.datamodel.TskException; /** * Extracts raw strings from content. */ -class StringsTextExtractor extends ContentTextExtractor { - - static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName()); - +public class StringsTextExtractor extends ContentTextExtractor { + + private boolean extractUTF8; + private boolean extractUTF16; + /** * Options for this extractor */ - enum ExtractOptions { + public enum ExtractOptions { EXTRACT_UTF16, ///< extract UTF16 text, true/false EXTRACT_UTF8, ///< extract UTF8 text, true/false }; - private final List