diff --git a/Core/nbproject/project.xml b/Core/nbproject/project.xml index 3559866d75..89e71fbc51 100644 --- a/Core/nbproject/project.xml +++ b/Core/nbproject/project.xml @@ -338,8 +338,8 @@ org.sleuthkit.autopsy.modules.vmextractor org.sleuthkit.autopsy.progress org.sleuthkit.autopsy.report - org.sleuthkit.autopsy.textreaders - org.sleuthkit.autopsy.textreaders.textreaderconfigs + org.sleuthkit.autopsy.textextractors + org.sleuthkit.autopsy.textextractors.textextractorconfigs org.sleuthkit.autopsy.texttranslation org.sleuthkit.datamodel diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/ArtifactTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java similarity index 96% rename from Core/src/org/sleuthkit/autopsy/textreaders/ArtifactTextExtractor.java rename to Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java index be2fcc98f6..244cf17ab0 100644 --- a/Core/src/org/sleuthkit/autopsy/textreaders/ArtifactTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.sleuthkit.autopsy.textreaders; +package org.sleuthkit.autopsy.textextractors; import java.io.InputStreamReader; import java.io.Reader; @@ -83,7 +83,7 @@ class ArtifactTextExtractor extends TextExtractor { } @Override - public boolean isSupported(Content file, String detectedFormat) { + public boolean isSupported() { return true; } } diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/Bundle.properties b/Core/src/org/sleuthkit/autopsy/textextractors/Bundle.properties similarity index 100% rename from Core/src/org/sleuthkit/autopsy/textreaders/Bundle.properties rename to Core/src/org/sleuthkit/autopsy/textextractors/Bundle.properties diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/Bundle_ja.properties b/Core/src/org/sleuthkit/autopsy/textextractors/Bundle_ja.properties similarity index 100% rename from Core/src/org/sleuthkit/autopsy/textreaders/Bundle_ja.properties rename to Core/src/org/sleuthkit/autopsy/textextractors/Bundle_ja.properties diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java similarity index 95% rename from Core/src/org/sleuthkit/autopsy/textreaders/HtmlTextExtractor.java rename to Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java index dcf24b038c..d3c34211d8 100644 --- a/Core/src/org/sleuthkit/autopsy/textreaders/HtmlTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.sleuthkit.autopsy.textreaders; +package org.sleuthkit.autopsy.textextractors; import java.io.IOException; import java.io.Reader; @@ -32,6 +32,7 @@ import net.htmlparser.jericho.Source; import net.htmlparser.jericho.StartTag; import net.htmlparser.jericho.StartTagType; import org.sleuthkit.autopsy.coreutils.Logger; +import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.ReadContentInputStream; @@ -42,7 +43,7 @@ final class HtmlTextExtractor extends TextExtractor { static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName()); private final int MAX_SIZE; - private final Content file; + private final AbstractFile file; static final List WEB_MIME_TYPES = Arrays.asList( "application/javascript", //NON-NLS @@ -62,7 +63,7 @@ final class HtmlTextExtractor extends TextExtractor { * Creates a default instance of the HtmlTextExtractor. Supported file size * is 50MB. */ - public HtmlTextExtractor(Content file) { + public HtmlTextExtractor(AbstractFile file) { //Set default to be 50 MB. 
MAX_SIZE = 50_000_000; this.file = file; @@ -77,10 +78,10 @@ final class HtmlTextExtractor extends TextExtractor { * @return flag indicating support */ @Override - public boolean isSupported(Content content, String detectedFormat) { - return detectedFormat != null - && WEB_MIME_TYPES.contains(detectedFormat) - && content.getSize() <= MAX_SIZE; + public boolean isSupported() { + return file.getMIMEType() != null + && WEB_MIME_TYPES.contains(file.getMIMEType()) + && file.getSize() <= MAX_SIZE; } /** diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java similarity index 97% rename from Core/src/org/sleuthkit/autopsy/textreaders/SqliteTextExtractor.java rename to Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java index 3b56a7c128..0c2cd28e76 100755 --- a/Core/src/org/sleuthkit/autopsy/textreaders/SqliteTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.sleuthkit.autopsy.textreaders; +package org.sleuthkit.autopsy.textextractors; import java.io.IOException; import java.io.Reader; @@ -28,7 +28,6 @@ import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.SQLiteTableReader; import org.sleuthkit.datamodel.AbstractFile; -import org.sleuthkit.datamodel.Content; /** * Extracts text from SQLite database files. 
@@ -45,8 +44,8 @@ final class SqliteTextExtractor extends TextExtractor { private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName()); private final AbstractFile file; - public SqliteTextExtractor(Content file) { - this.file = (AbstractFile) file; + public SqliteTextExtractor(AbstractFile file) { + this.file = file; } /** * Supports only the sqlite mimetypes @@ -57,8 +56,8 @@ final class SqliteTextExtractor extends TextExtractor { * @return true if x-sqlite3 */ @Override - public boolean isSupported(Content file, String detectedFormat) { - return SQLITE_MIMETYPE.equals(detectedFormat); + public boolean isSupported() { + return SQLITE_MIMETYPE.equals(file.getMIMEType()); } /** diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java similarity index 98% rename from Core/src/org/sleuthkit/autopsy/textreaders/StringsTextExtractor.java rename to Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java index 60ed556a0f..d271a83b9f 100644 --- a/Core/src/org/sleuthkit/autopsy/textreaders/StringsTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.sleuthkit.autopsy.textreaders; +package org.sleuthkit.autopsy.textextractors; import java.io.IOException; import java.io.InputStream; @@ -28,7 +28,7 @@ import java.util.Objects; import org.openide.util.Lookup; import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; -import org.sleuthkit.autopsy.textreaders.textreaderconfigs.StringsConfig; +import org.sleuthkit.autopsy.textextractors.textextractorconfigs.StringsConfig; import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.TskCoreException; import org.sleuthkit.datamodel.TskException; @@ -36,7 +36,7 @@ import org.sleuthkit.datamodel.TskException; /** * Extracts raw strings from content. */ -final class StringsTextExtractor { +final class StringsTextExtractor extends TextExtractor { private boolean extractUTF8; private boolean extractUTF16; @@ -81,6 +81,7 @@ final class StringsTextExtractor { * @throws * org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException */ + @Override public InputStreamReader getReader() { InputStream stringStream = getInputStream(content); return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET)); @@ -104,6 +105,7 @@ final class StringsTextExtractor { * * @param context Lookup instance containing config classes */ + @Override public void setExtractionSettings(Lookup context) { if (context != null) { StringsConfig configInstance = context.lookup(StringsConfig.class); @@ -126,14 +128,11 @@ final class StringsTextExtractor { * * @return */ - public boolean isEnabled() { + @Override + public boolean isSupported() { return extractUTF8 || extractUTF16; } - - boolean isSupported(Content file, String detectedFormat) { - throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. 
- } - + /** * Content input string stream reader/converter - given Content, extract * strings from it and return encoded bytes via read() diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/TextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java similarity index 75% rename from Core/src/org/sleuthkit/autopsy/textreaders/TextExtractor.java rename to Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java index ac4c740a34..e081926afd 100644 --- a/Core/src/org/sleuthkit/autopsy/textreaders/TextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java @@ -16,38 +16,26 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.sleuthkit.autopsy.textreaders; +package org.sleuthkit.autopsy.textextractors; import java.io.Reader; import org.openide.util.Lookup; -import org.sleuthkit.datamodel.Content; /** * Extracts the text out of Content instances and exposes them as a Reader. * Concrete implementations can be obtained from * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory} */ -abstract class TextExtractor { +public abstract class TextExtractor { /** - * Determines if the file content is supported by the extractor. + * Determines if this extractor supports the given Content and + * configurations passed into it in + * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory}. * - * @param file to test if its content should be supported - * @param detectedFormat mime-type with detected format (such as text/plain) - * or null if not detected - * - * @return true if the file content is supported, false otherwise + * @return true if content is supported, false otherwise */ - abstract boolean isSupported(Content file, String detectedFormat); - - /** - * Determines if the TextExtractor instance is enabled to read content. 
- * - * @return - */ - boolean isEnabled() { - return true; - } + abstract boolean isSupported(); /** * Get a Reader that will iterate over the text extracted from the Content @@ -75,8 +63,7 @@ abstract class TextExtractor { } /** - * Exception encountered during - * {@link org.sleuthkit.autopsy.textextractors.TextExtractor#getReader()}. + * Exception encountered during TextExtractor.getReader(). * This indicates that there was an internal parsing error that occurred * during the reading of Content text. */ diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java new file mode 100755 index 0000000000..22d4aa5040 --- /dev/null +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java @@ -0,0 +1,160 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2018-2018 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.sleuthkit.autopsy.textextractors; + +import java.util.Arrays; +import java.util.List; +import org.openide.util.Lookup; +import org.sleuthkit.datamodel.AbstractFile; +import org.sleuthkit.datamodel.BlackboardArtifact; +import org.sleuthkit.datamodel.Content; +import org.sleuthkit.datamodel.Report; + +/** + * Factory for creating TextExtractors given a Content instance + * + * See {@link org.sleuthkit.autopsy.textextractors.textextractorconfigs} for + * available extractor configuration options. + * + * @see org.openide.util.Lookup + */ +public class TextExtractorFactory { + + /** + * Returns a TextExtractor containing the Content text. Configuration files + * can be added to the Lookup. + * + * See {@link org.sleuthkit.autopsy.textextractors.textextractorconfigs} for + * available extractor configuration options. + * + * @param content Content source that will be read from + * @param context Contains extraction configurations for certain file types + * + * @return TextExtractor containing file text + * + * @throws NoTextExtractorFound Encountered when there is no Reader found + * for the given content type or there was an + * error while creating the reader. 
+ * + * @see org.openide.util.Lookup + */ + public static TextExtractor getExtractor(Content content, Lookup context) throws NoTextExtractorFound { + if (content instanceof AbstractFile) { + for (TextExtractor extractor : getFileExtractors((AbstractFile) content, context)) { + if (extractor.isSupported()) { + return extractor; + } + } + } else if (content instanceof BlackboardArtifact) { + TextExtractor artifactExtractor = new ArtifactTextExtractor((BlackboardArtifact) content); + artifactExtractor.setExtractionSettings(context); + return artifactExtractor; + } else if (content instanceof Report) { + TextExtractor reportExtractor = new TikaTextExtractor(content); + reportExtractor.setExtractionSettings(context); + return reportExtractor; + } + + throw new NoTextExtractorFound( + String.format("Could not find a suitable reader for " + + "content with name [%s] and id=[%d]. Try using " + + "the strings extractor instead.", + content.getName(), content.getId()) + ); + } + + /** + * Initializes, orders, and returns all file extractors that can read + * AbstractFile instances. + * + * @param content AbstractFile content + * @param context Lookup containing extractor configurations + * + * @return File extractors with settings applied, in the order they are tried + */ + private static List getFileExtractors(AbstractFile content, Lookup context) { + List fileExtractors = Arrays.asList( + new HtmlTextExtractor(content), + new SqliteTextExtractor(content), + new TikaTextExtractor(content)); + + fileExtractors.forEach((fileExtractor) -> { + fileExtractor.setExtractionSettings(context); + }); + + return fileExtractors; + } + + /** + * Returns a TextExtractor containing the Content text. + * + * @param content Content instance that will be read from + * + * @return TextExtractor containing file text + * + * @throws NoTextExtractorFound Encountered when no extractor was + * found for the given content type. Use + * getStringsExtractor(Content,Lookup) method + * instead. 
+ */ + public static TextExtractor getExtractor(Content content) throws NoTextExtractorFound { + return TextExtractorFactory.getExtractor(content, null); + } + + /** + * Returns a TextExtractor containing the Content strings. This method + * supports all content types. This method should be used as a backup in the + * event that no reader was found using getExtractor(Content) or + * getExtractor(Content, Lookup). + * + * Configure this extractor with the StringsConfig in + * {@link org.sleuthkit.autopsy.textextractors.textextractorconfigs} + * + * @param content Content source to read from + * @param context Contains extraction configurations for certain file types + * + * @return TextExtractor containing file text + * + * @see org.openide.util.Lookup + */ + public static TextExtractor getStringsExtractor(Content content, Lookup context) { + StringsTextExtractor stringsInstance = new StringsTextExtractor(content); + stringsInstance.setExtractionSettings(context); + return stringsInstance; + } + + /** + * System level exception for handling content types that have no specific + * strategy defined for extracting their text. 
+ */ + public static class NoTextExtractorFound extends Exception { + + public NoTextExtractorFound(String msg) { + super(msg); + } + + public NoTextExtractorFound(Throwable ex) { + super(ex); + } + + private NoTextExtractorFound(String msg, Throwable ex) { + super(msg, ex); + } + } +} diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java similarity index 95% rename from Core/src/org/sleuthkit/autopsy/textreaders/TikaTextExtractor.java rename to Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java index 985f897787..6582e05d3e 100644 --- a/Core/src/org/sleuthkit/autopsy/textreaders/TikaTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.sleuthkit.autopsy.textreaders; +package org.sleuthkit.autopsy.textextractors; import com.google.common.collect.ImmutableList; import com.google.common.io.CharSource; @@ -61,7 +61,7 @@ import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException; import org.sleuthkit.autopsy.coreutils.ExecUtil; import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator; import org.sleuthkit.autopsy.coreutils.PlatformUtil; -import org.sleuthkit.autopsy.textreaders.textreaderconfigs.ImageConfig; +import org.sleuthkit.autopsy.textextractors.textextractorconfigs.ImageConfig; import org.sleuthkit.autopsy.datamodel.ContentUtils; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.Content; @@ -318,7 +318,7 @@ final class TikaTextExtractor extends TextExtractor { } } } - + /** * Wraps the creation of a TikaReader into a Future so that it can be * cancelled. @@ -422,24 +422,27 @@ final class TikaTextExtractor extends TextExtractor { } /** - * Determines if Tika is supported for this content type and mimetype. 
- * - * @param content Source content to read - * @param detectedFormat Mimetype of content + * Determines if Tika is enabled for this content * * @return Flag indicating support for reading content type */ @Override - public boolean isSupported(Content content, String detectedFormat) { - if (detectedFormat == null - || BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used) - || ARCHIVE_MIME_TYPES.contains(detectedFormat) - || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS - || detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS + public boolean isSupported() { + if(!(content instanceof AbstractFile)) { + return false; + } + + String detectedType = ((AbstractFile)content).getMIMEType(); + if (detectedType == null + || BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used) + || ARCHIVE_MIME_TYPES.contains(detectedType) + || (detectedType.startsWith("video/") && !detectedType.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS + || detectedType.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. 
//NON-NLS ) { return false; } - return TIKA_SUPPORTED_TYPES.contains(detectedFormat); + + return TIKA_SUPPORTED_TYPES.contains(detectedType); } /** diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/textreaderconfigs/ImageConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/textextractorconfigs/ImageConfig.java similarity index 85% rename from Core/src/org/sleuthkit/autopsy/textreaders/textreaderconfigs/ImageConfig.java rename to Core/src/org/sleuthkit/autopsy/textextractors/textextractorconfigs/ImageConfig.java index bfb3de7b38..81238387c3 100755 --- a/Core/src/org/sleuthkit/autopsy/textreaders/textreaderconfigs/ImageConfig.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/textextractorconfigs/ImageConfig.java @@ -16,11 +16,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.sleuthkit.autopsy.textreaders.textreaderconfigs; +package org.sleuthkit.autopsy.textextractors.textextractorconfigs; /** - * Allows for configuration of OCR on image files. Readers that use ImageConfig - * can be obtained through {@link org.sleuthkit.autopsy.textreaders.TextReaders} + * Allows for configuration of OCR on image files. Extractors that use ImageConfig + * can be obtained through TextExtractorFactory.getExtractor(). 
* * @see org.openide.util.Lookup */ diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/textreaderconfigs/StringsConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/textextractorconfigs/StringsConfig.java similarity index 88% rename from Core/src/org/sleuthkit/autopsy/textreaders/textreaderconfigs/StringsConfig.java rename to Core/src/org/sleuthkit/autopsy/textextractors/textextractorconfigs/StringsConfig.java index 3400b3a1e0..dbd99ba7b7 100755 --- a/Core/src/org/sleuthkit/autopsy/textreaders/textreaderconfigs/StringsConfig.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/textextractorconfigs/StringsConfig.java @@ -16,20 +16,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.sleuthkit.autopsy.textreaders.textreaderconfigs; +package org.sleuthkit.autopsy.textextractors.textextractorconfigs; import java.util.List; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; /** - * Allows for configuration of the Reader obtained from - * {@link org.sleuthkit.autopsy.textreaders.TextReader#getStringsReader(Content, Lookup)}. + * Allows for configuration of the TextExtractor obtained from + * TextExtractorFactory.getStringsExtractor(). * - * The strings reader will read strings from the Content instance. This class + * The strings extractor will extract strings from the Content instance. This class * allows for the configuration of the encoding and language scripts used during * reading. * - * @see org.sleuthkit.autopsy.textreaders.TextReaders + * @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory * @see * org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT * @see org.openide.util.Lookup @@ -77,8 +77,8 @@ public class StringsConfig { } /** - * Sets the type of language scripts that will be used during this - * reading. See + * Sets the type of language scripts that will be used during this reading. 
+ * See * {@link org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT} * for more information about available scripts. * diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/TextReaders.java b/Core/src/org/sleuthkit/autopsy/textreaders/TextReaders.java deleted file mode 100755 index 8e54bd287a..0000000000 --- a/Core/src/org/sleuthkit/autopsy/textreaders/TextReaders.java +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Autopsy Forensic Browser - * - * Copyright 2018-2018 Basis Technology Corp. - * Contact: carrier sleuthkit org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.sleuthkit.autopsy.textreaders; - -import java.io.Reader; -import java.util.Arrays; -import java.util.List; -import org.openide.util.Lookup; -import org.sleuthkit.autopsy.textreaders.TextExtractor.ExtractionException; -import org.sleuthkit.datamodel.AbstractFile; -import org.sleuthkit.datamodel.BlackboardArtifact; -import org.sleuthkit.datamodel.Content; -import org.sleuthkit.datamodel.Report; - -/** - * Factory for creating Readers given a Content instance - * - * See {@link org.sleuthkit.autopsy.textreaders.textreaderconfigs} for available - * Reader configuration options. - * - * @see org.openide.util.Lookup - */ -public class TextReaders { - - /** - * Returns a reader containing the Content text. Configuration files can be - * added to the Lookup. 
- * - * See {@link org.sleuthkit.autopsy.textreaders.textreaderconfigs} for - * available Reader configuration options. - * - * @param content Content source that will be read from - * @param context Contains extraction configurations for certain file types - * - * @return Reader containing file text - * - * @throws NoTextReaderFound Encountered when there is no Reader found for - * the given content type or there was an error - * while creating the reader. - * - * @see org.openide.util.Lookup - */ - public static Reader getReader(Content content, - Lookup context) throws NoTextReaderFound { - try { - if (content instanceof AbstractFile) { - String mimeType = ((AbstractFile) content).getMIMEType(); - List extractors = Arrays.asList( - new HtmlTextExtractor(content), - new SqliteTextExtractor(content), - new TikaTextExtractor(content)); - for (TextExtractor extractor : extractors) { - extractor.setExtractionSettings(context); - if (extractor.isEnabled() && extractor.isSupported(content, mimeType)) { - return extractor.getReader(); - } - } - } else if (content instanceof BlackboardArtifact) { - TextExtractor artifactExtractor = new ArtifactTextExtractor((BlackboardArtifact) content); - artifactExtractor.setExtractionSettings(context); - return artifactExtractor.getReader(); - } else if (content instanceof Report) { - TextExtractor reportExtractor = new TikaTextExtractor(content); - reportExtractor.setExtractionSettings(context); - return reportExtractor.getReader(); - } - } catch (ExtractionException ex) { - throw new NoTextReaderFound("Error while getting reader", ex); - } - - throw new NoTextReaderFound( - String.format("Could not find a suitable reader for " - + "content with name [%s] and id=[%d]. Try using " - + "the default reader instead.", - content.getName(), content.getId()) - ); - } - - /** - * Returns a reader containing the Content text. 
- * - * @param content Content instance that will be read from - * - * @return Reader containing file text - * - * @throws NoTextReaderFound Encountered when there is no Reader was found - * for the given content type. Use - * getStringsReader(Content,Lookup) method - * instead. - */ - public static Reader getReader(Content content) - throws NoTextReaderFound { - return TextReaders.getReader(content, null); - } - - /** - * Returns a Reader containing the Content strings. This method supports all - * content types. This method should be used as a backup in the event that - * no reader was found using getReader(Content) or getReader(Content, - * Lookup). - * - * Configure this reader with the StringsConfig in - * {@link org.sleuthkit.autopsy.textreaders.textreaderconfigs} - * - * @param content Content source to read from - * @param context Contains extraction configurations for certain file types - * - * @return Reader containing file text - * - * @see org.openide.util.Lookup - */ - public static Reader getStringsReader(Content content, Lookup context) { - StringsTextExtractor stringsInstance = new StringsTextExtractor(content); - stringsInstance.setExtractionSettings(context); - return stringsInstance.getReader(); - } - - /** - * System level exception for handling content types that have no specific - * strategy defined for extracting their text. 
- */ - public static class NoTextReaderFound extends Exception { - - public NoTextReaderFound(String msg) { - super(msg); - } - - public NoTextReaderFound(Throwable ex) { - super(ex); - } - - private NoTextReaderFound(String msg, Throwable ex) { - super(msg, ex); - } - } -} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java index 32974f0ad2..ea3fbec3f3 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java @@ -25,6 +25,7 @@ import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; +import org.openide.util.Exceptions; import org.openide.util.Lookup; import org.openide.util.NbBundle; import org.openide.util.NbBundle.Messages; @@ -44,9 +45,10 @@ import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorEx import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException; import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector; -import org.sleuthkit.autopsy.textreaders.TextReaders; -import org.sleuthkit.autopsy.textreaders.textreaderconfigs.ImageConfig; -import org.sleuthkit.autopsy.textreaders.textreaderconfigs.StringsConfig; +import org.sleuthkit.autopsy.textextractors.TextExtractor; +import org.sleuthkit.autopsy.textextractors.TextExtractorFactory; +import org.sleuthkit.autopsy.textextractors.textextractorconfigs.ImageConfig; +import org.sleuthkit.autopsy.textextractors.textextractorconfigs.StringsConfig; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.TskData; import org.sleuthkit.datamodel.TskData.FileKnown; @@ -480,10 +482,11 @@ public final class KeywordSearchIngestModule implements 
FileIngestModule { Lookup extractionContext = Lookups.fixed(imageConfig); try { - Reader specializedReader = TextReaders.getReader(aFile,extractionContext); + TextExtractor extractor = TextExtractorFactory.getExtractor(aFile,extractionContext); + Reader extractedTextReader = extractor.getReader(); //divide into chunks and index - return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context); - } catch (TextReaders.NoTextReaderFound ex) { + return Ingester.getDefault().indexText(extractedTextReader,aFile.getId(),aFile.getName(), aFile, context); + } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.ExtractionException ex) { //No text extractor found... run the default instead return false; } @@ -502,8 +505,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule { if (context.fileIngestIsCancelled()) { return true; } - Reader stringsReader = TextReaders.getStringsReader(aFile, stringsExtractionContext); - if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) { + TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(aFile, stringsExtractionContext); + Reader extractedTextReader = stringsExtractor.getReader(); + if (Ingester.getDefault().indexText(extractedTextReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) { putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED); return true; } else { @@ -511,7 +515,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule { putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT); return false; } - } catch (IngesterException ex) { + } catch (IngesterException | TextExtractor.ExtractionException ex) { logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS putIngestStatus(jobId, 
aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING); return false; diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java index f670bca0fb..f87b145e28 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java @@ -46,7 +46,8 @@ import org.sleuthkit.autopsy.appservices.AutopsyService; import org.sleuthkit.autopsy.progress.ProgressIndicator; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException; -import org.sleuthkit.autopsy.textreaders.TextReaders; +import org.sleuthkit.autopsy.textextractors.TextExtractor; +import org.sleuthkit.autopsy.textextractors.TextExtractorFactory; import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.TskCoreException; @@ -114,22 +115,26 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService { return; } try { - Reader blackboardReader = TextReaders.getReader(content, null); + TextExtractor blackboardExtractor = TextExtractorFactory.getExtractor(content, null); + Reader blackboardExtractedTextReader = blackboardExtractor.getReader(); String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID(); ingester.indexMetaDataOnly(artifact, sourceName); - ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null); - } catch (Ingester.IngesterException | TextReaders.NoTextReaderFound ex) { + ingester.indexText(blackboardExtractedTextReader, artifact.getArtifactID(), sourceName, content, null); + } catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | TextExtractor.ExtractionException ex) { throw new TskCoreException(ex.getCause().getMessage(), ex); } } else { try { - 
Reader contentReader = TextReaders.getReader(content, null); - ingester.indexText(contentReader, content.getId(), content.getName(), content, null); - } catch (TextReaders.NoTextReaderFound | Ingester.IngesterException ex) { + TextExtractor contentExtractor = TextExtractorFactory.getExtractor(content, null); + Reader contentExtractedTextReader = contentExtractor.getReader(); + ingester.indexText(contentExtractedTextReader, content.getId(), content.getName(), content, null); + } catch (TextExtractorFactory.NoTextExtractorFound | Ingester.IngesterException | TextExtractor.ExtractionException ex) { try { // Try the StringsTextExtractor if Tika extractions fails. - ingester.indexText(TextReaders.getStringsReader(content, null),content.getId(),content.getName(), content, null); - } catch (Ingester.IngesterException ex1) { + TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(content, null); + Reader stringsExtractedTextReader = stringsExtractor.getReader(); + ingester.indexText(stringsExtractedTextReader,content.getId(),content.getName(), content, null); + } catch (Ingester.IngesterException | TextExtractor.ExtractionException ex1) { throw new TskCoreException(ex.getCause().getMessage(), ex1); } } @@ -443,10 +448,11 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService { try { String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID(); - Reader contentSpecificReader = TextReaders.getReader((Content) artifact, null); + TextExtractor blackboardExtractor = TextExtractorFactory.getExtractor((Content) artifact, null); + Reader blackboardExtractedTextReader = blackboardExtractor.getReader(); ingester.indexMetaDataOnly(artifact, sourceName); - ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null); - } catch (Ingester.IngesterException | TextReaders.NoTextReaderFound ex) { + ingester.indexText(blackboardExtractedTextReader, artifact.getId(), sourceName, artifact, null); + } 
catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | TextExtractor.ExtractionException ex) { throw new TskCoreException(ex.getCause().getMessage(), ex); } }