diff --git a/Core/nbproject/project.xml b/Core/nbproject/project.xml
index 3559866d75..89e71fbc51 100644
--- a/Core/nbproject/project.xml
+++ b/Core/nbproject/project.xml
@@ -338,8 +338,8 @@
org.sleuthkit.autopsy.modules.vmextractor
org.sleuthkit.autopsy.progress
org.sleuthkit.autopsy.report
- org.sleuthkit.autopsy.textreaders
- org.sleuthkit.autopsy.textreaders.textreaderconfigs
+ org.sleuthkit.autopsy.textextractors
+ org.sleuthkit.autopsy.textextractors.textextractorconfigs
org.sleuthkit.autopsy.texttranslation
org.sleuthkit.datamodel
diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/ArtifactTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
similarity index 96%
rename from Core/src/org/sleuthkit/autopsy/textreaders/ArtifactTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
index be2fcc98f6..244cf17ab0 100644
--- a/Core/src/org/sleuthkit/autopsy/textreaders/ArtifactTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java
@@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.sleuthkit.autopsy.textreaders;
+package org.sleuthkit.autopsy.textextractors;
import java.io.InputStreamReader;
import java.io.Reader;
@@ -83,7 +83,7 @@ class ArtifactTextExtractor extends TextExtractor {
}
@Override
- public boolean isSupported(Content file, String detectedFormat) {
+ public boolean isSupported() {
return true;
}
}
diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/Bundle.properties b/Core/src/org/sleuthkit/autopsy/textextractors/Bundle.properties
similarity index 100%
rename from Core/src/org/sleuthkit/autopsy/textreaders/Bundle.properties
rename to Core/src/org/sleuthkit/autopsy/textextractors/Bundle.properties
diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/Bundle_ja.properties b/Core/src/org/sleuthkit/autopsy/textextractors/Bundle_ja.properties
similarity index 100%
rename from Core/src/org/sleuthkit/autopsy/textreaders/Bundle_ja.properties
rename to Core/src/org/sleuthkit/autopsy/textextractors/Bundle_ja.properties
diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
similarity index 95%
rename from Core/src/org/sleuthkit/autopsy/textreaders/HtmlTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
index dcf24b038c..d3c34211d8 100644
--- a/Core/src/org/sleuthkit/autopsy/textreaders/HtmlTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
@@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.sleuthkit.autopsy.textreaders;
+package org.sleuthkit.autopsy.textextractors;
import java.io.IOException;
import java.io.Reader;
@@ -32,6 +32,7 @@ import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ReadContentInputStream;
@@ -42,7 +43,7 @@ final class HtmlTextExtractor extends TextExtractor {
static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
private final int MAX_SIZE;
- private final Content file;
+ private final AbstractFile file;
static final List WEB_MIME_TYPES = Arrays.asList(
"application/javascript", //NON-NLS
@@ -62,7 +63,7 @@ final class HtmlTextExtractor extends TextExtractor {
* Creates a default instance of the HtmlTextExtractor. Supported file size
* is 50MB.
*/
- public HtmlTextExtractor(Content file) {
+ public HtmlTextExtractor(AbstractFile file) {
//Set default to be 50 MB.
MAX_SIZE = 50_000_000;
this.file = file;
@@ -77,10 +78,10 @@ final class HtmlTextExtractor extends TextExtractor {
* @return flag indicating support
*/
@Override
- public boolean isSupported(Content content, String detectedFormat) {
- return detectedFormat != null
- && WEB_MIME_TYPES.contains(detectedFormat)
- && content.getSize() <= MAX_SIZE;
+ public boolean isSupported() {
+ return file.getMIMEType() != null
+ && WEB_MIME_TYPES.contains(file.getMIMEType())
+ && file.getSize() <= MAX_SIZE;
}
/**
diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
similarity index 97%
rename from Core/src/org/sleuthkit/autopsy/textreaders/SqliteTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
index 3b56a7c128..0c2cd28e76 100755
--- a/Core/src/org/sleuthkit/autopsy/textreaders/SqliteTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java
@@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.sleuthkit.autopsy.textreaders;
+package org.sleuthkit.autopsy.textextractors;
import java.io.IOException;
import java.io.Reader;
@@ -28,7 +28,6 @@ import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
/**
* Extracts text from SQLite database files.
@@ -45,8 +44,8 @@ final class SqliteTextExtractor extends TextExtractor {
private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
private final AbstractFile file;
- public SqliteTextExtractor(Content file) {
- this.file = (AbstractFile) file;
+ public SqliteTextExtractor(AbstractFile file) {
+ this.file = file;
}
/**
* Supports only the sqlite mimetypes
@@ -57,8 +56,8 @@ final class SqliteTextExtractor extends TextExtractor {
* @return true if x-sqlite3
*/
@Override
- public boolean isSupported(Content file, String detectedFormat) {
- return SQLITE_MIMETYPE.equals(detectedFormat);
+ public boolean isSupported() {
+ return SQLITE_MIMETYPE.equals(file.getMIMEType());
}
/**
diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
similarity index 98%
rename from Core/src/org/sleuthkit/autopsy/textreaders/StringsTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
index 60ed556a0f..d271a83b9f 100644
--- a/Core/src/org/sleuthkit/autopsy/textreaders/StringsTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java
@@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.sleuthkit.autopsy.textreaders;
+package org.sleuthkit.autopsy.textextractors;
import java.io.IOException;
import java.io.InputStream;
@@ -28,7 +28,7 @@ import java.util.Objects;
import org.openide.util.Lookup;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
-import org.sleuthkit.autopsy.textreaders.textreaderconfigs.StringsConfig;
+import org.sleuthkit.autopsy.textextractors.textextractorconfigs.StringsConfig;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskException;
@@ -36,7 +36,7 @@ import org.sleuthkit.datamodel.TskException;
/**
* Extracts raw strings from content.
*/
-final class StringsTextExtractor {
+final class StringsTextExtractor extends TextExtractor {
private boolean extractUTF8;
private boolean extractUTF16;
@@ -81,6 +81,7 @@ final class StringsTextExtractor {
* @throws
* org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
*/
+ @Override
public InputStreamReader getReader() {
InputStream stringStream = getInputStream(content);
return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
@@ -104,6 +105,7 @@ final class StringsTextExtractor {
*
* @param context Lookup instance containing config classes
*/
+ @Override
public void setExtractionSettings(Lookup context) {
if (context != null) {
StringsConfig configInstance = context.lookup(StringsConfig.class);
@@ -126,14 +128,11 @@ final class StringsTextExtractor {
*
* @return
*/
- public boolean isEnabled() {
+ @Override
+ public boolean isSupported() {
return extractUTF8 || extractUTF16;
}
-
- boolean isSupported(Content file, String detectedFormat) {
- throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
- }
-
+
/**
* Content input string stream reader/converter - given Content, extract
* strings from it and return encoded bytes via read()
diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/TextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
similarity index 75%
rename from Core/src/org/sleuthkit/autopsy/textreaders/TextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
index ac4c740a34..e081926afd 100644
--- a/Core/src/org/sleuthkit/autopsy/textreaders/TextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java
@@ -16,38 +16,26 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.sleuthkit.autopsy.textreaders;
+package org.sleuthkit.autopsy.textextractors;
import java.io.Reader;
import org.openide.util.Lookup;
-import org.sleuthkit.datamodel.Content;
/**
* Extracts the text out of Content instances and exposes them as a Reader.
* Concrete implementations can be obtained from
* {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory}
*/
-abstract class TextExtractor {
+public abstract class TextExtractor {
/**
- * Determines if the file content is supported by the extractor.
+ * Determines if this extractor supports the given Content and
+ * configurations passed into it in
+ * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory}.
*
- * @param file to test if its content should be supported
- * @param detectedFormat mime-type with detected format (such as text/plain)
- * or null if not detected
- *
- * @return true if the file content is supported, false otherwise
+ * @return true if content is supported, false otherwise
*/
- abstract boolean isSupported(Content file, String detectedFormat);
-
- /**
- * Determines if the TextExtractor instance is enabled to read content.
- *
- * @return
- */
- boolean isEnabled() {
- return true;
- }
+ abstract boolean isSupported();
/**
* Get a Reader that will iterate over the text extracted from the Content
@@ -75,8 +63,7 @@ abstract class TextExtractor {
}
/**
- * Exception encountered during
- * {@link org.sleuthkit.autopsy.textextractors.TextExtractor#getReader()}.
+ * Exception encountered during TextExtractor.getReader().
* This indicates that there was an internal parsing error that occurred
* during the reading of Content text.
*/
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
new file mode 100755
index 0000000000..22d4aa5040
--- /dev/null
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
@@ -0,0 +1,160 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2018-2018 Basis Technology Corp.
+ * Contact: carrier sleuthkit org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.textextractors;
+
+import java.util.Arrays;
+import java.util.List;
+import org.openide.util.Lookup;
+import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.BlackboardArtifact;
+import org.sleuthkit.datamodel.Content;
+import org.sleuthkit.datamodel.Report;
+
+/**
+ * Factory for creating TextExtractors given a Content instance
+ *
+ * See {@link org.sleuthkit.autopsy.textextractors.textextractorconfigs} for
+ * available extractor configuration options.
+ *
+ * @see org.openide.util.Lookup
+ */
+public class TextExtractorFactory {
+
+    /**
+     * Selects a TextExtractor for the given Content: the first supporting file
+     * extractor for files, or the dedicated extractor for artifacts/reports.
+     *
+     * See {@link org.sleuthkit.autopsy.textextractors.textextractorconfigs} for
+     * available extractor configuration options.
+     *
+     * @param content Content source that will be read from
+     * @param context Contains extraction configurations for certain file types
+     *
+     * @return TextExtractor that can produce a Reader over the content text
+     *
+     * @throws NoTextExtractorFound Encountered when the content is of an
+     *                              unhandled type or no file extractor
+     *                              supports it.
+     *
+     * @see org.openide.util.Lookup
+     */
+    public static TextExtractor getExtractor(Content content, Lookup context) throws NoTextExtractorFound {
+        TextExtractor singleCandidate = null;
+        if (content instanceof AbstractFile) {
+            for (TextExtractor candidate : getFileExtractors((AbstractFile) content, context)) {
+                if (candidate.isSupported()) {
+                    return candidate;
+                }
+            }
+        } else if (content instanceof BlackboardArtifact) {
+            singleCandidate = new ArtifactTextExtractor((BlackboardArtifact) content);
+        } else if (content instanceof Report) {
+            singleCandidate = new TikaTextExtractor(content);
+        }
+
+        if (singleCandidate != null) {
+            singleCandidate.setExtractionSettings(context);
+            return singleCandidate;
+        }
+
+        final String message = String.format("Could not find a suitable reader for "
+                + "content with name [%s] and id=[%d]. Try using "
+                + "the strings extractor instead.", content.getName(), content.getId());
+        throw new NoTextExtractorFound(message);
+    }
+
+    /**
+     * Creates, configures, and returns the extractors that can read
+     * AbstractFile instances, in the order they should be tried.
+     *
+     * @param content AbstractFile content
+     * @param context Lookup containing extractor configurations
+     *
+     * @return File extractors in priority order (HTML, SQLite, then Tika)
+     */
+    private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
+        List<TextExtractor> fileExtractors = Arrays.asList(
+                new HtmlTextExtractor(content),
+                new SqliteTextExtractor(content),
+                new TikaTextExtractor(content));
+
+        fileExtractors.forEach((fileExtractor) -> {
+            fileExtractor.setExtractionSettings(context);
+        });
+
+        return fileExtractors;
+    }
+
+    /**
+     * Selects a TextExtractor for the given Content, passing no configuration.
+     *
+     * @param content Content instance that will be read from
+     *
+     * @return TextExtractor for the content text
+     *
+     * @throws NoTextExtractorFound Encountered when no extractor was found for
+     *                              the given content type. Fall back to
+     *                              getStringsExtractor(Content,Lookup) instead.
+     */
+    public static TextExtractor getExtractor(Content content) throws NoTextExtractorFound {
+        final Lookup noConfiguration = null;
+        return getExtractor(content, noConfiguration);
+    }
+
+    /**
+     * Creates a strings-based TextExtractor for the given Content and applies
+     * the supplied configuration to it. Unlike getExtractor(Content) and
+     * getExtractor(Content, Lookup), no content type check is performed here,
+     * so this is the fallback when those methods find no extractor.
+     *
+     * Configure this extractor with the StringsConfig in
+     * {@link org.sleuthkit.autopsy.textextractors.textextractorconfigs}
+     *
+     * @param content Content source to read from
+     * @param context Contains extraction configurations, e.g. a StringsConfig
+     *
+     * @return TextExtractor that extracts raw strings from the content
+     *
+     * @see org.openide.util.Lookup
+     */
+    public static TextExtractor getStringsExtractor(Content content, Lookup context) {
+        final TextExtractor stringsExtractor = new StringsTextExtractor(content);
+        stringsExtractor.setExtractionSettings(context);
+        return stringsExtractor;
+    }
+
+    /**
+     * Checked exception signalling that no extraction strategy is defined for
+     * the type of the supplied content.
+     */
+    public static class NoTextExtractorFound extends Exception {
+
+        public NoTextExtractorFound(String message) {
+            super(message);
+        }
+
+        public NoTextExtractorFound(Throwable cause) {
+            super(cause);
+        }
+
+        private NoTextExtractorFound(String message, Throwable cause) {
+            super(message, cause);
+        }
+    }
+}
diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
similarity index 95%
rename from Core/src/org/sleuthkit/autopsy/textreaders/TikaTextExtractor.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index 985f897787..6582e05d3e 100644
--- a/Core/src/org/sleuthkit/autopsy/textreaders/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.sleuthkit.autopsy.textreaders;
+package org.sleuthkit.autopsy.textextractors;
import com.google.common.collect.ImmutableList;
import com.google.common.io.CharSource;
@@ -61,7 +61,7 @@ import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.ExecUtil;
import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
import org.sleuthkit.autopsy.coreutils.PlatformUtil;
-import org.sleuthkit.autopsy.textreaders.textreaderconfigs.ImageConfig;
+import org.sleuthkit.autopsy.textextractors.textextractorconfigs.ImageConfig;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
@@ -318,7 +318,7 @@ final class TikaTextExtractor extends TextExtractor {
}
}
}
-
+
/**
* Wraps the creation of a TikaReader into a Future so that it can be
* cancelled.
@@ -422,24 +422,27 @@ final class TikaTextExtractor extends TextExtractor {
}
/**
- * Determines if Tika is supported for this content type and mimetype.
- *
- * @param content Source content to read
- * @param detectedFormat Mimetype of content
+ * Determines if Tika is enabled for this content
*
* @return Flag indicating support for reading content type
*/
@Override
- public boolean isSupported(Content content, String detectedFormat) {
- if (detectedFormat == null
- || BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
- || ARCHIVE_MIME_TYPES.contains(detectedFormat)
- || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
- || detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
+ public boolean isSupported() {
+ if(!(content instanceof AbstractFile)) {
+ return false;
+ }
+
+ String detectedType = ((AbstractFile)content).getMIMEType();
+ if (detectedType == null
+ || BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
+ || ARCHIVE_MIME_TYPES.contains(detectedType)
+ || (detectedType.startsWith("video/") && !detectedType.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
+ || detectedType.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
) {
return false;
}
- return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
+
+ return TIKA_SUPPORTED_TYPES.contains(detectedType);
}
/**
diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/textreaderconfigs/ImageConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/textextractorconfigs/ImageConfig.java
similarity index 85%
rename from Core/src/org/sleuthkit/autopsy/textreaders/textreaderconfigs/ImageConfig.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/textextractorconfigs/ImageConfig.java
index bfb3de7b38..81238387c3 100755
--- a/Core/src/org/sleuthkit/autopsy/textreaders/textreaderconfigs/ImageConfig.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/textextractorconfigs/ImageConfig.java
@@ -16,11 +16,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.sleuthkit.autopsy.textreaders.textreaderconfigs;
+package org.sleuthkit.autopsy.textextractors.textextractorconfigs;
/**
- * Allows for configuration of OCR on image files. Readers that use ImageConfig
- * can be obtained through {@link org.sleuthkit.autopsy.textreaders.TextReaders}
+ * Allows for configuration of OCR on image files. Extractors that use ImageConfig
+ * can be obtained through TextExtractorFactory.getExtractor().
*
* @see org.openide.util.Lookup
*/
diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/textreaderconfigs/StringsConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/textextractorconfigs/StringsConfig.java
similarity index 88%
rename from Core/src/org/sleuthkit/autopsy/textreaders/textreaderconfigs/StringsConfig.java
rename to Core/src/org/sleuthkit/autopsy/textextractors/textextractorconfigs/StringsConfig.java
index 3400b3a1e0..dbd99ba7b7 100755
--- a/Core/src/org/sleuthkit/autopsy/textreaders/textreaderconfigs/StringsConfig.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/textextractorconfigs/StringsConfig.java
@@ -16,20 +16,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.sleuthkit.autopsy.textreaders.textreaderconfigs;
+package org.sleuthkit.autopsy.textextractors.textextractorconfigs;
import java.util.List;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
/**
- * Allows for configuration of the Reader obtained from
- * {@link org.sleuthkit.autopsy.textreaders.TextReader#getStringsReader(Content, Lookup)}.
+ * Allows for configuration of the TextExtractor obtained from
+ * TextExtractorFactory.getStringsExtractor().
*
- * The strings reader will read strings from the Content instance. This class
+ * The strings extractor will extract strings from the Content instance. This class
* allows for the configuration of the encoding and language scripts used during
* reading.
*
- * @see org.sleuthkit.autopsy.textreaders.TextReaders
+ * @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
* @see
* org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT
* @see org.openide.util.Lookup
@@ -77,8 +77,8 @@ public class StringsConfig {
}
/**
- * Sets the type of language scripts that will be used during this
- * reading. See
+ * Sets the type of language scripts that will be used during this reading.
+ * See
* {@link org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT}
* for more information about available scripts.
*
diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/TextReaders.java b/Core/src/org/sleuthkit/autopsy/textreaders/TextReaders.java
deleted file mode 100755
index 8e54bd287a..0000000000
--- a/Core/src/org/sleuthkit/autopsy/textreaders/TextReaders.java
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2018-2018 Basis Technology Corp.
- * Contact: carrier sleuthkit org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.sleuthkit.autopsy.textreaders;
-
-import java.io.Reader;
-import java.util.Arrays;
-import java.util.List;
-import org.openide.util.Lookup;
-import org.sleuthkit.autopsy.textreaders.TextExtractor.ExtractionException;
-import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.BlackboardArtifact;
-import org.sleuthkit.datamodel.Content;
-import org.sleuthkit.datamodel.Report;
-
-/**
- * Factory for creating Readers given a Content instance
- *
- * See {@link org.sleuthkit.autopsy.textreaders.textreaderconfigs} for available
- * Reader configuration options.
- *
- * @see org.openide.util.Lookup
- */
-public class TextReaders {
-
- /**
- * Returns a reader containing the Content text. Configuration files can be
- * added to the Lookup.
- *
- * See {@link org.sleuthkit.autopsy.textreaders.textreaderconfigs} for
- * available Reader configuration options.
- *
- * @param content Content source that will be read from
- * @param context Contains extraction configurations for certain file types
- *
- * @return Reader containing file text
- *
- * @throws NoTextReaderFound Encountered when there is no Reader found for
- * the given content type or there was an error
- * while creating the reader.
- *
- * @see org.openide.util.Lookup
- */
- public static Reader getReader(Content content,
- Lookup context) throws NoTextReaderFound {
- try {
- if (content instanceof AbstractFile) {
- String mimeType = ((AbstractFile) content).getMIMEType();
- List extractors = Arrays.asList(
- new HtmlTextExtractor(content),
- new SqliteTextExtractor(content),
- new TikaTextExtractor(content));
- for (TextExtractor extractor : extractors) {
- extractor.setExtractionSettings(context);
- if (extractor.isEnabled() && extractor.isSupported(content, mimeType)) {
- return extractor.getReader();
- }
- }
- } else if (content instanceof BlackboardArtifact) {
- TextExtractor artifactExtractor = new ArtifactTextExtractor((BlackboardArtifact) content);
- artifactExtractor.setExtractionSettings(context);
- return artifactExtractor.getReader();
- } else if (content instanceof Report) {
- TextExtractor reportExtractor = new TikaTextExtractor(content);
- reportExtractor.setExtractionSettings(context);
- return reportExtractor.getReader();
- }
- } catch (ExtractionException ex) {
- throw new NoTextReaderFound("Error while getting reader", ex);
- }
-
- throw new NoTextReaderFound(
- String.format("Could not find a suitable reader for "
- + "content with name [%s] and id=[%d]. Try using "
- + "the default reader instead.",
- content.getName(), content.getId())
- );
- }
-
- /**
- * Returns a reader containing the Content text.
- *
- * @param content Content instance that will be read from
- *
- * @return Reader containing file text
- *
- * @throws NoTextReaderFound Encountered when there is no Reader was found
- * for the given content type. Use
- * getStringsReader(Content,Lookup) method
- * instead.
- */
- public static Reader getReader(Content content)
- throws NoTextReaderFound {
- return TextReaders.getReader(content, null);
- }
-
- /**
- * Returns a Reader containing the Content strings. This method supports all
- * content types. This method should be used as a backup in the event that
- * no reader was found using getReader(Content) or getReader(Content,
- * Lookup).
- *
- * Configure this reader with the StringsConfig in
- * {@link org.sleuthkit.autopsy.textreaders.textreaderconfigs}
- *
- * @param content Content source to read from
- * @param context Contains extraction configurations for certain file types
- *
- * @return Reader containing file text
- *
- * @see org.openide.util.Lookup
- */
- public static Reader getStringsReader(Content content, Lookup context) {
- StringsTextExtractor stringsInstance = new StringsTextExtractor(content);
- stringsInstance.setExtractionSettings(context);
- return stringsInstance.getReader();
- }
-
- /**
- * System level exception for handling content types that have no specific
- * strategy defined for extracting their text.
- */
- public static class NoTextReaderFound extends Exception {
-
- public NoTextReaderFound(String msg) {
- super(msg);
- }
-
- public NoTextReaderFound(Throwable ex) {
- super(ex);
- }
-
- private NoTextReaderFound(String msg, Throwable ex) {
- super(msg, ex);
- }
- }
-}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index 32974f0ad2..ea3fbec3f3 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -25,6 +25,7 @@ import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
+import org.openide.util.Exceptions;
import org.openide.util.Lookup;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
@@ -44,9 +45,10 @@ import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorEx
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
-import org.sleuthkit.autopsy.textreaders.TextReaders;
-import org.sleuthkit.autopsy.textreaders.textreaderconfigs.ImageConfig;
-import org.sleuthkit.autopsy.textreaders.textreaderconfigs.StringsConfig;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
+import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
+import org.sleuthkit.autopsy.textextractors.textextractorconfigs.ImageConfig;
+import org.sleuthkit.autopsy.textextractors.textextractorconfigs.StringsConfig;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown;
@@ -480,10 +482,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
Lookup extractionContext = Lookups.fixed(imageConfig);
try {
- Reader specializedReader = TextReaders.getReader(aFile,extractionContext);
+ TextExtractor extractor = TextExtractorFactory.getExtractor(aFile,extractionContext);
+ Reader extractedTextReader = extractor.getReader();
//divide into chunks and index
- return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
- } catch (TextReaders.NoTextReaderFound ex) {
+ return Ingester.getDefault().indexText(extractedTextReader,aFile.getId(),aFile.getName(), aFile, context);
+ } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.ExtractionException ex) {
//No text extractor found... run the default instead
return false;
}
@@ -502,8 +505,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if (context.fileIngestIsCancelled()) {
return true;
}
- Reader stringsReader = TextReaders.getStringsReader(aFile, stringsExtractionContext);
- if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
+ TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(aFile, stringsExtractionContext);
+ Reader extractedTextReader = stringsExtractor.getReader();
+ if (Ingester.getDefault().indexText(extractedTextReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
return true;
} else {
@@ -511,7 +515,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
return false;
}
- } catch (IngesterException ex) {
+ } catch (IngesterException | TextExtractor.ExtractionException ex) {
logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
return false;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
index f670bca0fb..f87b145e28 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
@@ -46,7 +46,8 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
import org.sleuthkit.autopsy.progress.ProgressIndicator;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
-import org.sleuthkit.autopsy.textreaders.TextReaders;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
+import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
@@ -114,22 +115,26 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
return;
}
try {
- Reader blackboardReader = TextReaders.getReader(content, null);
+ TextExtractor blackboardExtractor = TextExtractorFactory.getExtractor(content, null);
+ Reader blackboardExtractedTextReader = blackboardExtractor.getReader();
String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
ingester.indexMetaDataOnly(artifact, sourceName);
- ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
- } catch (Ingester.IngesterException | TextReaders.NoTextReaderFound ex) {
- throw new TskCoreException(ex.getCause().getMessage(), ex);
+ ingester.indexText(blackboardExtractedTextReader, artifact.getArtifactID(), sourceName, content, null);
+ } catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | TextExtractor.ExtractionException ex) {
+ throw new TskCoreException(ex.getCause() != null ? ex.getCause().getMessage() : ex.getMessage(), ex);
}
} else {
try {
- Reader contentReader = TextReaders.getReader(content, null);
- ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
- } catch (TextReaders.NoTextReaderFound | Ingester.IngesterException ex) {
+ TextExtractor contentExtractor = TextExtractorFactory.getExtractor(content, null);
+ Reader contentExtractedTextReader = contentExtractor.getReader();
+ ingester.indexText(contentExtractedTextReader, content.getId(), content.getName(), content, null);
+ } catch (TextExtractorFactory.NoTextExtractorFound | Ingester.IngesterException | TextExtractor.ExtractionException ex) {
try {
// Try the StringsTextExtractor if Tika extractions fails.
- ingester.indexText(TextReaders.getStringsReader(content, null),content.getId(),content.getName(), content, null);
- } catch (Ingester.IngesterException ex1) {
- throw new TskCoreException(ex.getCause().getMessage(), ex1);
+ TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(content, null);
+ Reader stringsExtractedTextReader = stringsExtractor.getReader();
+ ingester.indexText(stringsExtractedTextReader,content.getId(),content.getName(), content, null);
+ } catch (Ingester.IngesterException | TextExtractor.ExtractionException ex1) {
+ throw new TskCoreException(ex.getCause() != null ? ex.getCause().getMessage() : ex.getMessage(), ex1);
}
}
@@ -443,10 +448,11 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
try {
String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
- Reader contentSpecificReader = TextReaders.getReader((Content) artifact, null);
+ TextExtractor blackboardExtractor = TextExtractorFactory.getExtractor((Content) artifact, null);
+ Reader blackboardExtractedTextReader = blackboardExtractor.getReader();
ingester.indexMetaDataOnly(artifact, sourceName);
- ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
- } catch (Ingester.IngesterException | TextReaders.NoTextReaderFound ex) {
+ ingester.indexText(blackboardExtractedTextReader, artifact.getId(), sourceName, artifact, null);
+ } catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | TextExtractor.ExtractionException ex) {
throw new TskCoreException(ex.getCause().getMessage(), ex);
}
}