diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java index 00101b01ec..0cca74aef7 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/ArtifactTextExtractor.java @@ -22,7 +22,6 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.StandardCharsets; -import java.util.logging.Level; import org.apache.commons.io.IOUtils; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.datamodel.ContentUtils; @@ -35,39 +34,27 @@ import org.sleuthkit.datamodel.TskCoreException; * Extracts text from artifacts by concatenating the values of all of the * artifact's attributes. */ -class ArtifactTextExtractor extends ContentTextExtractor { +class ArtifactTextExtractor implements TextExtractor { static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName()); - @Override - public boolean isDisabled() { - return false; - } - - @Override - public void logWarning(final String msg, Exception ex) { - logger.log(Level.WARNING, msg, ex); //NON-NLS } - } - - private InputStream getInputStream(Content artifact) throws TextExtractorException { - BlackboardArtifact art = (BlackboardArtifact)artifact; - + private InputStream getInputStream(BlackboardArtifact artifact) throws InitReaderException { // Concatenate the string values of all attributes into a single // "content" string to be indexed. 
StringBuilder artifactContents = new StringBuilder(); Content dataSource = null; try { - dataSource = art.getDataSource(); + dataSource = artifact.getDataSource(); } catch (TskCoreException tskCoreException) { - throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException); + throw new InitReaderException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException); } if (dataSource == null) { - throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString()); + throw new InitReaderException("Datasource was null for artifact: " + artifact.toString()); } try { - for (BlackboardAttribute attribute : art.getAttributes()) { + for (BlackboardAttribute attribute : artifact.getAttributes()) { artifactContents.append(attribute.getAttributeType().getDisplayName()); artifactContents.append(" : "); // We have also discussed modifying BlackboardAttribute.getDisplayString() @@ -85,40 +72,31 @@ class ArtifactTextExtractor extends ContentTextExtractor { artifactContents.append(System.lineSeparator()); } } catch (TskCoreException tskCoreException) { - throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException); + throw new InitReaderException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException); } return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8); } @Override - public Reader getReader(Content source) throws TextExtractorException { + public Reader getReader(BlackboardArtifact source) throws InitReaderException { return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8); } - @Override - public long getID(Content source) { - BlackboardArtifact art = (BlackboardArtifact)source; - return art.getArtifactID(); - } - - @Override - public String getName(Content source) { - BlackboardArtifact art = (BlackboardArtifact)source; - return art.getDisplayName() + "_" + 
art.getArtifactID(); - } - + /** + * Configures this extractor to the settings stored in relevant config instances. + * + * This operation is a no-op since currently there are no configurable settings + * of the extraction process. + * + * @param context Instance containing file config settings + */ @Override public void setExtractionSettings(ExtractionContext context) { } @Override - public boolean isContentTypeSpecific() { - return true; - } - - @Override - public boolean isSupported(Content file, String detectedFormat) { + public boolean isSupported(BlackboardArtifact file, String detectedFormat) { return true; } } diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java deleted file mode 100644 index 52713facc8..0000000000 --- a/Core/src/org/sleuthkit/autopsy/textextractors/ContentTextExtractor.java +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Autopsy Forensic Browser - * - * Copyright 2011-2018 Basis Technology Corp. - * Contact: carrier sleuthkit org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.sleuthkit.autopsy.textextractors; - -import com.google.common.collect.ImmutableList; -import java.io.Reader; -import java.util.List; -import org.sleuthkit.datamodel.Content; - -/** - * Common methods for utilities that extract text and content and divide into - * chunks - * @param - */ -public abstract class ContentTextExtractor implements TextExtractor { - - //Mimetype groups to aassist extractor implementations in ignoring binary and - //archive files. - public static final List BINARY_MIME_TYPES - = ImmutableList.of( - //ignore binary blob data, for which string extraction will be used - "application/octet-stream", //NON-NLS - "application/x-msdownload"); //NON-NLS - - /** generally text extractors should ignore archives and let unpacking - * modules take care of them */ - public static final List ARCHIVE_MIME_TYPES - = ImmutableList.of( - //ignore unstructured binary and compressed data, for which string extraction or unzipper works better - "application/x-7z-compressed", //NON-NLS - "application/x-ace-compressed", //NON-NLS - "application/x-alz-compressed", //NON-NLS - "application/x-arj", //NON-NLS - "application/vnd.ms-cab-compressed", //NON-NLS - "application/x-cfs-compressed", //NON-NLS - "application/x-dgc-compressed", //NON-NLS - "application/x-apple-diskimage", //NON-NLS - "application/x-gca-compressed", //NON-NLS - "application/x-dar", //NON-NLS - "application/x-lzx", //NON-NLS - "application/x-lzh", //NON-NLS - "application/x-rar-compressed", //NON-NLS - "application/x-stuffit", //NON-NLS - "application/x-stuffitx", //NON-NLS - "application/x-gtar", //NON-NLS - "application/x-archive", //NON-NLS - "application/x-executable", //NON-NLS - "application/x-gzip", //NON-NLS - "application/zip", //NON-NLS - "application/x-zoo", //NON-NLS - "application/x-cpio", //NON-NLS - "application/x-shar", //NON-NLS - "application/x-tar", //NON-NLS - "application/x-bzip", //NON-NLS - "application/x-bzip2", //NON-NLS - "application/x-lzip", //NON-NLS - 
"application/x-lzma", //NON-NLS - "application/x-lzop", //NON-NLS - "application/x-z", //NON-NLS - "application/x-compress"); //NON-NLS - - /** - * Determines if the extractor works only for specified types is - * supportedTypes() or whether is a generic content extractor (such as - * string extractor) - * - * @return - */ - public abstract boolean isContentTypeSpecific(); - - /** - * Determines if the file content is supported by the extractor if - * isContentTypeSpecific() returns true. - * - * @param file to test if its content should be supported - * @param detectedFormat mime-type with detected format (such as text/plain) - * or null if not detected - * - * @return true if the file content is supported, false otherwise - */ - public abstract boolean isSupported(T file, String detectedFormat); - - /** - * Returns a reader that will iterate over the text of the source content. - * - * @param source Content source to read - * @return A reader that contains all source text - * @throws TextExtractorException Error encountered during extraction - */ - @Override - public abstract Reader getReader(T source) throws TextExtractorException; - - /** - * Get the object id of the content source. - * - * @param source source content - * @return object id associated with this source content - */ - @Override - public long getID(T source) { - return source.getId(); - } - - /** - * Returns the human-readable name of the given content source. 
- * - * @param source source content - * @return name of source content - */ - @Override - public String getName(T source) { - return source.getName(); - } -} diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java index 3fb1ba2d1d..92db14a837 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java @@ -23,7 +23,6 @@ import java.io.Reader; import java.io.StringReader; import java.util.Arrays; import java.util.List; -import java.util.logging.Level; import net.htmlparser.jericho.Attributes; import net.htmlparser.jericho.Config; import net.htmlparser.jericho.LoggerProvider; @@ -33,13 +32,12 @@ import net.htmlparser.jericho.StartTag; import net.htmlparser.jericho.StartTagType; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.datamodel.AbstractFile; -import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.ReadContentInputStream; /** * Extracts text from HTML content. */ -final class HtmlTextExtractor extends ContentTextExtractor { +final class HtmlTextExtractor implements TextExtractor { static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName()); private final int MAX_SIZE; @@ -67,19 +65,6 @@ final class HtmlTextExtractor extends ContentTextExtractor MAX_SIZE = 50_000_000; } - /** - * Determines if this extractor is responsible for extracting only a - * specific type of media. - * - * In this case, only HTML documents can be read successfully. - * - * @return true - */ - @Override - public boolean isContentTypeSpecific() { - return true; - } - /** * Determines if this content type is supported by this extractor. 
* @@ -89,7 +74,7 @@ final class HtmlTextExtractor extends ContentTextExtractor * @return flag indicating support */ @Override - public boolean isSupported(Content content, String detectedFormat) { + public boolean isSupported(AbstractFile content, String detectedFormat) { return detectedFormat != null && WEB_MIME_TYPES.contains(detectedFormat) && content.getSize() <= MAX_SIZE; @@ -105,7 +90,7 @@ final class HtmlTextExtractor extends ContentTextExtractor * @throws TextExtractorException */ @Override - public Reader getReader(Content content) throws TextExtractorException { + public Reader getReader(AbstractFile content) throws InitReaderException { //TODO JIRA-4467, there is only harm in excluding HTML documents greater //than 50MB due to our troubled approach of extraction. ReadContentInputStream stream = new ReadContentInputStream(content); @@ -201,25 +186,10 @@ final class HtmlTextExtractor extends ContentTextExtractor // All done, now make it a reader return new StringReader(stringBuilder.toString()); } catch (IOException ex) { - throw new TextExtractorException("Error extracting HTML from content.", ex); + throw new InitReaderException("Error extracting HTML from content.", ex); } } - /** - * Indicates if this extractor can run. - * - * @return Flag indicating if this extractor can run. - */ - @Override - public boolean isDisabled() { - return false; - } - - @Override - public void logWarning(final String msg, Exception ex) { - logger.log(Level.WARNING, msg, ex); //NON-NLS } - } - /** * Determines how the extraction process will proceed given the settings * stored in this context instance. 
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java index a8f2ccaec0..09e8aa82eb 100755 --- a/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/SqliteTextExtractor.java @@ -28,7 +28,6 @@ import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.SQLiteTableReader; import org.sleuthkit.datamodel.AbstractFile; -import org.sleuthkit.datamodel.Content; /** * Extracts text from SQLite database files. @@ -39,48 +38,10 @@ import org.sleuthkit.datamodel.Content; * 2) Tables that contain spaces in their name are not extracted * 3) Table names are not included in its output text */ -final class SqliteTextExtractor extends ContentTextExtractor { +final class SqliteTextExtractor implements TextExtractor { private static final String SQLITE_MIMETYPE = "application/x-sqlite3"; private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName()); - private static boolean isDisabled; - - static { - try { - Class.forName("org.sqlite.JDBC"); - isDisabled = false; - } catch (ClassNotFoundException ex) { - logger.log(Level.SEVERE, "Sqlite JDBC class could not be found, " - + "SqliteTextExtractor is automatically disabling.", ex); //NON-NLS - isDisabled = true; - } - } - - /** - * This extractor only works for sqlite files, so it is indeed content type - * specific. - * - * @return true - */ - @Override - public boolean isContentTypeSpecific() { - return true; - } - - /** - * Determines if this extractor is fit to run. - * - * @return Flag indicating if it should or shouldn't be run. 
- */ - @Override - public boolean isDisabled() { - return isDisabled; - } - - @Override - public void logWarning(String msg, Exception exception) { - logger.log(Level.WARNING, msg, exception); //NON-NLS - } /** * Supports only the sqlite mimetypes @@ -91,7 +52,7 @@ final class SqliteTextExtractor extends ContentTextExtractor< * @return true if x-sqlite3 */ @Override - public boolean isSupported(Content file, String detectedFormat) { + public boolean isSupported(AbstractFile file, String detectedFormat) { return SQLITE_MIMETYPE.equals(detectedFormat); } @@ -105,12 +66,8 @@ final class SqliteTextExtractor extends ContentTextExtractor< * @throws TextExtractorException */ @Override - public Reader getReader(Content source) throws TextExtractorException { - if(source instanceof AbstractFile) { - return new SQLiteStreamReader((AbstractFile)source); - } - throw new TextExtractorException(String.format("Source content with name [%s] and id=[%d] was not of type" - + " AbstractFile.", source.getName(), source.getId())); + public Reader getReader(AbstractFile source) throws InitReaderException { + return new SQLiteStreamReader(source); } /** @@ -125,7 +82,7 @@ final class SqliteTextExtractor extends ContentTextExtractor< @Override public void setExtractionSettings(ExtractionContext context) { } - + /** * Produces a continuous stream of characters from a database file. 
To * achieve this, all table names are queues up and a SQLiteTableReader is diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java index ba7b913178..8fc5b3bdeb 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/StringsTextExtractor.java @@ -25,7 +25,6 @@ import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; import java.util.Objects; -import java.util.logging.Level; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; @@ -37,41 +36,12 @@ import org.sleuthkit.datamodel.TskException; /** * Extracts raw strings from content. */ -final class StringsTextExtractor extends ContentTextExtractor { +final class StringsTextExtractor { - static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName()); private boolean extractUTF8; private boolean extractUTF16; private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8"; - /** - * Determines if this extractor may only read particular types of content. - * - * Since Strings may be run on any content type, it is not content specific. - * - * @return false - */ - @Override - public boolean isContentTypeSpecific() { - return false; - } - - /** - * Determines if this extractor can read the content type. - * - * Note: Strings can be run on any type of content, so all types will return - * true. - * - * @param file Content source to read - * @param detectedFormat Mimetype of source file. - * - * @return true - */ - @Override - public boolean isSupported(Content file, String detectedFormat) { - return true; - } - private final List