Complete overhaul of how extractors are currently implemented, hopefully converging to a more sensible solution

This commit is contained in:
U-BASIS\dsmyda 2018-12-07 13:26:52 -05:00
parent ce548fb978
commit ece50a3a00
12 changed files with 241 additions and 530 deletions

View File

@ -22,7 +22,6 @@ import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.logging.Level;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.datamodel.ContentUtils; import org.sleuthkit.autopsy.datamodel.ContentUtils;
@ -35,39 +34,27 @@ import org.sleuthkit.datamodel.TskCoreException;
* Extracts text from artifacts by concatenating the values of all of the * Extracts text from artifacts by concatenating the values of all of the
* artifact's attributes. * artifact's attributes.
*/ */
class ArtifactTextExtractor<T extends Content> extends ContentTextExtractor<T> { class ArtifactTextExtractor<T extends BlackboardArtifact> implements TextExtractor<T> {
static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName()); static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
@Override private InputStream getInputStream(BlackboardArtifact artifact) throws InitReaderException {
public boolean isDisabled() {
return false;
}
@Override
public void logWarning(final String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS }
}
private InputStream getInputStream(Content artifact) throws TextExtractorException {
BlackboardArtifact art = (BlackboardArtifact)artifact;
// Concatenate the string values of all attributes into a single // Concatenate the string values of all attributes into a single
// "content" string to be indexed. // "content" string to be indexed.
StringBuilder artifactContents = new StringBuilder(); StringBuilder artifactContents = new StringBuilder();
Content dataSource = null; Content dataSource = null;
try { try {
dataSource = art.getDataSource(); dataSource = artifact.getDataSource();
} catch (TskCoreException tskCoreException) { } catch (TskCoreException tskCoreException) {
throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException); throw new InitReaderException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
} }
if (dataSource == null) { if (dataSource == null) {
throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString()); throw new InitReaderException("Datasource was null for artifact: " + artifact.toString());
} }
try { try {
for (BlackboardAttribute attribute : art.getAttributes()) { for (BlackboardAttribute attribute : artifact.getAttributes()) {
artifactContents.append(attribute.getAttributeType().getDisplayName()); artifactContents.append(attribute.getAttributeType().getDisplayName());
artifactContents.append(" : "); artifactContents.append(" : ");
// We have also discussed modifying BlackboardAttribute.getDisplayString() // We have also discussed modifying BlackboardAttribute.getDisplayString()
@ -85,40 +72,31 @@ class ArtifactTextExtractor<T extends Content> extends ContentTextExtractor<T> {
artifactContents.append(System.lineSeparator()); artifactContents.append(System.lineSeparator());
} }
} catch (TskCoreException tskCoreException) { } catch (TskCoreException tskCoreException) {
throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException); throw new InitReaderException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
} }
return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8); return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
} }
@Override @Override
public Reader getReader(Content source) throws TextExtractorException { public Reader getReader(BlackboardArtifact source) throws InitReaderException {
return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8); return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
} }
@Override /**
public long getID(Content source) { * Configures this extractors to the settings stored in relevant config instances.
BlackboardArtifact art = (BlackboardArtifact)source; *
return art.getArtifactID(); * This operation is a no-op since currently there are no configurable settings
} * of the extraction process.
*
@Override * @param context Instance containing file config settings
public String getName(Content source) { */
BlackboardArtifact art = (BlackboardArtifact)source;
return art.getDisplayName() + "_" + art.getArtifactID();
}
@Override @Override
public void setExtractionSettings(ExtractionContext context) { public void setExtractionSettings(ExtractionContext context) {
} }
@Override @Override
public boolean isContentTypeSpecific() { public boolean isSupported(BlackboardArtifact file, String detectedFormat) {
return true;
}
@Override
public boolean isSupported(Content file, String detectedFormat) {
return true; return true;
} }
} }

View File

@ -1,130 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2018 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.textextractors;
import com.google.common.collect.ImmutableList;
import java.io.Reader;
import java.util.List;
import org.sleuthkit.datamodel.Content;
/**
 * Base class for text extractors that operate on {@link Content} sources.
 *
 * It supplies shared MIME-type groupings and default implementations of
 * {@link #getID} and {@link #getName} that delegate to the source content.
 *
 * @param <T> The kind of Content an implementation is able to process.
 */
public abstract class ContentTextExtractor<T extends Content> implements TextExtractor<T> {

    /**
     * MIME types of binary blob data. These exist to help extractor
     * implementations ignore such files; string extraction is used for them
     * instead.
     */
    public static final List<String> BINARY_MIME_TYPES = ImmutableList.of(
            "application/octet-stream", //NON-NLS
            "application/x-msdownload"); //NON-NLS

    /**
     * MIME types of archives and other unstructured binary or compressed
     * data. Text extractors should generally ignore these and leave them to
     * unpacking modules (or string extraction), which handle them better.
     */
    public static final List<String> ARCHIVE_MIME_TYPES = ImmutableList.of(
            "application/x-7z-compressed", //NON-NLS
            "application/x-ace-compressed", //NON-NLS
            "application/x-alz-compressed", //NON-NLS
            "application/x-arj", //NON-NLS
            "application/vnd.ms-cab-compressed", //NON-NLS
            "application/x-cfs-compressed", //NON-NLS
            "application/x-dgc-compressed", //NON-NLS
            "application/x-apple-diskimage", //NON-NLS
            "application/x-gca-compressed", //NON-NLS
            "application/x-dar", //NON-NLS
            "application/x-lzx", //NON-NLS
            "application/x-lzh", //NON-NLS
            "application/x-rar-compressed", //NON-NLS
            "application/x-stuffit", //NON-NLS
            "application/x-stuffitx", //NON-NLS
            "application/x-gtar", //NON-NLS
            "application/x-archive", //NON-NLS
            "application/x-executable", //NON-NLS
            "application/x-gzip", //NON-NLS
            "application/zip", //NON-NLS
            "application/x-zoo", //NON-NLS
            "application/x-cpio", //NON-NLS
            "application/x-shar", //NON-NLS
            "application/x-tar", //NON-NLS
            "application/x-bzip", //NON-NLS
            "application/x-bzip2", //NON-NLS
            "application/x-lzip", //NON-NLS
            "application/x-lzma", //NON-NLS
            "application/x-lzop", //NON-NLS
            "application/x-z", //NON-NLS
            "application/x-compress"); //NON-NLS

    /**
     * Indicates whether this extractor handles only particular content types
     * or is a generic content extractor (such as a strings extractor).
     *
     * @return true if the extractor is content type specific, false if it is
     *         a generic extractor
     */
    public abstract boolean isContentTypeSpecific();

    /**
     * Reports whether the given file's content can be handled by this
     * extractor. Relevant when isContentTypeSpecific() returns true.
     *
     * @param file           the file whose content is being tested
     * @param detectedFormat the detected mime-type (such as text/plain), or
     *                       null if no format was detected
     *
     * @return true if the file content is supported, false otherwise
     */
    public abstract boolean isSupported(T file, String detectedFormat);

    /**
     * Produces a reader over the text of the source content.
     *
     * @param source the content to read from
     *
     * @return a reader that contains all of the source text
     *
     * @throws TextExtractorException if an error is encountered during
     *                                extraction
     */
    @Override
    public abstract Reader getReader(T source) throws TextExtractorException;

    /**
     * Returns the object id of the content source, as reported by the
     * source's getId().
     *
     * @param source the source content
     *
     * @return the object id associated with the source content
     */
    @Override
    public long getID(T source) {
        final long objectId = source.getId();
        return objectId;
    }

    /**
     * Returns the human-readable name of the content source, as reported by
     * the source's getName().
     *
     * @param source the source content
     *
     * @return the name of the source content
     */
    @Override
    public String getName(T source) {
        final String displayName = source.getName();
        return displayName;
    }
}

View File

@ -23,7 +23,6 @@ import java.io.Reader;
import java.io.StringReader; import java.io.StringReader;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.logging.Level;
import net.htmlparser.jericho.Attributes; import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Config; import net.htmlparser.jericho.Config;
import net.htmlparser.jericho.LoggerProvider; import net.htmlparser.jericho.LoggerProvider;
@ -33,13 +32,12 @@ import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType; import net.htmlparser.jericho.StartTagType;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ReadContentInputStream; import org.sleuthkit.datamodel.ReadContentInputStream;
/** /**
* Extracts text from HTML content. * Extracts text from HTML content.
*/ */
final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T> { final class HtmlTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName()); static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
private final int MAX_SIZE; private final int MAX_SIZE;
@ -67,19 +65,6 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
MAX_SIZE = 50_000_000; MAX_SIZE = 50_000_000;
} }
/**
* Determines if this extractor is responsible for extracting only a
* specific type of media.
*
* In this case, only HTML documents can be read successfully.
*
* @return true
*/
@Override
public boolean isContentTypeSpecific() {
return true;
}
/** /**
* Determines if this content type is supported by this extractor. * Determines if this content type is supported by this extractor.
* *
@ -89,7 +74,7 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
* @return flag indicating support * @return flag indicating support
*/ */
@Override @Override
public boolean isSupported(Content content, String detectedFormat) { public boolean isSupported(AbstractFile content, String detectedFormat) {
return detectedFormat != null return detectedFormat != null
&& WEB_MIME_TYPES.contains(detectedFormat) && WEB_MIME_TYPES.contains(detectedFormat)
&& content.getSize() <= MAX_SIZE; && content.getSize() <= MAX_SIZE;
@ -105,7 +90,7 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
* @throws TextExtractorException * @throws TextExtractorException
*/ */
@Override @Override
public Reader getReader(Content content) throws TextExtractorException { public Reader getReader(AbstractFile content) throws InitReaderException {
//TODO JIRA-4467, there is only harm in excluding HTML documents greater //TODO JIRA-4467, there is only harm in excluding HTML documents greater
//than 50MB due to our troubled approach of extraction. //than 50MB due to our troubled approach of extraction.
ReadContentInputStream stream = new ReadContentInputStream(content); ReadContentInputStream stream = new ReadContentInputStream(content);
@ -201,25 +186,10 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
// All done, now make it a reader // All done, now make it a reader
return new StringReader(stringBuilder.toString()); return new StringReader(stringBuilder.toString());
} catch (IOException ex) { } catch (IOException ex) {
throw new TextExtractorException("Error extracting HTML from content.", ex); throw new InitReaderException("Error extracting HTML from content.", ex);
} }
} }
/**
* Indicates if this extractor can run.
*
* @return Flag indicating if this extractor can run.
*/
@Override
public boolean isDisabled() {
return false;
}
@Override
public void logWarning(final String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS }
}
/** /**
* Determines how the extraction process will proceed given the settings * Determines how the extraction process will proceed given the settings
* stored in this context instance. * stored in this context instance.

View File

@ -28,7 +28,6 @@ import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.SQLiteTableReader; import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
/** /**
* Extracts text from SQLite database files. * Extracts text from SQLite database files.
@ -39,48 +38,10 @@ import org.sleuthkit.datamodel.Content;
* 2) Tables that contain spaces in their name are not extracted * 2) Tables that contain spaces in their name are not extracted
* 3) Table names are not included in its output text * 3) Table names are not included in its output text
*/ */
final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<T> { final class SqliteTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
private static final String SQLITE_MIMETYPE = "application/x-sqlite3"; private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName()); private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
private static boolean isDisabled;
static {
try {
Class.forName("org.sqlite.JDBC");
isDisabled = false;
} catch (ClassNotFoundException ex) {
logger.log(Level.SEVERE, "Sqlite JDBC class could not be found, "
+ "SqliteTextExtractor is automatically disabling.", ex); //NON-NLS
isDisabled = true;
}
}
/**
* This extractor only works for sqlite files, so it is indeed content type
* specific.
*
* @return true
*/
@Override
public boolean isContentTypeSpecific() {
return true;
}
/**
* Determines if this extractor is fit to run.
*
* @return Flag indicating if it should or shouldn't be run.
*/
@Override
public boolean isDisabled() {
return isDisabled;
}
@Override
public void logWarning(String msg, Exception exception) {
logger.log(Level.WARNING, msg, exception); //NON-NLS
}
/** /**
* Supports only the sqlite mimetypes * Supports only the sqlite mimetypes
@ -91,7 +52,7 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
* @return true if x-sqlite3 * @return true if x-sqlite3
*/ */
@Override @Override
public boolean isSupported(Content file, String detectedFormat) { public boolean isSupported(AbstractFile file, String detectedFormat) {
return SQLITE_MIMETYPE.equals(detectedFormat); return SQLITE_MIMETYPE.equals(detectedFormat);
} }
@ -105,12 +66,8 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
* @throws TextExtractorException * @throws TextExtractorException
*/ */
@Override @Override
public Reader getReader(Content source) throws TextExtractorException { public Reader getReader(AbstractFile source) throws InitReaderException {
if(source instanceof AbstractFile) { return new SQLiteStreamReader(source);
return new SQLiteStreamReader((AbstractFile)source);
}
throw new TextExtractorException(String.format("Source content with name [%s] and id=[%d] was not of type"
+ " AbstractFile.", source.getName(), source.getId()));
} }
/** /**
@ -125,7 +82,7 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
@Override @Override
public void setExtractionSettings(ExtractionContext context) { public void setExtractionSettings(ExtractionContext context) {
} }
/** /**
* Produces a continuous stream of characters from a database file. To * Produces a continuous stream of characters from a database file. To
* achieve this, all table names are queued up and a SQLiteTableReader is * achieve this, all table names are queued up and a SQLiteTableReader is

View File

@ -25,7 +25,6 @@ import java.nio.charset.Charset;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
@ -37,41 +36,12 @@ import org.sleuthkit.datamodel.TskException;
/** /**
* Extracts raw strings from content. * Extracts raw strings from content.
*/ */
final class StringsTextExtractor<T extends Content> extends ContentTextExtractor<T> { final class StringsTextExtractor {
static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
private boolean extractUTF8; private boolean extractUTF8;
private boolean extractUTF16; private boolean extractUTF16;
private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8"; private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
/**
* Determines if this extractor may only read particular types of content.
*
* Since Strings may be run on any content type, it is not content specific.
*
* @return false
*/
@Override
public boolean isContentTypeSpecific() {
return false;
}
/**
* Determines if this extractor can read the content type.
*
* Note: Strings can be run on any type of content, so all types will return
* true.
*
* @param file Content source to read
* @param detectedFormat Mimetype of source file.
*
* @return true
*/
@Override
public boolean isSupported(Content file, String detectedFormat) {
return true;
}
private final List<SCRIPT> extractScripts = new ArrayList<>(); private final List<SCRIPT> extractScripts = new ArrayList<>();
/** /**
@ -99,33 +69,6 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
this.extractScripts.addAll(extractScripts); this.extractScripts.addAll(extractScripts);
} }
/**
* Get the currently used scripts for extraction
*
* @return scripts currently used or null if not supported
*/
public List<SCRIPT> getScripts() {
return new ArrayList<>(extractScripts);
}
@Override
public void logWarning(final String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS }
}
/**
* Determines if this extractor should be run or not.
*
* At least one of the extraction encodings in DefaultExtractionConfig must
* be set for this extractor to run.
*
* @return Flag indicating if this extractor should be run.
*/
@Override
public boolean isDisabled() {
return extractUTF8 == false && extractUTF16 == false;
}
/** /**
* Returns a reader that will iterate over the text of the content source. * Returns a reader that will iterate over the text of the content source.
* *
@ -136,8 +79,7 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
* @throws * @throws
* org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException * org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
*/ */
@Override public InputStreamReader getReader(Content content) {
public InputStreamReader getReader(Content content) throws TextExtractorException {
InputStream stringStream = getInputStream(content); InputStream stringStream = getInputStream(content);
return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET)); return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
} }
@ -160,7 +102,6 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
* *
* @param context Instance containing config classes * @param context Instance containing config classes
*/ */
@Override
public void setExtractionSettings(ExtractionContext context) { public void setExtractionSettings(ExtractionContext context) {
if (context != null && context.contains(DefaultExtractionConfig.class)) { if (context != null && context.contains(DefaultExtractionConfig.class)) {
DefaultExtractionConfig configInstance = context.get(DefaultExtractionConfig.class); DefaultExtractionConfig configInstance = context.get(DefaultExtractionConfig.class);

View File

@ -19,7 +19,6 @@
package org.sleuthkit.autopsy.textextractors; package org.sleuthkit.autopsy.textextractors;
import java.io.Reader; import java.io.Reader;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
/** /**
* Extracts text out of a SleuthkitVisitableItem, and exposes it as a Reader. * Extracts text out of a SleuthkitVisitableItem, and exposes it as a Reader.
@ -28,23 +27,19 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
* @param <T> The subtype of SleuthkitVisitableItem an implementation is able to * @param <T> The subtype of SleuthkitVisitableItem an implementation is able to
* process. * process.
*/ */
public interface TextExtractor<T extends SleuthkitVisitableItem> { interface TextExtractor<T> {
/** /**
* Is this extractor configured such that no extraction will/should be done? * Determines if the file content is supported by the extractor if
* isContentTypeSpecific() returns true.
* *
* @return True if this extractor will/should not perform any extraction. * @param file to test if its content should be supported
*/ * @param detectedFormat mime-type with detected format (such as text/plain)
boolean isDisabled(); * or null if not detected
/**
* Log the given message and exception as a warning.
* *
* @param msg Log message * @return true if the file content is supported, false otherwise
* @param ex Exception associated with the incoming message
*/ */
void logWarning(String msg, Exception ex); public abstract boolean isSupported(T file, String detectedFormat);
/** /**
* Get a reader that will iterate over the text extracted from the given * Get a reader that will iterate over the text extracted from the given
* source. * source.
@ -53,28 +48,8 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
* *
* @return Reader instance that contains the text of the source * @return Reader instance that contains the text of the source
* *
* @throws TextExtractorException
*/ */
Reader getReader(T source) throws TextExtractorException; Reader getReader(T source) throws InitReaderException;
/**
* Get the 'object' id of the given source.
*
* @param source Source content of type T
*
* @return Object id of the source content
*/
long getID(T source);
/**
* Get a human readable name for the given source.
*
* @param source Source content of type T
*
* @return Name of the content source
*/
String getName(T source);
/** /**
* Determines how the extraction process will proceed given the settings * Determines how the extraction process will proceed given the settings
@ -85,18 +60,18 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
* @param context Instance containing file config classes * @param context Instance containing file config classes
*/ */
void setExtractionSettings(ExtractionContext context); void setExtractionSettings(ExtractionContext context);
/** public class InitReaderException extends Exception {
* System exception for dealing with errors encountered during extraction. public InitReaderException(String msg, Throwable ex) {
*/ super(msg, ex);
class TextExtractorException extends Exception {
public TextExtractorException(String message) {
super(message);
} }
public TextExtractorException(String message, Throwable cause) { public InitReaderException(Throwable ex) {
super(message, cause); super(ex);
}
public InitReaderException(String msg) {
super(msg);
} }
} }
} }

View File

@ -18,6 +18,7 @@
*/ */
package org.sleuthkit.autopsy.textextractors; package org.sleuthkit.autopsy.textextractors;
import java.io.Reader;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
@ -31,8 +32,13 @@ import org.sleuthkit.datamodel.Report;
* See ContentTextExtractor interface for the generic structure of such * See ContentTextExtractor interface for the generic structure of such
* extractors. * extractors.
*/ */
public class TextExtractorFactory { public class TextReader {
private final static List<TextExtractor<AbstractFile>> fileExtractors = Arrays.asList(
new HtmlTextExtractor<>(),
new SqliteTextExtractor<>(),
new TikaTextExtractor<>()
);
/** /**
* Auto detects the correct text extractor given the file. * Auto detects the correct text extractor given the file.
* *
@ -41,40 +47,42 @@ public class TextExtractorFactory {
* will keep the extractors at default settings. Refer to the * will keep the extractors at default settings. Refer to the
* extractionconfigs package for available file configurations. * extractionconfigs package for available file configurations.
* *
* @param <T> Type of source content
* @param file Content source that will be read from * @param file Content source that will be read from
* @param context Contains extraction configurations for certain file types * @param context Contains extraction configurations for certain file types
* *
* @return A ContentTextExtractor instance that is properly configured and * @return A ContentTextExtractor instance that is properly configured and
* can be read from the getReader() method. * can be read from the getReader() method.
* *
* @throws NoContentSpecificExtractorException In the event that the * @throws NoReaderFoundException In the event that the
* inputted file and mimetype * inputted file and mimetype
* have no corresponding * have no corresponding
* extractor * extractor
*/ */
public static <T extends Content> ContentTextExtractor<T> getContentSpecificExtractor(T file, public static Reader getContentSpecificReader(Content file,
ExtractionContext context) throws NoContentSpecificExtractorException { ExtractionContext context) throws NoReaderFoundException {
if (file instanceof AbstractFile) { try {
List<ContentTextExtractor<T>> fileExtractors = getAbstractFileExtractors(); if (file instanceof AbstractFile) {
String mimeType = ((AbstractFile) file).getMIMEType(); String mimeType = ((AbstractFile) file).getMIMEType();
for (ContentTextExtractor<T> candidate : fileExtractors) { for (TextExtractor<AbstractFile> candidate : fileExtractors) {
candidate.setExtractionSettings(context); candidate.setExtractionSettings(context);
if (candidate.isSupported(file, mimeType)) { if (candidate.isSupported((AbstractFile)file, mimeType)) {
return candidate; return candidate.getReader((AbstractFile)file);
}
} }
} else if (file instanceof BlackboardArtifact) {
TextExtractor<BlackboardArtifact> artifactExtractor = new ArtifactTextExtractor<>();
artifactExtractor.setExtractionSettings(context);
return artifactExtractor.getReader((BlackboardArtifact)file);
} else if (file instanceof Report) {
TextExtractor<Report> reportExtractor = new TikaTextExtractor<>();
reportExtractor.setExtractionSettings(context);
reportExtractor.getReader((Report)file);
} }
} else if (file instanceof BlackboardArtifact) { } catch (TextExtractor.InitReaderException ex) {
ContentTextExtractor<T> artifactExtractor = new ArtifactTextExtractor<>(); throw new NoReaderFoundException(ex);
artifactExtractor.setExtractionSettings(context);
return artifactExtractor;
} else if (file instanceof Report) {
ContentTextExtractor<T> reportExtractor = new TikaTextExtractor<>();
reportExtractor.setExtractionSettings(context);
return reportExtractor;
} }
throw new NoContentSpecificExtractorException( throw new NoReaderFoundException(
String.format("Could not find a suitable extractor for " String.format("Could not find a suitable extractor for "
+ "file with name [%s] and id=[%d]. Try using the default, " + "file with name [%s] and id=[%d]. Try using the default, "
+ "non content specific extractor as an alternative.", + "non content specific extractor as an alternative.",
@ -82,43 +90,34 @@ public class TextExtractorFactory {
); );
} }
/**
* Instantiates and returns a list of all of the known abstract file
* extractors.
*
* @return A list of specialized ContentTextExtractors
*/
private static <T extends Content> List<ContentTextExtractor<T>> getAbstractFileExtractors() {
return Arrays.asList(
new HtmlTextExtractor<>(),
new SqliteTextExtractor<>(),
new TikaTextExtractor<>()
);
}
/** /**
* Returns the default extractor that can be run on any content type. This * Returns the default extractor that can be run on any content type. This
* extractor should be used as a backup in the event that no specialized * extractor should be used as a backup in the event that no specialized
* extractor can be found. * extractor can be found.
* *
* @param source
* @param context Contains extraction configurations for certain file types * @param context Contains extraction configurations for certain file types
* *
* @return A DefaultExtractor instance * @return A DefaultExtractor instance
*/ */
public static ContentTextExtractor<Content> getDefaultExtractor(ExtractionContext context) { public static Reader getDefaultReader(Content source, ExtractionContext context) {
ContentTextExtractor<Content> stringsInstance = new StringsTextExtractor<>(); StringsTextExtractor stringsInstance = new StringsTextExtractor();
stringsInstance.setExtractionSettings(context); stringsInstance.setExtractionSettings(context);
return stringsInstance; return stringsInstance.getReader(source);
} }
/** /**
* System level exception for handling content types that have no specific * System level exception for handling content types that have no specific
* strategy defined for extracting their text. * strategy defined for extracting their text.
*/ */
public static class NoContentSpecificExtractorException extends Exception { public static class NoReaderFoundException extends Exception {
public NoContentSpecificExtractorException(String msg) { public NoReaderFoundException(String msg) {
super(msg); super(msg);
} }
public NoReaderFoundException(Throwable ex) {
super(ex);
}
} }
} }

View File

@ -18,6 +18,7 @@
*/ */
package org.sleuthkit.autopsy.textextractors; package org.sleuthkit.autopsy.textextractors;
import com.google.common.collect.ImmutableList;
import com.google.common.io.CharSource; import com.google.common.io.CharSource;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
@ -54,7 +55,53 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
* Extracts text from Tika supported content. Protects against Tika * Extracts text from Tika supported content. Protects against Tika
* parser hangs (for unexpected/corrupt content) using a timeout mechanism. * parser hangs (for unexpected/corrupt content) using a timeout mechanism.
*/ */
final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T> { final class TikaTextExtractor<T extends Content> implements TextExtractor<T> {
//Mimetype groups to assist extractor implementations in ignoring binary and
//archive files.
private static final List<String> BINARY_MIME_TYPES
= ImmutableList.of(
//ignore binary blob data, for which string extraction will be used
"application/octet-stream", //NON-NLS
"application/x-msdownload"); //NON-NLS
/** generally text extractors should ignore archives and let unpacking
* modules take care of them */
private static final List<String> ARCHIVE_MIME_TYPES
= ImmutableList.of(
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
"application/x-7z-compressed", //NON-NLS
"application/x-ace-compressed", //NON-NLS
"application/x-alz-compressed", //NON-NLS
"application/x-arj", //NON-NLS
"application/vnd.ms-cab-compressed", //NON-NLS
"application/x-cfs-compressed", //NON-NLS
"application/x-dgc-compressed", //NON-NLS
"application/x-apple-diskimage", //NON-NLS
"application/x-gca-compressed", //NON-NLS
"application/x-dar", //NON-NLS
"application/x-lzx", //NON-NLS
"application/x-lzh", //NON-NLS
"application/x-rar-compressed", //NON-NLS
"application/x-stuffit", //NON-NLS
"application/x-stuffitx", //NON-NLS
"application/x-gtar", //NON-NLS
"application/x-archive", //NON-NLS
"application/x-executable", //NON-NLS
"application/x-gzip", //NON-NLS
"application/zip", //NON-NLS
"application/x-zoo", //NON-NLS
"application/x-cpio", //NON-NLS
"application/x-shar", //NON-NLS
"application/x-tar", //NON-NLS
"application/x-bzip", //NON-NLS
"application/x-bzip2", //NON-NLS
"application/x-lzip", //NON-NLS
"application/x-lzma", //NON-NLS
"application/x-lzop", //NON-NLS
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
@ -74,11 +121,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
.map(mt -> mt.getType() + "/" + mt.getSubtype()) .map(mt -> mt.getType() + "/" + mt.getSubtype())
.collect(Collectors.toList()); .collect(Collectors.toList());
@Override
public void logWarning(final String msg, Exception ex) {
tikaLogger.log(Level.WARNING, msg, ex);
}
/** /**
* Returns a reader that will iterate over the text extracted from Apache * Returns a reader that will iterate over the text extracted from Apache
* Tika. * Tika.
@ -89,7 +131,7 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
* @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
*/ */
@Override @Override
public Reader getReader(Content content) throws TextExtractorException { public Reader getReader(Content content) throws InitReaderException {
ReadContentInputStream stream = new ReadContentInputStream(content); ReadContentInputStream stream = new ReadContentInputStream(content);
Metadata metadata = new Metadata(); Metadata metadata = new Metadata();
@ -136,7 +178,7 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
PushbackReader pushbackReader = new PushbackReader(tikaReader); PushbackReader pushbackReader = new PushbackReader(tikaReader);
int read = pushbackReader.read(); int read = pushbackReader.read();
if (read == -1) { if (read == -1) {
throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + content); throw new InitReaderException("Unable to extract text: Tika returned empty reader for " + content);
} }
pushbackReader.unread(read); pushbackReader.unread(read);
@ -145,15 +187,13 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream(); return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
} catch (TimeoutException te) { } catch (TimeoutException te) {
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName()); final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
logWarning(msg, te); throw new InitReaderException(msg, te);
throw new TextExtractorException(msg, te); } catch (InitReaderException ex) {
} catch (TextExtractorException ex) {
throw ex; throw ex;
} catch (Exception ex) { } catch (Exception ex) {
tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName()); final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
logWarning(msg, ex); throw new InitReaderException(msg, ex);
throw new TextExtractorException(msg, ex);
} finally { } finally {
future.cancel(true); future.cancel(true);
} }
@ -199,19 +239,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
)); ));
} }
/**
* Determines if this extractor only understands a specific type of content.
*
* Although Apache Tika is defined for many input types, it is still a content
* specific approach to extraction.
*
* @return true
*/
@Override
public boolean isContentTypeSpecific() {
return true;
}
/** /**
* Determines if Tika is supported for this content type and mimetype. * Determines if Tika is supported for this content type and mimetype.
* *
@ -222,8 +249,8 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
@Override @Override
public boolean isSupported(Content content, String detectedFormat) { public boolean isSupported(Content content, String detectedFormat) {
if (detectedFormat == null if (detectedFormat == null
|| ContentTextExtractor.BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used) || BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
|| ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat) || ARCHIVE_MIME_TYPES.contains(detectedFormat)
|| (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|| detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS || detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
) { ) {
@ -232,19 +259,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
return TIKA_SUPPORTED_TYPES.contains(detectedFormat); return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
} }
/**
* Determines if this extractor can be run.
*
* So long as Tika's dependencies are present, this extractor can run
* no matter the circumstance.
*
* @return true
*/
@Override
public boolean isDisabled() {
return false;
}
/** /**
* Return timeout that should be used to index the content. * Return timeout that should be used to index the content.
* *

View File

@ -19,6 +19,7 @@
package org.sleuthkit.autopsy.keywordsearch; package org.sleuthkit.autopsy.keywordsearch;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.Reader;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.logging.Level; import java.util.logging.Level;
@ -32,7 +33,6 @@ import org.sleuthkit.autopsy.healthmonitor.HealthMonitor;
import org.sleuthkit.autopsy.healthmonitor.TimingMetric; import org.sleuthkit.autopsy.healthmonitor.TimingMetric;
import org.sleuthkit.autopsy.ingest.IngestJobContext; import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk; import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.Content;
@ -106,8 +106,8 @@ class Ingester {
* @throws IngesterException if there was an error processing a specific * @throws IngesterException if there was an error processing a specific
* artifact, but the Solr server is probably fine. * artifact, but the Solr server is probably fine.
*/ */
void indexMetaDataOnly(BlackboardArtifact artifact, TextExtractor<Content> extractor) throws IngesterException { void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
indexChunk("", extractor.getName(artifact), getContentFields(artifact)); indexChunk("", sourceName, getContentFields(artifact));
} }
/** /**
@ -142,23 +142,12 @@ class Ingester {
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/ */
// TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException { < T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
final long sourceID = extractor.getID(source);
final String sourceName = extractor.getName(source);
int numChunks = 0; //unknown until chunking is done int numChunks = 0; //unknown until chunking is done
if (extractor.isDisabled()) {
/*
* some Extractors, notably the strings extractor, have options
* which can be configured such that no extraction should be done
*/
return true;
}
Map<String, String> fields = getContentFields(source); Map<String, String> fields = getContentFields(source);
//Get a reader for the content of the given source //Get a reader for the content of the given source
try (BufferedReader reader = new BufferedReader(extractor.getReader(source));) { try (BufferedReader reader = new BufferedReader(sourceReader)) {
Chunker chunker = new Chunker(reader); Chunker chunker = new Chunker(reader);
for (Chunk chunk : chunker) { for (Chunk chunk : chunker) {
if (context != null && context.fileIngestIsCancelled()) { if (context != null && context.fileIngestIsCancelled()) {
@ -173,18 +162,18 @@ class Ingester {
indexChunk(chunk.toString(), sourceName, fields); indexChunk(chunk.toString(), sourceName, fields);
numChunks++; numChunks++;
} catch (Ingester.IngesterException ingEx) { } catch (Ingester.IngesterException ingEx) {
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
throw ingEx; //need to rethrow to signal error and move on throw ingEx; //need to rethrow to signal error and move on
} }
} }
if (chunker.hasException()) { if (chunker.hasException()) {
extractor.logWarning("Error chunking content from " + sourceID + ": " + sourceName, chunker.getException()); logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
return false; return false;
} }
} catch (Exception ex) { } catch (Exception ex) {
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false; return false;
} finally { } finally {
if (context != null && context.fileIngestIsCancelled()) { if (context != null && context.fileIngestIsCancelled()) {

View File

@ -18,12 +18,14 @@
*/ */
package org.sleuthkit.autopsy.keywordsearch; package org.sleuthkit.autopsy.keywordsearch;
import org.sleuthkit.autopsy.textextractors.ContentTextExtractor; import com.google.common.collect.ImmutableList;
import java.io.Reader;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level; import java.util.logging.Level;
import org.openide.util.Exceptions;
import org.openide.util.NbBundle; import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages; import org.openide.util.NbBundle.Messages;
import org.sleuthkit.autopsy.casemodule.Case; import org.sleuthkit.autopsy.casemodule.Case;
@ -37,16 +39,15 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter; import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.ingest.IngestServices; import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector; import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textextractors.ExtractionContext; import org.sleuthkit.autopsy.textextractors.ExtractionContext;
import org.sleuthkit.autopsy.textextractors.TextExtractor; import org.sleuthkit.autopsy.textextractors.TextReader;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig; import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig; import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskData; import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown; import org.sleuthkit.datamodel.TskData.FileKnown;
@ -68,6 +69,43 @@ import org.sleuthkit.datamodel.TskData.FileKnown;
}) })
public final class KeywordSearchIngestModule implements FileIngestModule { public final class KeywordSearchIngestModule implements FileIngestModule {
/** generally text extractors should ignore archives and let unpacking
* modules take care of them */
public static final List<String> ARCHIVE_MIME_TYPES
= ImmutableList.of(
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
"application/x-7z-compressed", //NON-NLS
"application/x-ace-compressed", //NON-NLS
"application/x-alz-compressed", //NON-NLS
"application/x-arj", //NON-NLS
"application/vnd.ms-cab-compressed", //NON-NLS
"application/x-cfs-compressed", //NON-NLS
"application/x-dgc-compressed", //NON-NLS
"application/x-apple-diskimage", //NON-NLS
"application/x-gca-compressed", //NON-NLS
"application/x-dar", //NON-NLS
"application/x-lzx", //NON-NLS
"application/x-lzh", //NON-NLS
"application/x-rar-compressed", //NON-NLS
"application/x-stuffit", //NON-NLS
"application/x-stuffitx", //NON-NLS
"application/x-gtar", //NON-NLS
"application/x-archive", //NON-NLS
"application/x-executable", //NON-NLS
"application/x-gzip", //NON-NLS
"application/zip", //NON-NLS
"application/x-zoo", //NON-NLS
"application/x-cpio", //NON-NLS
"application/x-shar", //NON-NLS
"application/x-tar", //NON-NLS
"application/x-bzip", //NON-NLS
"application/x-bzip2", //NON-NLS
"application/x-lzip", //NON-NLS
"application/x-lzma", //NON-NLS
"application/x-lzop", //NON-NLS
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
/** /**
* Options for this extractor * Options for this extractor
*/ */
@ -104,7 +142,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
//accessed read-only by searcher thread //accessed read-only by searcher thread
private boolean startedSearching = false; private boolean startedSearching = false;
private TextExtractor<Content> stringExtractor; private ExtractionContext stringsExtractionContext;
private final KeywordSearchJobSettings settings; private final KeywordSearchJobSettings settings;
private boolean initialized = false; private boolean initialized = false;
private long jobId; private long jobId;
@ -250,7 +288,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
} }
} }
ExtractionContext extractionContext = new ExtractionContext(); stringsExtractionContext = new ExtractionContext();
DefaultExtractionConfig stringsConfig = new DefaultExtractionConfig(); DefaultExtractionConfig stringsConfig = new DefaultExtractionConfig();
Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions(); Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
@ -258,9 +296,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString()))); stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts()); stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts());
extractionContext.set(DefaultExtractionConfig.class, stringsConfig); stringsExtractionContext.set(DefaultExtractionConfig.class, stringsConfig);
stringExtractor = TextExtractorFactory.getDefaultExtractor(extractionContext);
indexer = new Indexer(); indexer = new Indexer();
initialized = true; initialized = true;
} }
@ -352,7 +389,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
* Common cleanup code when module stops or final searcher completes * Common cleanup code when module stops or final searcher completes
*/ */
private void cleanup() { private void cleanup() {
stringExtractor = null; stringsExtractionContext = null;
initialized = false; initialized = false;
} }
@ -440,7 +477,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
* @throws IngesterException exception thrown if indexing failed * @throws IngesterException exception thrown if indexing failed
*/ */
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException { private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
TextExtractor<Content> extractor = null;
ExtractionContext extractionContext = new ExtractionContext(); ExtractionContext extractionContext = new ExtractionContext();
ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig(); ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig();
@ -448,10 +484,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
extractionContext.set(ImageFileExtractionConfig.class, imageConfig); extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
try { try {
extractor = TextExtractorFactory.getContentSpecificExtractor(aFile,extractionContext); Reader specializedReader = TextReader.getContentSpecificReader(aFile,extractionContext);
//divide into chunks and index //divide into chunks and index
return Ingester.getDefault().indexText(extractor, aFile, context); return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
} catch (TextExtractorFactory.NoContentSpecificExtractorException ex) { } catch (TextReader.NoReaderFoundException ex) {
//No text extractor found... run the default instead //No text extractor found... run the default instead
return false; return false;
} }
@ -470,7 +506,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if (context.fileIngestIsCancelled()) { if (context.fileIngestIsCancelled()) {
return true; return true;
} }
if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) { Reader stringsReader = TextReader.getDefaultReader(aFile, stringsExtractionContext);
if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED); putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
return true; return true;
} else { } else {
@ -530,7 +567,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
// we skip archive formats that are opened by the archive module. // we skip archive formats that are opened by the archive module.
// @@@ We could have a check here to see if the archive module was enabled though... // @@@ We could have a check here to see if the archive module was enabled though...
if (ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) { if (ARCHIVE_MIME_TYPES.contains(fileType)) {
try { try {
if (context.fileIngestIsCancelled()) { if (context.fileIngestIsCancelled()) {
return; return;
@ -579,11 +616,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
//should be ignored by the TextFileExtractor because they may contain more than one text encoding //should be ignored by the TextFileExtractor because they may contain more than one text encoding
try { try {
TextFileExtractor textFileExtractor = new TextFileExtractor(); TextFileExtractor textFileExtractor = new TextFileExtractor();
if (Ingester.getDefault().indexText(textFileExtractor, aFile, context)) { Reader textReader = textFileExtractor.getReader(aFile);
if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED); putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
wasTextAdded = true; wasTextAdded = true;
} }
} catch (IngesterException ex) { } catch (IngesterException | TextFileExtractorException ex) {
logger.log(Level.WARNING, "Unable to index as unicode", ex); logger.log(Level.WARNING, "Unable to index as unicode", ex);
} }
} }

View File

@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException; import java.lang.reflect.InvocationTargetException;
import java.net.InetAddress; import java.net.InetAddress;
import java.util.ArrayList; import java.util.ArrayList;
@ -33,7 +34,6 @@ import org.apache.commons.lang.math.NumberUtils;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer; import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.openide.util.Exceptions;
import org.openide.util.NbBundle; import org.openide.util.NbBundle;
import org.openide.util.lookup.ServiceProvider; import org.openide.util.lookup.ServiceProvider;
import org.openide.util.lookup.ServiceProviders; import org.openide.util.lookup.ServiceProviders;
@ -46,8 +46,7 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
import org.sleuthkit.autopsy.progress.ProgressIndicator; import org.sleuthkit.autopsy.progress.ProgressIndicator;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.textextractors.TextExtractor; import org.sleuthkit.autopsy.textextractors.TextReader;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException; import org.sleuthkit.datamodel.TskCoreException;
@ -115,22 +114,23 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
return; return;
} }
try { try {
TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory Reader blackboardReader = TextReader
.getContentSpecificExtractor(content, null); .getContentSpecificReader(content, null);
ingester.indexMetaDataOnly(artifact, contentSpecificExtractor); String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
ingester.indexText(contentSpecificExtractor, artifact, null); ingester.indexMetaDataOnly(artifact, sourceName);
} catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) { ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
} catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
throw new TskCoreException(ex.getCause().getMessage(), ex); throw new TskCoreException(ex.getCause().getMessage(), ex);
} }
} else { } else {
try { try {
TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory Reader contentReader = TextReader
.getContentSpecificExtractor(content, null); .getContentSpecificReader(content, null);
ingester.indexText(contentSpecificExtractor, content, null); ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
} catch (TextExtractorFactory.NoContentSpecificExtractorException | Ingester.IngesterException ex) { } catch (TextReader.NoReaderFoundException | Ingester.IngesterException ex) {
try { try {
// Try the StringsTextExtractor if Tika extraction fails. // Try the StringsTextExtractor if Tika extraction fails.
ingester.indexText(TextExtractorFactory.getDefaultExtractor(null), content, null); ingester.indexText(TextReader.getDefaultReader(content, null),content.getId(),content.getName(), content, null);
} catch (Ingester.IngesterException ex1) { } catch (Ingester.IngesterException ex1) {
throw new TskCoreException(ex.getCause().getMessage(), ex1); throw new TskCoreException(ex.getCause().getMessage(), ex1);
} }
@ -444,11 +444,12 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
final Ingester ingester = Ingester.getDefault(); final Ingester ingester = Ingester.getDefault();
try { try {
TextExtractor<Content> contentSpecificExtractor = String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
TextExtractorFactory.getContentSpecificExtractor((Content) artifact, null); Reader contentSpecificReader =
ingester.indexMetaDataOnly(artifact, contentSpecificExtractor); TextReader.getContentSpecificReader((Content) artifact, null);
ingester.indexText(contentSpecificExtractor, artifact, null); ingester.indexMetaDataOnly(artifact, sourceName);
} catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) { ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
} catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
throw new TskCoreException(ex.getCause().getMessage(), ex); throw new TskCoreException(ex.getCause().getMessage(), ex);
} }
} }

View File

@ -21,19 +21,15 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.Reader; import java.io.Reader;
import java.util.logging.Level;
import org.apache.tika.parser.txt.CharsetDetector; import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch; import org.apache.tika.parser.txt.CharsetMatch;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
import org.sleuthkit.autopsy.textextractors.ExtractionContext;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream; import org.sleuthkit.datamodel.ReadContentInputStream;
/** /**
* Extract text from .txt files * Extract text from .txt files
*/ */
final class TextFileExtractor extends ContentTextExtractor<AbstractFile> { final class TextFileExtractor {
//Set a Minimum confidence value to reject matches that may not have a valid text encoding //Set a Minimum confidence value to reject matches that may not have a valid text encoding
//Values of valid text encodings were generally 100, xml code sometimes had a value around 50, //Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
@ -41,47 +37,30 @@ final class TextFileExtractor extends ContentTextExtractor<AbstractFile> {
//This limited information was used to select the current value as one that would filter out clearly non-text //This limited information was used to select the current value as one that would filter out clearly non-text
//files while hopefully working on all files with a valid text encoding //files while hopefully working on all files with a valid text encoding
static final private int MIN_MATCH_CONFIDENCE = 20; static final private int MIN_MATCH_CONFIDENCE = 20;
static final private Logger logger = Logger.getLogger(TextFileExtractor.class.getName());
@Override public Reader getReader(AbstractFile source) throws TextFileExtractorException {
public boolean isContentTypeSpecific() {
return true;
}
@Override
public boolean isSupported(AbstractFile file, String detectedFormat) {
return true;
}
@Override
public Reader getReader(AbstractFile source) throws TextExtractorException {
CharsetDetector detector = new CharsetDetector(); CharsetDetector detector = new CharsetDetector();
//wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector //wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
InputStream stream = new BufferedInputStream(new ReadContentInputStream(source)); InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
try { try {
detector.setText(stream); detector.setText(stream);
} catch (IOException ex) { } catch (IOException ex) {
throw new TextExtractorException("Unable to get string from detected text in TextFileExtractor", ex); throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
} }
CharsetMatch match = detector.detect(); CharsetMatch match = detector.detect();
if (match.getConfidence() < MIN_MATCH_CONFIDENCE) { if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
throw new TextExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor"); throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
} }
return match.getReader(); return match.getReader();
} }
@Override public class TextFileExtractorException extends Exception {
public boolean isDisabled() { public TextFileExtractorException(String msg, Throwable ex) {
return false; super(msg, ex);
} }
public TextFileExtractorException(String msg) {
@Override super(msg);
public void logWarning(String msg, Exception ex) { }
logger.log(Level.WARNING, msg, ex);
}
@Override
public void setExtractionSettings(ExtractionContext context) {
} }
} }