Mirror of https://github.com/overcuriousity/autopsy-flatpak.git, synced 2025-07-11 23:46:15 +00:00

Complete overhaul of how extractors are currently implemented, hopefully converging to a more sensible solution

This commit is contained in:
parent ce548fb978
commit ece50a3a00
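The diff below retires the extractor-object contract (isDisabled(), logWarning(), getID(), getName()) in favor of handing callers a java.io.Reader directly. For orientation, here is a minimal sketch of the new call pattern; the class and method names come from this commit, while the wrapper class and its helper method are illustrative:

```java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import org.sleuthkit.autopsy.textextractors.ExtractionContext;
import org.sleuthkit.autopsy.textextractors.TextReader;
import org.sleuthkit.datamodel.Content;

class ReaderApiSketch {

    // Old shape: obtain a TextExtractor, then ask it for a Reader.
    // New shape: ask TextReader for a Reader directly.
    static String extractAllText(Content content)
            throws IOException, TextReader.NoReaderFoundException {
        Reader reader = TextReader.getContentSpecificReader(content, new ExtractionContext());
        StringBuilder text = new StringBuilder();
        try (BufferedReader buffered = new BufferedReader(reader)) {
            int c;
            while ((c = buffered.read()) != -1) {
                text.append((char) c);
            }
        }
        return text.toString();
    }
}
```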
@@ -22,7 +22,6 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.StandardCharsets;
-import java.util.logging.Level;
 import org.apache.commons.io.IOUtils;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.datamodel.ContentUtils;
@@ -35,39 +34,27 @@ import org.sleuthkit.datamodel.TskCoreException;
  * Extracts text from artifacts by concatenating the values of all of the
  * artifact's attributes.
  */
-class ArtifactTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+class ArtifactTextExtractor<T extends BlackboardArtifact> implements TextExtractor<T> {
 
     static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
-
-    @Override
-    public boolean isDisabled() {
-        return false;
-    }
-
-    @Override
-    public void logWarning(final String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex); //NON-NLS }
-    }
 
-    private InputStream getInputStream(Content artifact) throws TextExtractorException {
-        BlackboardArtifact art = (BlackboardArtifact)artifact;
-
+    private InputStream getInputStream(BlackboardArtifact artifact) throws InitReaderException {
         // Concatenate the string values of all attributes into a single
         // "content" string to be indexed.
         StringBuilder artifactContents = new StringBuilder();
 
         Content dataSource = null;
         try {
-            dataSource = art.getDataSource();
+            dataSource = artifact.getDataSource();
         } catch (TskCoreException tskCoreException) {
-            throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
+            throw new InitReaderException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
         }
         if (dataSource == null) {
-            throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString());
+            throw new InitReaderException("Datasource was null for artifact: " + artifact.toString());
        }
 
        try {
-            for (BlackboardAttribute attribute : art.getAttributes()) {
+            for (BlackboardAttribute attribute : artifact.getAttributes()) {
                artifactContents.append(attribute.getAttributeType().getDisplayName());
                artifactContents.append(" : ");
                // We have also discussed modifying BlackboardAttribute.getDisplayString()
@@ -85,40 +72,31 @@ class ArtifactTextExtractor<T extends Content> extends ContentTextExtractor<T> {
                artifactContents.append(System.lineSeparator());
            }
        } catch (TskCoreException tskCoreException) {
-            throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
+            throw new InitReaderException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
        }
 
        return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
    }
 
    @Override
-    public Reader getReader(Content source) throws TextExtractorException {
+    public Reader getReader(BlackboardArtifact source) throws InitReaderException {
        return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
    }
 
-    @Override
-    public long getID(Content source) {
-        BlackboardArtifact art = (BlackboardArtifact)source;
-        return art.getArtifactID();
-    }
-
-    @Override
-    public String getName(Content source) {
-        BlackboardArtifact art = (BlackboardArtifact)source;
-        return art.getDisplayName() + "_" + art.getArtifactID();
-    }
-
+    /**
+     * Configures this extractors to the settings stored in relevant config instances.
+     *
+     * This operation is a no-op since currently there are no configurable settings
+     * of the extraction process.
+     *
+     * @param context Instance containing file config settings
+     */
    @Override
    public void setExtractionSettings(ExtractionContext context) {
    }
 
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
    @Override
-    public boolean isSupported(Content file, String detectedFormat) {
+    public boolean isSupported(BlackboardArtifact file, String detectedFormat) {
        return true;
    }
 }
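The concatenation that getInputStream() performs above, restated as a self-contained sketch: a plain Map stands in for the artifact's BlackboardAttribute list (the map and its sample entries are hypothetical; only the append pattern mirrors the extractor):

```java
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.commons.io.IOUtils;

class AttributeConcatenationSketch {
    public static void main(String[] args) throws Exception {
        // Stand-in for BlackboardAttribute display-name/value pairs.
        Map<String, String> attributes = new LinkedHashMap<>();
        attributes.put("URL", "https://example.com");
        attributes.put("Date Accessed", "2018-10-01 12:00:00");

        // Same shape as ArtifactTextExtractor.getInputStream(): one
        // "name : value" line per attribute, joined into a single string.
        StringBuilder artifactContents = new StringBuilder();
        for (Map.Entry<String, String> attribute : attributes.entrySet()) {
            artifactContents.append(attribute.getKey());
            artifactContents.append(" : ");
            artifactContents.append(attribute.getValue());
            artifactContents.append(System.lineSeparator());
        }

        InputStream stream = IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
        try (Reader reader = new InputStreamReader(stream, StandardCharsets.UTF_8)) {
            System.out.println(IOUtils.toString(reader));
        }
    }
}
```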
@@ -1,130 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2011-2018 Basis Technology Corp.
- * Contact: carrier <at> sleuthkit <dot> org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.sleuthkit.autopsy.textextractors;
-
-import com.google.common.collect.ImmutableList;
-import java.io.Reader;
-import java.util.List;
-import org.sleuthkit.datamodel.Content;
-
-/**
- * Common methods for utilities that extract text and content and divide into
- * chunks
- * @param <T>
- */
-public abstract class ContentTextExtractor<T extends Content> implements TextExtractor<T> {
-
-    //Mimetype groups to aassist extractor implementations in ignoring binary and
-    //archive files.
-    public static final List<String> BINARY_MIME_TYPES
-            = ImmutableList.of(
-                    //ignore binary blob data, for which string extraction will be used
-                    "application/octet-stream", //NON-NLS
-                    "application/x-msdownload"); //NON-NLS
-
-    /** generally text extractors should ignore archives and let unpacking
-     * modules take care of them */
-    public static final List<String> ARCHIVE_MIME_TYPES
-            = ImmutableList.of(
-                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
-                    "application/x-7z-compressed", //NON-NLS
-                    "application/x-ace-compressed", //NON-NLS
-                    "application/x-alz-compressed", //NON-NLS
-                    "application/x-arj", //NON-NLS
-                    "application/vnd.ms-cab-compressed", //NON-NLS
-                    "application/x-cfs-compressed", //NON-NLS
-                    "application/x-dgc-compressed", //NON-NLS
-                    "application/x-apple-diskimage", //NON-NLS
-                    "application/x-gca-compressed", //NON-NLS
-                    "application/x-dar", //NON-NLS
-                    "application/x-lzx", //NON-NLS
-                    "application/x-lzh", //NON-NLS
-                    "application/x-rar-compressed", //NON-NLS
-                    "application/x-stuffit", //NON-NLS
-                    "application/x-stuffitx", //NON-NLS
-                    "application/x-gtar", //NON-NLS
-                    "application/x-archive", //NON-NLS
-                    "application/x-executable", //NON-NLS
-                    "application/x-gzip", //NON-NLS
-                    "application/zip", //NON-NLS
-                    "application/x-zoo", //NON-NLS
-                    "application/x-cpio", //NON-NLS
-                    "application/x-shar", //NON-NLS
-                    "application/x-tar", //NON-NLS
-                    "application/x-bzip", //NON-NLS
-                    "application/x-bzip2", //NON-NLS
-                    "application/x-lzip", //NON-NLS
-                    "application/x-lzma", //NON-NLS
-                    "application/x-lzop", //NON-NLS
-                    "application/x-z", //NON-NLS
-                    "application/x-compress"); //NON-NLS
-
-    /**
-     * Determines if the extractor works only for specified types is
-     * supportedTypes() or whether is a generic content extractor (such as
-     * string extractor)
-     *
-     * @return
-     */
-    public abstract boolean isContentTypeSpecific();
-
-    /**
-     * Determines if the file content is supported by the extractor if
-     * isContentTypeSpecific() returns true.
-     *
-     * @param file to test if its content should be supported
-     * @param detectedFormat mime-type with detected format (such as text/plain)
-     * or null if not detected
-     *
-     * @return true if the file content is supported, false otherwise
-     */
-    public abstract boolean isSupported(T file, String detectedFormat);
-
-    /**
-     * Returns a reader that will iterate over the text of the source content.
-     *
-     * @param source Content source to read
-     * @return A reader that contains all source text
-     * @throws TextExtractorException Error encountered during extraction
-     */
-    @Override
-    public abstract Reader getReader(T source) throws TextExtractorException;
-
-    /**
-     * Get the object id of the content source.
-     *
-     * @param source source content
-     * @return object id associated with this source content
-     */
-    @Override
-    public long getID(T source) {
-        return source.getId();
-    }
-
-    /**
-     * Returns the human-readable name of the given content source.
-     *
-     * @param source source content
-     * @return name of source content
-     */
-    @Override
-    public String getName(T source) {
-        return source.getName();
-    }
-}
@@ -23,7 +23,6 @@ import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 import java.util.List;
-import java.util.logging.Level;
 import net.htmlparser.jericho.Attributes;
 import net.htmlparser.jericho.Config;
 import net.htmlparser.jericho.LoggerProvider;
@@ -33,13 +32,12 @@ import net.htmlparser.jericho.StartTag;
 import net.htmlparser.jericho.StartTagType;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
  * Extracts text from HTML content.
  */
-final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+final class HtmlTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
 
     static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
     private final int MAX_SIZE;
@@ -67,19 +65,6 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
         MAX_SIZE = 50_000_000;
     }
 
-    /**
-     * Determines if this extractor is responsible for extracting only a
-     * specific type of media.
-     *
-     * In this case, only HTML documents can be read successfully.
-     *
-     * @return true
-     */
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
     /**
      * Determines if this content type is supported by this extractor.
     *
@@ -89,7 +74,7 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
      * @return flag indicating support
      */
     @Override
-    public boolean isSupported(Content content, String detectedFormat) {
+    public boolean isSupported(AbstractFile content, String detectedFormat) {
         return detectedFormat != null
                 && WEB_MIME_TYPES.contains(detectedFormat)
                 && content.getSize() <= MAX_SIZE;
@@ -105,7 +90,7 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
      * @throws TextExtractorException
      */
     @Override
-    public Reader getReader(Content content) throws TextExtractorException {
+    public Reader getReader(AbstractFile content) throws InitReaderException {
         //TODO JIRA-4467, there is only harm in excluding HTML documents greater
         //than 50MB due to our troubled approach of extraction.
         ReadContentInputStream stream = new ReadContentInputStream(content);
@@ -201,25 +186,10 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
             // All done, now make it a reader
             return new StringReader(stringBuilder.toString());
         } catch (IOException ex) {
-            throw new TextExtractorException("Error extracting HTML from content.", ex);
+            throw new InitReaderException("Error extracting HTML from content.", ex);
         }
     }
 
-    /**
-     * Indicates if this extractor can run.
-     *
-     * @return Flag indicating if this extractor can run.
-     */
-    @Override
-    public boolean isDisabled() {
-        return false;
-    }
-
-    @Override
-    public void logWarning(final String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex); //NON-NLS }
-    }
-
     /**
      * Determines how the extraction process will proceed given the settings
      * stored in this context instance.
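HtmlTextExtractor is built on the Jericho parser imported above. As rough orientation to the library, and hedged as my reading of Jericho's API rather than this extractor's own StartTag traversal, its simplest text-extraction entry point looks like:

```java
import net.htmlparser.jericho.Source;

class JerichoSketch {
    public static void main(String[] args) {
        String html = "<html><body><p>Hello <b>world</b></p></body></html>";
        Source source = new Source(html);
        // Jericho's built-in text extractor; HtmlTextExtractor instead walks
        // StartTags itself so it can surface scripts, comments, and selected
        // attributes alongside the visible text.
        System.out.println(source.getTextExtractor().toString()); // Hello world
    }
}
```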
@@ -28,7 +28,6 @@ import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
 import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
 
 /**
  * Extracts text from SQLite database files.
@@ -39,48 +38,10 @@ import org.sleuthkit.datamodel.Content;
  * 2) Tables that contain spaces in their name are not extracted
  * 3) Table names are not included in its output text
  */
-final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+final class SqliteTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
 
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
     private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
-    private static boolean isDisabled;
-
-    static {
-        try {
-            Class.forName("org.sqlite.JDBC");
-            isDisabled = false;
-        } catch (ClassNotFoundException ex) {
-            logger.log(Level.SEVERE, "Sqlite JDBC class could not be found, "
-                    + "SqliteTextExtractor is automatically disabling.", ex); //NON-NLS
-            isDisabled = true;
-        }
-    }
-
-    /**
-     * This extractor only works for sqlite files, so it is indeed content type
-     * specific.
-     *
-     * @return true
-     */
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
-    /**
-     * Determines if this extractor is fit to run.
-     *
-     * @return Flag indicating if it should or shouldn't be run.
-     */
-    @Override
-    public boolean isDisabled() {
-        return isDisabled;
-    }
-
-    @Override
-    public void logWarning(String msg, Exception exception) {
-        logger.log(Level.WARNING, msg, exception); //NON-NLS
-    }
-
     /**
      * Supports only the sqlite mimetypes
@@ -91,7 +52,7 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
      * @return true if x-sqlite3
      */
     @Override
-    public boolean isSupported(Content file, String detectedFormat) {
+    public boolean isSupported(AbstractFile file, String detectedFormat) {
         return SQLITE_MIMETYPE.equals(detectedFormat);
     }
 
@@ -105,12 +66,8 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
      * @throws TextExtractorException
      */
     @Override
-    public Reader getReader(Content source) throws TextExtractorException {
-        if(source instanceof AbstractFile) {
-            return new SQLiteStreamReader((AbstractFile)source);
-        }
-        throw new TextExtractorException(String.format("Source content with name [%s] and id=[%d] was not of type"
-                + " AbstractFile.", source.getName(), source.getId()));
+    public Reader getReader(AbstractFile source) throws InitReaderException {
+        return new SQLiteStreamReader(source);
    }
 
    /**
@@ -25,7 +25,6 @@ import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
-import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
@@ -37,41 +36,12 @@ import org.sleuthkit.datamodel.TskException;
 /**
  * Extracts raw strings from content.
  */
-final class StringsTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+final class StringsTextExtractor {
 
     static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
     private boolean extractUTF8;
     private boolean extractUTF16;
     private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
-
-    /**
-     * Determines if this extractor may only read particular types of content.
-     *
-     * Since Strings may be run on any content type, it is not content specific.
-     *
-     * @return false
-     */
-    @Override
-    public boolean isContentTypeSpecific() {
-        return false;
-    }
-
-    /**
-     * Determines if this extractor can read the content type.
-     *
-     * Note: Strings can be run on any type of content, so all types will return
-     * true.
-     *
-     * @param file Content source to read
-     * @param detectedFormat Mimetype of source file.
-     *
-     * @return true
-     */
-    @Override
-    public boolean isSupported(Content file, String detectedFormat) {
-        return true;
-    }
 
     private final List<SCRIPT> extractScripts = new ArrayList<>();
 
     /**
@@ -99,33 +69,6 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
         this.extractScripts.addAll(extractScripts);
     }
 
-    /**
-     * Get the currently used scripts for extraction
-     *
-     * @return scripts currently used or null if not supported
-     */
-    public List<SCRIPT> getScripts() {
-        return new ArrayList<>(extractScripts);
-    }
-
-    @Override
-    public void logWarning(final String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex); //NON-NLS }
-    }
-
-    /**
-     * Determines if this extractor should be run or not.
-     *
-     * Atleast one of the extraction encodings in DefaultExtractionConfig must
-     * be set for this extractor to run.
-     *
-     * @return Flag indicating if this extractor should be run.
-     */
-    @Override
-    public boolean isDisabled() {
-        return extractUTF8 == false && extractUTF16 == false;
-    }
-
     /**
      * Returns a reader that will iterate over the text of the content source.
     *
@@ -136,8 +79,7 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
      * @throws
      * org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
      */
-    @Override
-    public InputStreamReader getReader(Content content) throws TextExtractorException {
+    public InputStreamReader getReader(Content content) {
         InputStream stringStream = getInputStream(content);
         return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
     }
@@ -160,7 +102,6 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
      *
      * @param context Instance containing config classes
      */
-    @Override
     public void setExtractionSettings(ExtractionContext context) {
         if (context != null && context.contains(DefaultExtractionConfig.class)) {
             DefaultExtractionConfig configInstance = context.get(DefaultExtractionConfig.class);
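StringsTextExtractor is now a standalone class rather than a TextExtractor implementation. A sketch of configuring and using it, assuming code living in the same org.sleuthkit.autopsy.textextractors package (the class is package-private) and a Content instance named someContent; DefaultExtractionConfig comes from the extractionconfigs subpackage used later in this commit:

```java
// In org.sleuthkit.autopsy.textextractors; someContent is any Content.
static Reader stringsReaderFor(Content someContent) {
    ExtractionContext context = new ExtractionContext();
    DefaultExtractionConfig config = new DefaultExtractionConfig();
    config.setExtractUTF8(true);   // at least one encoding must be enabled,
    config.setExtractUTF16(true);  // otherwise there is nothing to extract
    context.set(DefaultExtractionConfig.class, config);

    StringsTextExtractor stringsExtractor = new StringsTextExtractor();
    stringsExtractor.setExtractionSettings(context);
    return stringsExtractor.getReader(someContent);
}
```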
@@ -19,7 +19,6 @@
 package org.sleuthkit.autopsy.textextractors;
 
 import java.io.Reader;
-import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 
 /**
  * Extracts text out of a SleuthkitVisitableItem, and exposes it is a Reader.
@@ -28,23 +27,19 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
  * @param <T> The subtype of SleuthkitVisitableItem an implementation is able to
  * process.
  */
-public interface TextExtractor<T extends SleuthkitVisitableItem> {
+interface TextExtractor<T> {
 
-    /**
-     * Is this extractor configured such that no extraction will/should be done?
+    /**
+     * Determines if the file content is supported by the extractor if
+     * isContentTypeSpecific() returns true.
     *
-     * @return True if this extractor will/should not perform any extraction.
-     */
-    boolean isDisabled();
-
-    /**
-     * Log the given message and exception as a warning.
+     * @param file to test if its content should be supported
+     * @param detectedFormat mime-type with detected format (such as text/plain)
+     * or null if not detected
     *
-     * @param msg Log message
-     * @param ex Exception associated with the incoming message
+     * @return true if the file content is supported, false otherwise
     */
-    void logWarning(String msg, Exception ex);
-
+    public abstract boolean isSupported(T file, String detectedFormat);
 
    /**
     * Get a reader that will iterate over the text extracted from the given
     * source.
@@ -53,28 +48,8 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
     *
     * @return Reader instance that contains the text of the source
     *
-     * @throws TextExtractorException
     */
-    Reader getReader(T source) throws TextExtractorException;
-
-    /**
-     * Get the 'object' id of the given source.
-     *
-     * @param source Source content of type T
-     *
-     * @return Object id of the source content
-     */
-    long getID(T source);
-
-    /**
-     * Get a human readable name for the given source.
-     *
-     * @param source Source content of type T
-     *
-     * @return Name of the content source
-     */
-    String getName(T source);
-
+    Reader getReader(T source) throws InitReaderException;
 
    /**
     * Determines how the extraction process will proceed given the settings
@@ -86,17 +61,17 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
     */
    void setExtractionSettings(ExtractionContext context);
 
    /**
     * System exception for dealing with errors encountered during extraction.
     */
-    class TextExtractorException extends Exception {
-
-        public TextExtractorException(String message) {
-            super(message);
+    public class InitReaderException extends Exception {
+        public InitReaderException(String msg, Throwable ex) {
+            super(msg, ex);
        }
 
-        public TextExtractorException(String message, Throwable cause) {
-            super(message, cause);
+        public InitReaderException(Throwable ex) {
+            super(ex);
        }
+
+        public InitReaderException(String msg) {
+            super(msg);
+        }
    }
 }
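The slimmed-down interface now carries only isSupported(), getReader(), and setExtractionSettings(). A minimal hypothetical implementor for illustration, placed in the same package since the interface is now package-private:

```java
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;

// Hypothetical implementor; the class name and the UTF-8 assumption are illustrative.
class PlainTextExtractor<T extends AbstractFile> implements TextExtractor<T> {

    @Override
    public boolean isSupported(T file, String detectedFormat) {
        return "text/plain".equals(detectedFormat);
    }

    @Override
    public Reader getReader(T source) throws InitReaderException {
        // Wrap the file's content stream; a real extractor would detect the charset.
        return new InputStreamReader(new ReadContentInputStream(source), StandardCharsets.UTF_8);
    }

    @Override
    public void setExtractionSettings(ExtractionContext context) {
        // no configurable settings in this sketch
    }
}
```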
@@ -18,6 +18,7 @@
  */
 package org.sleuthkit.autopsy.textextractors;
 
+import java.io.Reader;
 import java.util.Arrays;
 import java.util.List;
 import org.sleuthkit.datamodel.AbstractFile;
@@ -31,8 +32,13 @@ import org.sleuthkit.datamodel.Report;
  * See ContentTextExtractor interface for the generic structure of such
  * extractors.
  */
-public class TextExtractorFactory {
+public class TextReader {
 
+    private final static List<TextExtractor<AbstractFile>> fileExtractors = Arrays.asList(
+            new HtmlTextExtractor<>(),
+            new SqliteTextExtractor<>(),
+            new TikaTextExtractor<>()
+    );
     /**
      * Auto detects the correct text extractor given the file.
     *
@@ -41,40 +47,42 @@ public class TextExtractorFactory {
      * will keep the extractors at default settings. Refer to the
      * extractionconfigs package for available file configurations.
     *
-     * @param <T> Type of source content
      * @param file Content source that will be read from
      * @param context Contains extraction configurations for certain file types
     *
     * @return A ContentTextExtractor instance that is properly configured and
     * can be read from the getReader() method.
     *
-     * @throws NoContentSpecificExtractorException In the event that the
+     * @throws NoReaderFoundException In the event that the
     *                                   inputted file and mimetype
     *                                   have no corresponding
     *                                   extractor
     */
-    public static <T extends Content> ContentTextExtractor<T> getContentSpecificExtractor(T file,
-            ExtractionContext context) throws NoContentSpecificExtractorException {
-        if (file instanceof AbstractFile) {
-            List<ContentTextExtractor<T>> fileExtractors = getAbstractFileExtractors();
-            String mimeType = ((AbstractFile) file).getMIMEType();
-            for (ContentTextExtractor<T> candidate : fileExtractors) {
-                candidate.setExtractionSettings(context);
-                if (candidate.isSupported(file, mimeType)) {
-                    return candidate;
+    public static Reader getContentSpecificReader(Content file,
+            ExtractionContext context) throws NoReaderFoundException {
+        try {
+            if (file instanceof AbstractFile) {
+                String mimeType = ((AbstractFile) file).getMIMEType();
+                for (TextExtractor<AbstractFile> candidate : fileExtractors) {
+                    candidate.setExtractionSettings(context);
+                    if (candidate.isSupported((AbstractFile)file, mimeType)) {
+                        return candidate.getReader((AbstractFile)file);
                }
            }
+            } else if (file instanceof BlackboardArtifact) {
+                TextExtractor<BlackboardArtifact> artifactExtractor = new ArtifactTextExtractor<>();
+                artifactExtractor.setExtractionSettings(context);
+                return artifactExtractor.getReader((BlackboardArtifact)file);
+            } else if (file instanceof Report) {
+                TextExtractor<Report> reportExtractor = new TikaTextExtractor<>();
+                reportExtractor.setExtractionSettings(context);
+                reportExtractor.getReader((Report)file);
+            }
-        } else if (file instanceof BlackboardArtifact) {
-            ContentTextExtractor<T> artifactExtractor = new ArtifactTextExtractor<>();
-            artifactExtractor.setExtractionSettings(context);
-            return artifactExtractor;
-        } else if (file instanceof Report) {
-            ContentTextExtractor<T> reportExtractor = new TikaTextExtractor<>();
-            reportExtractor.setExtractionSettings(context);
-            return reportExtractor;
+        } catch (TextExtractor.InitReaderException ex) {
+            throw new NoReaderFoundException(ex);
        }
 
-        throw new NoContentSpecificExtractorException(
+        throw new NoReaderFoundException(
                String.format("Could not find a suitable extractor for "
                        + "file with name [%s] and id=[%d]. Try using the default, "
                        + "non content specific extractor as an alternative.",
@@ -82,43 +90,34 @@ public class TextExtractorFactory {
                );
    }
 
-    /**
-     * Instantiates and returns a list of all of the known abstract file
-     * extractors.
-     *
-     * @return A list of specialized ContentTextExtractors
-     */
-    private static <T extends Content> List<ContentTextExtractor<T>> getAbstractFileExtractors() {
-        return Arrays.asList(
-                new HtmlTextExtractor<>(),
-                new SqliteTextExtractor<>(),
-                new TikaTextExtractor<>()
-        );
-    }
-
    /**
     * Returns the default extractor that can be run on any content type. This
     * extractor should be used as a backup in the event that no specialized
     * extractor can be found.
     *
+     * @param source
     * @param context Contains extraction configurations for certain file types
     *
     * @return A DefaultExtractor instance
     */
-    public static ContentTextExtractor<Content> getDefaultExtractor(ExtractionContext context) {
-        ContentTextExtractor<Content> stringsInstance = new StringsTextExtractor<>();
+    public static Reader getDefaultReader(Content source, ExtractionContext context) {
+        StringsTextExtractor stringsInstance = new StringsTextExtractor();
        stringsInstance.setExtractionSettings(context);
-        return stringsInstance;
+        return stringsInstance.getReader(source);
    }
 
    /**
     * System level exception for handling content types that have no specific
     * strategy defined for extracting their text.
     */
-    public static class NoContentSpecificExtractorException extends Exception {
+    public static class NoReaderFoundException extends Exception {
 
-        public NoContentSpecificExtractorException(String msg) {
+        public NoReaderFoundException(String msg) {
            super(msg);
        }
+
+        public NoReaderFoundException(Throwable ex) {
+            super(ex);
+        }
    }
 }
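The exception message above suggests the intended usage: try the content-specific reader first and fall back to getDefaultReader(). A sketch of that pattern (the wrapper method and variable names are illustrative):

```java
static Reader readerFor(Content content, ExtractionContext context) {
    try {
        // Content-specific first: HTML, SQLite, Tika, artifact, or report handling.
        return TextReader.getContentSpecificReader(content, context);
    } catch (TextReader.NoReaderFoundException ex) {
        // Fall back to raw string extraction, which accepts any Content.
        return TextReader.getDefaultReader(content, context);
    }
}
```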
@@ -18,6 +18,7 @@
  */
 package org.sleuthkit.autopsy.textextractors;
 
+import com.google.common.collect.ImmutableList;
 import com.google.common.io.CharSource;
 import java.io.File;
 import java.io.IOException;
@@ -54,7 +55,53 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * Extracts text from Tika supported content. Protects against Tika
  * parser hangs (for unexpected/corrupt content) using a timeout mechanism.
  */
-final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T> {
+final class TikaTextExtractor<T extends Content> implements TextExtractor<T> {
+
+    //Mimetype groups to aassist extractor implementations in ignoring binary and
+    //archive files.
+    private static final List<String> BINARY_MIME_TYPES
+            = ImmutableList.of(
+                    //ignore binary blob data, for which string extraction will be used
+                    "application/octet-stream", //NON-NLS
+                    "application/x-msdownload"); //NON-NLS
+
+    /** generally text extractors should ignore archives and let unpacking
+     * modules take care of them */
+    private static final List<String> ARCHIVE_MIME_TYPES
+            = ImmutableList.of(
+                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
+                    "application/x-7z-compressed", //NON-NLS
+                    "application/x-ace-compressed", //NON-NLS
+                    "application/x-alz-compressed", //NON-NLS
+                    "application/x-arj", //NON-NLS
+                    "application/vnd.ms-cab-compressed", //NON-NLS
+                    "application/x-cfs-compressed", //NON-NLS
+                    "application/x-dgc-compressed", //NON-NLS
+                    "application/x-apple-diskimage", //NON-NLS
+                    "application/x-gca-compressed", //NON-NLS
+                    "application/x-dar", //NON-NLS
+                    "application/x-lzx", //NON-NLS
+                    "application/x-lzh", //NON-NLS
+                    "application/x-rar-compressed", //NON-NLS
+                    "application/x-stuffit", //NON-NLS
+                    "application/x-stuffitx", //NON-NLS
+                    "application/x-gtar", //NON-NLS
+                    "application/x-archive", //NON-NLS
+                    "application/x-executable", //NON-NLS
+                    "application/x-gzip", //NON-NLS
+                    "application/zip", //NON-NLS
+                    "application/x-zoo", //NON-NLS
+                    "application/x-cpio", //NON-NLS
+                    "application/x-shar", //NON-NLS
+                    "application/x-tar", //NON-NLS
+                    "application/x-bzip", //NON-NLS
+                    "application/x-bzip2", //NON-NLS
+                    "application/x-lzip", //NON-NLS
+                    "application/x-lzma", //NON-NLS
+                    "application/x-lzop", //NON-NLS
+                    "application/x-z", //NON-NLS
+                    "application/x-compress"); //NON-NLS
+
 
     private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
 
@@ -74,11 +121,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
             .map(mt -> mt.getType() + "/" + mt.getSubtype())
             .collect(Collectors.toList());
 
-    @Override
-    public void logWarning(final String msg, Exception ex) {
-        tikaLogger.log(Level.WARNING, msg, ex);
-    }
-
     /**
      * Returns a reader that will iterate over the text extracted from Apache
      * Tika.
@@ -89,7 +131,7 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
      * @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
      */
     @Override
-    public Reader getReader(Content content) throws TextExtractorException {
+    public Reader getReader(Content content) throws InitReaderException {
         ReadContentInputStream stream = new ReadContentInputStream(content);
 
         Metadata metadata = new Metadata();
@@ -136,7 +178,7 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
         PushbackReader pushbackReader = new PushbackReader(tikaReader);
         int read = pushbackReader.read();
         if (read == -1) {
-            throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + content);
+            throw new InitReaderException("Unable to extract text: Tika returned empty reader for " + content);
         }
         pushbackReader.unread(read);
 
@@ -145,15 +187,13 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
             return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
         } catch (TimeoutException te) {
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
-            logWarning(msg, te);
-            throw new TextExtractorException(msg, te);
-        } catch (TextExtractorException ex) {
+            throw new InitReaderException(msg, te);
+        } catch (InitReaderException ex) {
             throw ex;
         } catch (Exception ex) {
             tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
-            logWarning(msg, ex);
-            throw new TextExtractorException(msg, ex);
+            throw new InitReaderException(msg, ex);
         } finally {
             future.cancel(true);
         }
@@ -199,19 +239,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
                 ));
     }
 
-    /**
-     * Determines if this extractor only understands a specifc type of content.
-     *
-     * Although Apache Tika is defined for many input types, it is still a content
-     * specific approach to extraction.
-     *
-     * @return true
-     */
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
     /**
      * Determines if Tika is supported for this content type and mimetype.
     *
@@ -222,8 +249,8 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
     @Override
     public boolean isSupported(Content content, String detectedFormat) {
         if (detectedFormat == null
-                || ContentTextExtractor.BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
-                || ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
+                || BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
+                || ARCHIVE_MIME_TYPES.contains(detectedFormat)
                 || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
                 || detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
                 ) {
@@ -232,19 +259,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
         return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
     }
 
-    /**
-     * Determines if this extractor can be run.
-     *
-     * So long as Tika's dependencies are present, this extractor can run
-     * no matter the circumstance.
-     *
-     * @return true
-     */
-    @Override
-    public boolean isDisabled() {
-        return false;
-    }
-
     /**
      * Return timeout that should be used to index the content.
     *
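The PushbackReader probe in getReader() above is worth seeing in isolation: read a single character to detect an empty Tika reader, then push it back so no text is lost. A self-contained sketch (class and message are illustrative):

```java
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.StringReader;

class EmptyReaderProbe {

    // Returns the reader with its first character intact, or throws if empty.
    static Reader requireNonEmpty(Reader reader) throws IOException {
        PushbackReader pushbackReader = new PushbackReader(reader);
        int read = pushbackReader.read();
        if (read == -1) {
            throw new IOException("reader produced no text");
        }
        pushbackReader.unread(read); // put the probed character back
        return pushbackReader;
    }

    public static void main(String[] args) throws IOException {
        Reader ok = requireNonEmpty(new StringReader("hello"));
        System.out.println((char) ok.read()); // prints 'h'
    }
}
```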
@@ -19,6 +19,7 @@
 package org.sleuthkit.autopsy.keywordsearch;
 
 import java.io.BufferedReader;
+import java.io.Reader;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.logging.Level;
@@ -32,7 +33,6 @@ import org.sleuthkit.autopsy.healthmonitor.HealthMonitor;
 import org.sleuthkit.autopsy.healthmonitor.TimingMetric;
 import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
-import org.sleuthkit.autopsy.textextractors.TextExtractor;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
@@ -106,8 +106,8 @@ class Ingester {
      * @throws IngesterException if there was an error processing a specific
      *                           artifact, but the Solr server is probably fine.
      */
-    void indexMetaDataOnly(BlackboardArtifact artifact, TextExtractor<Content> extractor) throws IngesterException {
-        indexChunk("", extractor.getName(artifact), getContentFields(artifact));
+    void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
+        indexChunk("", sourceName, getContentFields(artifact));
     }
 
     /**
@@ -142,23 +142,12 @@ class Ingester {
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
     // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
-    <T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
-        final long sourceID = extractor.getID(source);
-        final String sourceName = extractor.getName(source);
-
+    <T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
         int numChunks = 0; //unknown until chunking is done
 
-        if (extractor.isDisabled()) {
-            /*
-             * some Extractors, notable the strings extractor, have options
-             * which can be configured such that no extraction should be done
-             */
-            return true;
-        }
-
         Map<String, String> fields = getContentFields(source);
         //Get a reader for the content of the given source
-        try (BufferedReader reader = new BufferedReader(extractor.getReader(source));) {
+        try (BufferedReader reader = new BufferedReader(sourceReader)) {
             Chunker chunker = new Chunker(reader);
             for (Chunk chunk : chunker) {
                 if (context != null && context.fileIngestIsCancelled()) {
@@ -173,18 +162,18 @@ class Ingester {
                     indexChunk(chunk.toString(), sourceName, fields);
                     numChunks++;
                 } catch (Ingester.IngesterException ingEx) {
-                    extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
+                    logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
                             + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
 
                    throw ingEx; //need to rethrow to signal error and move on
                }
            }
            if (chunker.hasException()) {
-                extractor.logWarning("Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
+                logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
                return false;
            }
        } catch (Exception ex) {
-            extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
+            logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } finally {
            if (context != null && context.fileIngestIsCancelled()) {
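indexText() no longer receives an extractor; the caller now supplies the Reader plus the source's id and name explicitly. A fragment assembled from the call sites later in this commit (variable names follow those call sites; aFile, extractionContext, and ingestContext are the surrounding method's locals):

```java
// Caller owns reader creation and passes source identity explicitly.
Reader specializedReader = TextReader.getContentSpecificReader(aFile, extractionContext);
boolean indexed = Ingester.getDefault().indexText(
        specializedReader, aFile.getId(), aFile.getName(), aFile, ingestContext);
```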
@@ -18,12 +18,14 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
-import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
+import com.google.common.collect.ImmutableList;
+import java.io.Reader;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.logging.Level;
 import org.openide.util.Exceptions;
 import org.openide.util.NbBundle;
 import org.openide.util.NbBundle.Messages;
 import org.sleuthkit.autopsy.casemodule.Case;
@@ -37,16 +39,15 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
 import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
 import org.sleuthkit.autopsy.ingest.IngestServices;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
+import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
 import org.sleuthkit.autopsy.textextractors.ExtractionContext;
-import org.sleuthkit.autopsy.textextractors.TextExtractor;
-import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
+import org.sleuthkit.autopsy.textextractors.TextReader;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskData;
 import org.sleuthkit.datamodel.TskData.FileKnown;
 
@@ -68,6 +69,43 @@ import org.sleuthkit.datamodel.TskData.FileKnown;
 })
 public final class KeywordSearchIngestModule implements FileIngestModule {
 
+    /** generally text extractors should ignore archives and let unpacking
+     * modules take care of them */
+    public static final List<String> ARCHIVE_MIME_TYPES
+            = ImmutableList.of(
+                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
+                    "application/x-7z-compressed", //NON-NLS
+                    "application/x-ace-compressed", //NON-NLS
+                    "application/x-alz-compressed", //NON-NLS
+                    "application/x-arj", //NON-NLS
+                    "application/vnd.ms-cab-compressed", //NON-NLS
+                    "application/x-cfs-compressed", //NON-NLS
+                    "application/x-dgc-compressed", //NON-NLS
+                    "application/x-apple-diskimage", //NON-NLS
+                    "application/x-gca-compressed", //NON-NLS
+                    "application/x-dar", //NON-NLS
+                    "application/x-lzx", //NON-NLS
+                    "application/x-lzh", //NON-NLS
+                    "application/x-rar-compressed", //NON-NLS
+                    "application/x-stuffit", //NON-NLS
+                    "application/x-stuffitx", //NON-NLS
+                    "application/x-gtar", //NON-NLS
+                    "application/x-archive", //NON-NLS
+                    "application/x-executable", //NON-NLS
+                    "application/x-gzip", //NON-NLS
+                    "application/zip", //NON-NLS
+                    "application/x-zoo", //NON-NLS
+                    "application/x-cpio", //NON-NLS
+                    "application/x-shar", //NON-NLS
+                    "application/x-tar", //NON-NLS
+                    "application/x-bzip", //NON-NLS
+                    "application/x-bzip2", //NON-NLS
+                    "application/x-lzip", //NON-NLS
+                    "application/x-lzma", //NON-NLS
+                    "application/x-lzop", //NON-NLS
+                    "application/x-z", //NON-NLS
+                    "application/x-compress"); //NON-NLS
+
     /**
      * Options for this extractor
      */
@@ -104,7 +142,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     //accessed read-only by searcher thread
 
     private boolean startedSearching = false;
-    private TextExtractor<Content> stringExtractor;
+    private ExtractionContext stringsExtractionContext;
     private final KeywordSearchJobSettings settings;
     private boolean initialized = false;
     private long jobId;
@@ -250,7 +288,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             }
         }
 
-        ExtractionContext extractionContext = new ExtractionContext();
+        stringsExtractionContext = new ExtractionContext();
 
         DefaultExtractionConfig stringsConfig = new DefaultExtractionConfig();
         Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
@@ -258,9 +296,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
         stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts());
 
-        extractionContext.set(DefaultExtractionConfig.class, stringsConfig);
+        stringsExtractionContext.set(DefaultExtractionConfig.class, stringsConfig);
 
-        stringExtractor = TextExtractorFactory.getDefaultExtractor(extractionContext);
         indexer = new Indexer();
         initialized = true;
     }
@@ -352,7 +389,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * Common cleanup code when module stops or final searcher completes
      */
     private void cleanup() {
-        stringExtractor = null;
+        stringsExtractionContext = null;
         initialized = false;
     }
 
@@ -440,7 +477,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * @throws IngesterException exception thrown if indexing failed
     */
     private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-        TextExtractor<Content> extractor = null;
        ExtractionContext extractionContext = new ExtractionContext();
 
        ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig();
@@ -448,10 +484,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
        extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
 
        try {
-            extractor = TextExtractorFactory.getContentSpecificExtractor(aFile,extractionContext);
+            Reader specializedReader = TextReader.getContentSpecificReader(aFile,extractionContext);
            //divide into chunks and index
-            return Ingester.getDefault().indexText(extractor, aFile, context);
-        } catch (TextExtractorFactory.NoContentSpecificExtractorException ex) {
+            return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
+        } catch (TextReader.NoReaderFoundException ex) {
            //No text extractor found... run the default instead
            return false;
        }
@@ -470,7 +506,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                if (context.fileIngestIsCancelled()) {
                    return true;
                }
-                if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) {
+                Reader stringsReader = TextReader.getDefaultReader(aFile, stringsExtractionContext);
+                if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
                    return true;
                } else {
@@ -530,7 +567,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
 
        // we skip archive formats that are opened by the archive module.
        // @@@ We could have a check here to see if the archive module was enabled though...
-        if (ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
+        if (ARCHIVE_MIME_TYPES.contains(fileType)) {
            try {
                if (context.fileIngestIsCancelled()) {
                    return;
@@ -579,11 +616,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
            //should be ignored by the TextFileExtractor because they may contain more than one text encoding
            try {
                TextFileExtractor textFileExtractor = new TextFileExtractor();
-                if (Ingester.getDefault().indexText(textFileExtractor, aFile, context)) {
+                Reader textReader = textFileExtractor.getReader(aFile);
+                if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                    wasTextAdded = true;
                }
-            } catch (IngesterException ex) {
+            } catch (IngesterException | TextFileExtractorException ex) {
                logger.log(Level.WARNING, "Unable to index as unicode", ex);
            }
        }
@@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.keywordsearch;
 
 import java.io.File;
 import java.io.IOException;
+import java.io.Reader;
 import java.lang.reflect.InvocationTargetException;
 import java.net.InetAddress;
 import java.util.ArrayList;
@@ -33,7 +34,6 @@ import org.apache.commons.lang.math.NumberUtils;
 import org.apache.commons.io.FileUtils;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.HttpSolrServer;
-import org.openide.util.Exceptions;
 import org.openide.util.NbBundle;
 import org.openide.util.lookup.ServiceProvider;
 import org.openide.util.lookup.ServiceProviders;
@@ -46,8 +46,7 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
 import org.sleuthkit.autopsy.progress.ProgressIndicator;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
-import org.sleuthkit.autopsy.textextractors.TextExtractor;
-import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
+import org.sleuthkit.autopsy.textextractors.TextReader;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskCoreException;
@@ -115,22 +114,23 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
                return;
            }
            try {
-                TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory
-                        .getContentSpecificExtractor(content, null);
-                ingester.indexMetaDataOnly(artifact, contentSpecificExtractor);
-                ingester.indexText(contentSpecificExtractor, artifact, null);
-            } catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) {
+                Reader blackboardReader = TextReader
+                        .getContentSpecificReader(content, null);
+                String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
+                ingester.indexMetaDataOnly(artifact, sourceName);
+                ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
+            } catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
                throw new TskCoreException(ex.getCause().getMessage(), ex);
            }
        } else {
            try {
-                TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory
-                        .getContentSpecificExtractor(content, null);
-                ingester.indexText(contentSpecificExtractor, content, null);
-            } catch (TextExtractorFactory.NoContentSpecificExtractorException | Ingester.IngesterException ex) {
+                Reader contentReader = TextReader
+                        .getContentSpecificReader(content, null);
+                ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
+            } catch (TextReader.NoReaderFoundException | Ingester.IngesterException ex) {
                try {
                    // Try the StringsTextExtractor if Tika extractions fails.
-                    ingester.indexText(TextExtractorFactory.getDefaultExtractor(null), content, null);
+                    ingester.indexText(TextReader.getDefaultReader(content, null),content.getId(),content.getName(), content, null);
                } catch (Ingester.IngesterException ex1) {
                    throw new TskCoreException(ex.getCause().getMessage(), ex1);
                }
@@ -444,11 +444,12 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
        final Ingester ingester = Ingester.getDefault();
 
        try {
-            TextExtractor<Content> contentSpecificExtractor =
-                    TextExtractorFactory.getContentSpecificExtractor((Content) artifact, null);
-            ingester.indexMetaDataOnly(artifact, contentSpecificExtractor);
-            ingester.indexText(contentSpecificExtractor, artifact, null);
-        } catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) {
+            String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
+            Reader contentSpecificReader =
+                    TextReader.getContentSpecificReader((Content) artifact, null);
+            ingester.indexMetaDataOnly(artifact, sourceName);
+            ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
+        } catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
            throw new TskCoreException(ex.getCause().getMessage(), ex);
        }
    }
@@ -21,19 +21,15 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.BufferedInputStream;
 import java.io.Reader;
-import java.util.logging.Level;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
-import org.sleuthkit.autopsy.coreutils.Logger;
-import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
-import org.sleuthkit.autopsy.textextractors.ExtractionContext;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
  * Extract text from .txt files
  */
-final class TextFileExtractor extends ContentTextExtractor<AbstractFile> {
+final class TextFileExtractor {
 
     //Set a Minimum confidence value to reject matches that may not have a valid text encoding
     //Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
@@ -41,47 +37,30 @@ final class TextFileExtractor extends ContentTextExtractor<AbstractFile> {
     //This limited information was used to select the current value as one that would filter out clearly non-text
     //files while hopefully working on all files with a valid text encoding
     static final private int MIN_MATCH_CONFIDENCE = 20;
-    static final private Logger logger = Logger.getLogger(TextFileExtractor.class.getName());
-
-    @Override
-    public boolean isContentTypeSpecific() {
-        return true;
-    }
-
-    @Override
-    public boolean isSupported(AbstractFile file, String detectedFormat) {
-        return true;
-    }
 
-    @Override
-    public Reader getReader(AbstractFile source) throws TextExtractorException {
+    public Reader getReader(AbstractFile source) throws TextFileExtractorException {
        CharsetDetector detector = new CharsetDetector();
        //wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
        InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
        try {
            detector.setText(stream);
        } catch (IOException ex) {
-            throw new TextExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
+            throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
        }
        CharsetMatch match = detector.detect();
        if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
-            throw new TextExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
+            throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
        }
 
        return match.getReader();
    }
 
-    @Override
-    public boolean isDisabled() {
-        return false;
-    }
-
-    @Override
-    public void logWarning(String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex);
-    }
-
-    @Override
-    public void setExtractionSettings(ExtractionContext context) {
+    public class TextFileExtractorException extends Exception {
+        public TextFileExtractorException(String msg, Throwable ex) {
+            super(msg, ex);
        }
+        public TextFileExtractorException(String msg) {
+            super(msg);
+        }
    }
 }
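The charset-detection flow in getReader() above, as a self-contained sketch against Tika's CharsetDetector. The sample bytes and the IllegalStateException are illustrative; the threshold of 20 mirrors MIN_MATCH_CONFIDENCE:

```java
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;

class CharsetDetectionSketch {
    public static void main(String[] args) throws Exception {
        byte[] bytes = "plain text sample".getBytes(StandardCharsets.UTF_8);
        // BufferedInputStream supplies the mark/reset support CharsetDetector needs.
        InputStream stream = new BufferedInputStream(new ByteArrayInputStream(bytes));
        CharsetDetector detector = new CharsetDetector();
        detector.setText(stream);
        CharsetMatch match = detector.detect();
        if (match.getConfidence() < 20) { // same threshold as MIN_MATCH_CONFIDENCE
            throw new IllegalStateException("no confident charset match");
        }
        try (Reader reader = match.getReader()) {
            System.out.println(match.getName() + ", confidence " + match.getConfidence());
        }
    }
}
```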