mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-12 07:56:16 +00:00
Complete overhaul of how extractors are currently implemented, hopefully converging to a more sensible solution
This commit is contained in:
parent
ce548fb978
commit
ece50a3a00
@ -22,7 +22,6 @@ import java.io.InputStream;
|
|||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.logging.Level;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
import org.sleuthkit.autopsy.datamodel.ContentUtils;
|
import org.sleuthkit.autopsy.datamodel.ContentUtils;
|
||||||
@ -35,39 +34,27 @@ import org.sleuthkit.datamodel.TskCoreException;
|
|||||||
* Extracts text from artifacts by concatenating the values of all of the
|
* Extracts text from artifacts by concatenating the values of all of the
|
||||||
* artifact's attributes.
|
* artifact's attributes.
|
||||||
*/
|
*/
|
||||||
class ArtifactTextExtractor<T extends Content> extends ContentTextExtractor<T> {
|
class ArtifactTextExtractor<T extends BlackboardArtifact> implements TextExtractor<T> {
|
||||||
|
|
||||||
static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
|
static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
|
||||||
|
|
||||||
@Override
|
private InputStream getInputStream(BlackboardArtifact artifact) throws InitReaderException {
|
||||||
public boolean isDisabled() {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void logWarning(final String msg, Exception ex) {
|
|
||||||
logger.log(Level.WARNING, msg, ex); //NON-NLS }
|
|
||||||
}
|
|
||||||
|
|
||||||
private InputStream getInputStream(Content artifact) throws TextExtractorException {
|
|
||||||
BlackboardArtifact art = (BlackboardArtifact)artifact;
|
|
||||||
|
|
||||||
// Concatenate the string values of all attributes into a single
|
// Concatenate the string values of all attributes into a single
|
||||||
// "content" string to be indexed.
|
// "content" string to be indexed.
|
||||||
StringBuilder artifactContents = new StringBuilder();
|
StringBuilder artifactContents = new StringBuilder();
|
||||||
|
|
||||||
Content dataSource = null;
|
Content dataSource = null;
|
||||||
try {
|
try {
|
||||||
dataSource = art.getDataSource();
|
dataSource = artifact.getDataSource();
|
||||||
} catch (TskCoreException tskCoreException) {
|
} catch (TskCoreException tskCoreException) {
|
||||||
throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
|
throw new InitReaderException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
|
||||||
}
|
}
|
||||||
if (dataSource == null) {
|
if (dataSource == null) {
|
||||||
throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString());
|
throw new InitReaderException("Datasource was null for artifact: " + artifact.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for (BlackboardAttribute attribute : art.getAttributes()) {
|
for (BlackboardAttribute attribute : artifact.getAttributes()) {
|
||||||
artifactContents.append(attribute.getAttributeType().getDisplayName());
|
artifactContents.append(attribute.getAttributeType().getDisplayName());
|
||||||
artifactContents.append(" : ");
|
artifactContents.append(" : ");
|
||||||
// We have also discussed modifying BlackboardAttribute.getDisplayString()
|
// We have also discussed modifying BlackboardAttribute.getDisplayString()
|
||||||
@ -85,40 +72,31 @@ class ArtifactTextExtractor<T extends Content> extends ContentTextExtractor<T> {
|
|||||||
artifactContents.append(System.lineSeparator());
|
artifactContents.append(System.lineSeparator());
|
||||||
}
|
}
|
||||||
} catch (TskCoreException tskCoreException) {
|
} catch (TskCoreException tskCoreException) {
|
||||||
throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
|
throw new InitReaderException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
|
||||||
}
|
}
|
||||||
|
|
||||||
return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
|
return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Reader getReader(Content source) throws TextExtractorException {
|
public Reader getReader(BlackboardArtifact source) throws InitReaderException {
|
||||||
return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
|
return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
/**
|
||||||
public long getID(Content source) {
|
* Configures this extractor with the settings stored in relevant config instances.
|
||||||
BlackboardArtifact art = (BlackboardArtifact)source;
|
*
|
||||||
return art.getArtifactID();
|
* This operation is a no-op since currently there are no configurable settings
|
||||||
}
|
* of the extraction process.
|
||||||
|
*
|
||||||
@Override
|
* @param context Instance containing file config settings
|
||||||
public String getName(Content source) {
|
*/
|
||||||
BlackboardArtifact art = (BlackboardArtifact)source;
|
|
||||||
return art.getDisplayName() + "_" + art.getArtifactID();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setExtractionSettings(ExtractionContext context) {
|
public void setExtractionSettings(ExtractionContext context) {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isContentTypeSpecific() {
|
public boolean isSupported(BlackboardArtifact file, String detectedFormat) {
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean isSupported(Content file, String detectedFormat) {
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,130 +0,0 @@
|
|||||||
/*
|
|
||||||
* Autopsy Forensic Browser
|
|
||||||
*
|
|
||||||
* Copyright 2011-2018 Basis Technology Corp.
|
|
||||||
* Contact: carrier <at> sleuthkit <dot> org
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.sleuthkit.autopsy.textextractors;
|
|
||||||
|
|
||||||
import com.google.common.collect.ImmutableList;
|
|
||||||
import java.io.Reader;
|
|
||||||
import java.util.List;
|
|
||||||
import org.sleuthkit.datamodel.Content;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Common methods for utilities that extract text and content and divide into
|
|
||||||
* chunks
|
|
||||||
* @param <T>
|
|
||||||
*/
|
|
||||||
public abstract class ContentTextExtractor<T extends Content> implements TextExtractor<T> {
|
|
||||||
|
|
||||||
//Mimetype groups to assist extractor implementations in ignoring binary and
|
|
||||||
//archive files.
|
|
||||||
public static final List<String> BINARY_MIME_TYPES
|
|
||||||
= ImmutableList.of(
|
|
||||||
//ignore binary blob data, for which string extraction will be used
|
|
||||||
"application/octet-stream", //NON-NLS
|
|
||||||
"application/x-msdownload"); //NON-NLS
|
|
||||||
|
|
||||||
/** generally text extractors should ignore archives and let unpacking
|
|
||||||
* modules take care of them */
|
|
||||||
public static final List<String> ARCHIVE_MIME_TYPES
|
|
||||||
= ImmutableList.of(
|
|
||||||
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
|
|
||||||
"application/x-7z-compressed", //NON-NLS
|
|
||||||
"application/x-ace-compressed", //NON-NLS
|
|
||||||
"application/x-alz-compressed", //NON-NLS
|
|
||||||
"application/x-arj", //NON-NLS
|
|
||||||
"application/vnd.ms-cab-compressed", //NON-NLS
|
|
||||||
"application/x-cfs-compressed", //NON-NLS
|
|
||||||
"application/x-dgc-compressed", //NON-NLS
|
|
||||||
"application/x-apple-diskimage", //NON-NLS
|
|
||||||
"application/x-gca-compressed", //NON-NLS
|
|
||||||
"application/x-dar", //NON-NLS
|
|
||||||
"application/x-lzx", //NON-NLS
|
|
||||||
"application/x-lzh", //NON-NLS
|
|
||||||
"application/x-rar-compressed", //NON-NLS
|
|
||||||
"application/x-stuffit", //NON-NLS
|
|
||||||
"application/x-stuffitx", //NON-NLS
|
|
||||||
"application/x-gtar", //NON-NLS
|
|
||||||
"application/x-archive", //NON-NLS
|
|
||||||
"application/x-executable", //NON-NLS
|
|
||||||
"application/x-gzip", //NON-NLS
|
|
||||||
"application/zip", //NON-NLS
|
|
||||||
"application/x-zoo", //NON-NLS
|
|
||||||
"application/x-cpio", //NON-NLS
|
|
||||||
"application/x-shar", //NON-NLS
|
|
||||||
"application/x-tar", //NON-NLS
|
|
||||||
"application/x-bzip", //NON-NLS
|
|
||||||
"application/x-bzip2", //NON-NLS
|
|
||||||
"application/x-lzip", //NON-NLS
|
|
||||||
"application/x-lzma", //NON-NLS
|
|
||||||
"application/x-lzop", //NON-NLS
|
|
||||||
"application/x-z", //NON-NLS
|
|
||||||
"application/x-compress"); //NON-NLS
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines if the extractor works only for specified types is
|
|
||||||
* supportedTypes() or whether it is a generic content extractor (such as
|
|
||||||
* string extractor)
|
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
public abstract boolean isContentTypeSpecific();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines if the file content is supported by the extractor if
|
|
||||||
* isContentTypeSpecific() returns true.
|
|
||||||
*
|
|
||||||
* @param file to test if its content should be supported
|
|
||||||
* @param detectedFormat mime-type with detected format (such as text/plain)
|
|
||||||
* or null if not detected
|
|
||||||
*
|
|
||||||
* @return true if the file content is supported, false otherwise
|
|
||||||
*/
|
|
||||||
public abstract boolean isSupported(T file, String detectedFormat);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns a reader that will iterate over the text of the source content.
|
|
||||||
*
|
|
||||||
* @param source Content source to read
|
|
||||||
* @return A reader that contains all source text
|
|
||||||
* @throws TextExtractorException Error encountered during extraction
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public abstract Reader getReader(T source) throws TextExtractorException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the object id of the content source.
|
|
||||||
*
|
|
||||||
* @param source source content
|
|
||||||
* @return object id associated with this source content
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public long getID(T source) {
|
|
||||||
return source.getId();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the human-readable name of the given content source.
|
|
||||||
*
|
|
||||||
* @param source source content
|
|
||||||
* @return name of source content
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public String getName(T source) {
|
|
||||||
return source.getName();
|
|
||||||
}
|
|
||||||
}
|
|
@ -23,7 +23,6 @@ import java.io.Reader;
|
|||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.logging.Level;
|
|
||||||
import net.htmlparser.jericho.Attributes;
|
import net.htmlparser.jericho.Attributes;
|
||||||
import net.htmlparser.jericho.Config;
|
import net.htmlparser.jericho.Config;
|
||||||
import net.htmlparser.jericho.LoggerProvider;
|
import net.htmlparser.jericho.LoggerProvider;
|
||||||
@ -33,13 +32,12 @@ import net.htmlparser.jericho.StartTag;
|
|||||||
import net.htmlparser.jericho.StartTagType;
|
import net.htmlparser.jericho.StartTagType;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
import org.sleuthkit.datamodel.Content;
|
|
||||||
import org.sleuthkit.datamodel.ReadContentInputStream;
|
import org.sleuthkit.datamodel.ReadContentInputStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extracts text from HTML content.
|
* Extracts text from HTML content.
|
||||||
*/
|
*/
|
||||||
final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T> {
|
final class HtmlTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
|
||||||
|
|
||||||
static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
|
static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
|
||||||
private final int MAX_SIZE;
|
private final int MAX_SIZE;
|
||||||
@ -67,19 +65,6 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
|
|||||||
MAX_SIZE = 50_000_000;
|
MAX_SIZE = 50_000_000;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines if this extractor is responsible for extracting only a
|
|
||||||
* specific type of media.
|
|
||||||
*
|
|
||||||
* In this case, only HTML documents can be read successfully.
|
|
||||||
*
|
|
||||||
* @return true
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean isContentTypeSpecific() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determines if this content type is supported by this extractor.
|
* Determines if this content type is supported by this extractor.
|
||||||
*
|
*
|
||||||
@ -89,7 +74,7 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
|
|||||||
* @return flag indicating support
|
* @return flag indicating support
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isSupported(Content content, String detectedFormat) {
|
public boolean isSupported(AbstractFile content, String detectedFormat) {
|
||||||
return detectedFormat != null
|
return detectedFormat != null
|
||||||
&& WEB_MIME_TYPES.contains(detectedFormat)
|
&& WEB_MIME_TYPES.contains(detectedFormat)
|
||||||
&& content.getSize() <= MAX_SIZE;
|
&& content.getSize() <= MAX_SIZE;
|
||||||
@ -105,7 +90,7 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
|
|||||||
* @throws TextExtractorException
|
* @throws TextExtractorException
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public Reader getReader(Content content) throws TextExtractorException {
|
public Reader getReader(AbstractFile content) throws InitReaderException {
|
||||||
//TODO JIRA-4467, there is only harm in excluding HTML documents greater
|
//TODO JIRA-4467, there is only harm in excluding HTML documents greater
|
||||||
//than 50MB due to our troubled approach of extraction.
|
//than 50MB due to our troubled approach of extraction.
|
||||||
ReadContentInputStream stream = new ReadContentInputStream(content);
|
ReadContentInputStream stream = new ReadContentInputStream(content);
|
||||||
@ -201,25 +186,10 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
|
|||||||
// All done, now make it a reader
|
// All done, now make it a reader
|
||||||
return new StringReader(stringBuilder.toString());
|
return new StringReader(stringBuilder.toString());
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
throw new TextExtractorException("Error extracting HTML from content.", ex);
|
throw new InitReaderException("Error extracting HTML from content.", ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Indicates if this extractor can run.
|
|
||||||
*
|
|
||||||
* @return Flag indicating if this extractor can run.
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean isDisabled() {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void logWarning(final String msg, Exception ex) {
|
|
||||||
logger.log(Level.WARNING, msg, ex); //NON-NLS }
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determines how the extraction process will proceed given the settings
|
* Determines how the extraction process will proceed given the settings
|
||||||
* stored in this context instance.
|
* stored in this context instance.
|
||||||
|
@ -28,7 +28,6 @@ import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
|
|||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
|
import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
import org.sleuthkit.datamodel.Content;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extracts text from SQLite database files.
|
* Extracts text from SQLite database files.
|
||||||
@ -39,48 +38,10 @@ import org.sleuthkit.datamodel.Content;
|
|||||||
* 2) Tables that contain spaces in their name are not extracted
|
* 2) Tables that contain spaces in their name are not extracted
|
||||||
* 3) Table names are not included in its output text
|
* 3) Table names are not included in its output text
|
||||||
*/
|
*/
|
||||||
final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<T> {
|
final class SqliteTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
|
||||||
|
|
||||||
private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
|
private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
|
||||||
private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
|
private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
|
||||||
private static boolean isDisabled;
|
|
||||||
|
|
||||||
static {
|
|
||||||
try {
|
|
||||||
Class.forName("org.sqlite.JDBC");
|
|
||||||
isDisabled = false;
|
|
||||||
} catch (ClassNotFoundException ex) {
|
|
||||||
logger.log(Level.SEVERE, "Sqlite JDBC class could not be found, "
|
|
||||||
+ "SqliteTextExtractor is automatically disabling.", ex); //NON-NLS
|
|
||||||
isDisabled = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This extractor only works for sqlite files, so it is indeed content type
|
|
||||||
* specific.
|
|
||||||
*
|
|
||||||
* @return true
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean isContentTypeSpecific() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines if this extractor is fit to run.
|
|
||||||
*
|
|
||||||
* @return Flag indicating if it should or shouldn't be run.
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean isDisabled() {
|
|
||||||
return isDisabled;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void logWarning(String msg, Exception exception) {
|
|
||||||
logger.log(Level.WARNING, msg, exception); //NON-NLS
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Supports only the sqlite mimetypes
|
* Supports only the sqlite mimetypes
|
||||||
@ -91,7 +52,7 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
|
|||||||
* @return true if x-sqlite3
|
* @return true if x-sqlite3
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isSupported(Content file, String detectedFormat) {
|
public boolean isSupported(AbstractFile file, String detectedFormat) {
|
||||||
return SQLITE_MIMETYPE.equals(detectedFormat);
|
return SQLITE_MIMETYPE.equals(detectedFormat);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -105,12 +66,8 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
|
|||||||
* @throws TextExtractorException
|
* @throws TextExtractorException
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public Reader getReader(Content source) throws TextExtractorException {
|
public Reader getReader(AbstractFile source) throws InitReaderException {
|
||||||
if(source instanceof AbstractFile) {
|
return new SQLiteStreamReader(source);
|
||||||
return new SQLiteStreamReader((AbstractFile)source);
|
|
||||||
}
|
|
||||||
throw new TextExtractorException(String.format("Source content with name [%s] and id=[%d] was not of type"
|
|
||||||
+ " AbstractFile.", source.getName(), source.getId()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -25,7 +25,6 @@ import java.nio.charset.Charset;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.logging.Level;
|
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
||||||
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
||||||
@ -37,41 +36,12 @@ import org.sleuthkit.datamodel.TskException;
|
|||||||
/**
|
/**
|
||||||
* Extracts raw strings from content.
|
* Extracts raw strings from content.
|
||||||
*/
|
*/
|
||||||
final class StringsTextExtractor<T extends Content> extends ContentTextExtractor<T> {
|
final class StringsTextExtractor {
|
||||||
|
|
||||||
static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
|
|
||||||
private boolean extractUTF8;
|
private boolean extractUTF8;
|
||||||
private boolean extractUTF16;
|
private boolean extractUTF16;
|
||||||
private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
|
private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines if this extractor may only read particular types of content.
|
|
||||||
*
|
|
||||||
* Since Strings may be run on any content type, it is not content specific.
|
|
||||||
*
|
|
||||||
* @return false
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean isContentTypeSpecific() {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines if this extractor can read the content type.
|
|
||||||
*
|
|
||||||
* Note: Strings can be run on any type of content, so all types will return
|
|
||||||
* true.
|
|
||||||
*
|
|
||||||
* @param file Content source to read
|
|
||||||
* @param detectedFormat Mimetype of source file.
|
|
||||||
*
|
|
||||||
* @return true
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean isSupported(Content file, String detectedFormat) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
private final List<SCRIPT> extractScripts = new ArrayList<>();
|
private final List<SCRIPT> extractScripts = new ArrayList<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -99,33 +69,6 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
|
|||||||
this.extractScripts.addAll(extractScripts);
|
this.extractScripts.addAll(extractScripts);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the currently used scripts for extraction
|
|
||||||
*
|
|
||||||
* @return scripts currently used or null if not supported
|
|
||||||
*/
|
|
||||||
public List<SCRIPT> getScripts() {
|
|
||||||
return new ArrayList<>(extractScripts);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void logWarning(final String msg, Exception ex) {
|
|
||||||
logger.log(Level.WARNING, msg, ex); //NON-NLS }
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines if this extractor should be run or not.
|
|
||||||
*
|
|
||||||
* At least one of the extraction encodings in DefaultExtractionConfig must
|
|
||||||
* be set for this extractor to run.
|
|
||||||
*
|
|
||||||
* @return Flag indicating if this extractor should be run.
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean isDisabled() {
|
|
||||||
return extractUTF8 == false && extractUTF16 == false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a reader that will iterate over the text of the content source.
|
* Returns a reader that will iterate over the text of the content source.
|
||||||
*
|
*
|
||||||
@ -136,8 +79,7 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
|
|||||||
* @throws
|
* @throws
|
||||||
* org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
|
* org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
|
||||||
*/
|
*/
|
||||||
@Override
|
public InputStreamReader getReader(Content content) {
|
||||||
public InputStreamReader getReader(Content content) throws TextExtractorException {
|
|
||||||
InputStream stringStream = getInputStream(content);
|
InputStream stringStream = getInputStream(content);
|
||||||
return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
|
return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
|
||||||
}
|
}
|
||||||
@ -160,7 +102,6 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
|
|||||||
*
|
*
|
||||||
* @param context Instance containing config classes
|
* @param context Instance containing config classes
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
public void setExtractionSettings(ExtractionContext context) {
|
public void setExtractionSettings(ExtractionContext context) {
|
||||||
if (context != null && context.contains(DefaultExtractionConfig.class)) {
|
if (context != null && context.contains(DefaultExtractionConfig.class)) {
|
||||||
DefaultExtractionConfig configInstance = context.get(DefaultExtractionConfig.class);
|
DefaultExtractionConfig configInstance = context.get(DefaultExtractionConfig.class);
|
||||||
|
@ -19,7 +19,6 @@
|
|||||||
package org.sleuthkit.autopsy.textextractors;
|
package org.sleuthkit.autopsy.textextractors;
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extracts text out of a SleuthkitVisitableItem, and exposes it as a Reader.
|
* Extracts text out of a SleuthkitVisitableItem, and exposes it as a Reader.
|
||||||
@ -28,23 +27,19 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
|
|||||||
* @param <T> The subtype of SleuthkitVisitableItem an implementation is able to
|
* @param <T> The subtype of SleuthkitVisitableItem an implementation is able to
|
||||||
* process.
|
* process.
|
||||||
*/
|
*/
|
||||||
public interface TextExtractor<T extends SleuthkitVisitableItem> {
|
interface TextExtractor<T> {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Is this extractor configured such that no extraction will/should be done?
|
* Determines if the file content is supported by the extractor if
|
||||||
|
* isContentTypeSpecific() returns true.
|
||||||
*
|
*
|
||||||
* @return True if this extractor will/should not perform any extraction.
|
* @param file to test if its content should be supported
|
||||||
*/
|
* @param detectedFormat mime-type with detected format (such as text/plain)
|
||||||
boolean isDisabled();
|
* or null if not detected
|
||||||
|
|
||||||
/**
|
|
||||||
* Log the given message and exception as a warning.
|
|
||||||
*
|
*
|
||||||
* @param msg Log message
|
* @return true if the file content is supported, false otherwise
|
||||||
* @param ex Exception associated with the incoming message
|
|
||||||
*/
|
*/
|
||||||
void logWarning(String msg, Exception ex);
|
public abstract boolean isSupported(T file, String detectedFormat);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get a reader that will iterate over the text extracted from the given
|
* Get a reader that will iterate over the text extracted from the given
|
||||||
* source.
|
* source.
|
||||||
@ -53,28 +48,8 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
|
|||||||
*
|
*
|
||||||
* @return Reader instance that contains the text of the source
|
* @return Reader instance that contains the text of the source
|
||||||
*
|
*
|
||||||
* @throws TextExtractorException
|
|
||||||
*/
|
*/
|
||||||
Reader getReader(T source) throws TextExtractorException;
|
Reader getReader(T source) throws InitReaderException;
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the 'object' id of the given source.
|
|
||||||
*
|
|
||||||
* @param source Source content of type T
|
|
||||||
*
|
|
||||||
* @return Object id of the source content
|
|
||||||
*/
|
|
||||||
long getID(T source);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get a human readable name for the given source.
|
|
||||||
*
|
|
||||||
* @param source Source content of type T
|
|
||||||
*
|
|
||||||
* @return Name of the content source
|
|
||||||
*/
|
|
||||||
String getName(T source);
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determines how the extraction process will proceed given the settings
|
* Determines how the extraction process will proceed given the settings
|
||||||
@ -86,17 +61,17 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
|
|||||||
*/
|
*/
|
||||||
void setExtractionSettings(ExtractionContext context);
|
void setExtractionSettings(ExtractionContext context);
|
||||||
|
|
||||||
/**
|
public class InitReaderException extends Exception {
|
||||||
* System exception for dealing with errors encountered during extraction.
|
public InitReaderException(String msg, Throwable ex) {
|
||||||
*/
|
super(msg, ex);
|
||||||
class TextExtractorException extends Exception {
|
|
||||||
|
|
||||||
public TextExtractorException(String message) {
|
|
||||||
super(message);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public TextExtractorException(String message, Throwable cause) {
|
public InitReaderException(Throwable ex) {
|
||||||
super(message, cause);
|
super(ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
public InitReaderException(String msg) {
|
||||||
|
super(msg);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
*/
|
*/
|
||||||
package org.sleuthkit.autopsy.textextractors;
|
package org.sleuthkit.autopsy.textextractors;
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
@ -31,8 +32,13 @@ import org.sleuthkit.datamodel.Report;
|
|||||||
* See ContentTextExtractor interface for the generic structure of such
|
* See ContentTextExtractor interface for the generic structure of such
|
||||||
* extractors.
|
* extractors.
|
||||||
*/
|
*/
|
||||||
public class TextExtractorFactory {
|
public class TextReader {
|
||||||
|
|
||||||
|
private final static List<TextExtractor<AbstractFile>> fileExtractors = Arrays.asList(
|
||||||
|
new HtmlTextExtractor<>(),
|
||||||
|
new SqliteTextExtractor<>(),
|
||||||
|
new TikaTextExtractor<>()
|
||||||
|
);
|
||||||
/**
|
/**
|
||||||
* Auto detects the correct text extractor given the file.
|
* Auto detects the correct text extractor given the file.
|
||||||
*
|
*
|
||||||
@ -41,40 +47,42 @@ public class TextExtractorFactory {
|
|||||||
* will keep the extractors at default settings. Refer to the
|
* will keep the extractors at default settings. Refer to the
|
||||||
* extractionconfigs package for available file configurations.
|
* extractionconfigs package for available file configurations.
|
||||||
*
|
*
|
||||||
* @param <T> Type of source content
|
|
||||||
* @param file Content source that will be read from
|
* @param file Content source that will be read from
|
||||||
* @param context Contains extraction configurations for certain file types
|
* @param context Contains extraction configurations for certain file types
|
||||||
*
|
*
|
||||||
* @return A ContentTextExtractor instance that is properly configured and
|
* @return A Reader over the text extracted from the file by the first
|
||||||
* can be read from the getReader() method.
|
* supporting extractor, configured with the given context.
|
||||||
*
|
*
|
||||||
* @throws NoContentSpecificExtractorException In the event that the
|
* @throws NoReaderFoundException In the event that the
|
||||||
* inputted file and mimetype
|
* inputted file and mimetype
|
||||||
* have no corresponding
|
* have no corresponding
|
||||||
* extractor
|
* extractor
|
||||||
*/
|
*/
|
||||||
public static <T extends Content> ContentTextExtractor<T> getContentSpecificExtractor(T file,
|
public static Reader getContentSpecificReader(Content file,
|
||||||
ExtractionContext context) throws NoContentSpecificExtractorException {
|
ExtractionContext context) throws NoReaderFoundException {
|
||||||
|
try {
|
||||||
if (file instanceof AbstractFile) {
|
if (file instanceof AbstractFile) {
|
||||||
List<ContentTextExtractor<T>> fileExtractors = getAbstractFileExtractors();
|
|
||||||
String mimeType = ((AbstractFile) file).getMIMEType();
|
String mimeType = ((AbstractFile) file).getMIMEType();
|
||||||
for (ContentTextExtractor<T> candidate : fileExtractors) {
|
for (TextExtractor<AbstractFile> candidate : fileExtractors) {
|
||||||
candidate.setExtractionSettings(context);
|
candidate.setExtractionSettings(context);
|
||||||
if (candidate.isSupported(file, mimeType)) {
|
if (candidate.isSupported((AbstractFile)file, mimeType)) {
|
||||||
return candidate;
|
return candidate.getReader((AbstractFile)file);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (file instanceof BlackboardArtifact) {
|
} else if (file instanceof BlackboardArtifact) {
|
||||||
ContentTextExtractor<T> artifactExtractor = new ArtifactTextExtractor<>();
|
TextExtractor<BlackboardArtifact> artifactExtractor = new ArtifactTextExtractor<>();
|
||||||
artifactExtractor.setExtractionSettings(context);
|
artifactExtractor.setExtractionSettings(context);
|
||||||
return artifactExtractor;
|
return artifactExtractor.getReader((BlackboardArtifact)file);
|
||||||
} else if (file instanceof Report) {
|
} else if (file instanceof Report) {
|
||||||
ContentTextExtractor<T> reportExtractor = new TikaTextExtractor<>();
|
TextExtractor<Report> reportExtractor = new TikaTextExtractor<>();
|
||||||
reportExtractor.setExtractionSettings(context);
|
reportExtractor.setExtractionSettings(context);
|
||||||
return reportExtractor;
|
// Return the reader: without the return, the Report branch falls out of the try block and hits the NoReaderFoundException thrown below.
return reportExtractor.getReader((Report)file);
|
||||||
|
}
|
||||||
|
} catch (TextExtractor.InitReaderException ex) {
|
||||||
|
throw new NoReaderFoundException(ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
throw new NoContentSpecificExtractorException(
|
throw new NoReaderFoundException(
|
||||||
String.format("Could not find a suitable extractor for "
|
String.format("Could not find a suitable extractor for "
|
||||||
+ "file with name [%s] and id=[%d]. Try using the default, "
|
+ "file with name [%s] and id=[%d]. Try using the default, "
|
||||||
+ "non content specific extractor as an alternative.",
|
+ "non content specific extractor as an alternative.",
|
||||||
@ -82,43 +90,34 @@ public class TextExtractorFactory {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Instantiates and returns a list of all of the known abstract file
|
|
||||||
* extractors.
|
|
||||||
*
|
|
||||||
* @return A list of specialized ContentTextExtractors
|
|
||||||
*/
|
|
||||||
private static <T extends Content> List<ContentTextExtractor<T>> getAbstractFileExtractors() {
|
|
||||||
return Arrays.asList(
|
|
||||||
new HtmlTextExtractor<>(),
|
|
||||||
new SqliteTextExtractor<>(),
|
|
||||||
new TikaTextExtractor<>()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the default extractor that can be run on any content type. This
|
* Returns the default extractor that can be run on any content type. This
|
||||||
* extractor should be used as a backup in the event that no specialized
|
* extractor should be used as a backup in the event that no specialized
|
||||||
* extractor can be found.
|
* extractor can be found.
|
||||||
*
|
*
|
||||||
|
* @param source
|
||||||
* @param context Contains extraction configurations for certain file types
|
* @param context Contains extraction configurations for certain file types
|
||||||
*
|
*
|
||||||
* @return A DefaultExtractor instance
|
* @return A Reader obtained from the strings extractor for the given source
|
||||||
*/
|
*/
|
||||||
public static ContentTextExtractor<Content> getDefaultExtractor(ExtractionContext context) {
|
public static Reader getDefaultReader(Content source, ExtractionContext context) {
|
||||||
ContentTextExtractor<Content> stringsInstance = new StringsTextExtractor<>();
|
StringsTextExtractor stringsInstance = new StringsTextExtractor();
|
||||||
stringsInstance.setExtractionSettings(context);
|
stringsInstance.setExtractionSettings(context);
|
||||||
return stringsInstance;
|
return stringsInstance.getReader(source);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* System level exception for handling content types that have no specific
|
* System level exception for handling content types that have no specific
|
||||||
* strategy defined for extracting their text.
|
* strategy defined for extracting their text.
|
||||||
*/
|
*/
|
||||||
public static class NoContentSpecificExtractorException extends Exception {
|
public static class NoReaderFoundException extends Exception {
|
||||||
|
|
||||||
public NoContentSpecificExtractorException(String msg) {
|
public NoReaderFoundException(String msg) {
|
||||||
super(msg);
|
super(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public NoReaderFoundException(Throwable ex) {
|
||||||
|
super(ex);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -18,6 +18,7 @@
|
|||||||
*/
|
*/
|
||||||
package org.sleuthkit.autopsy.textextractors;
|
package org.sleuthkit.autopsy.textextractors;
|
||||||
|
|
||||||
|
import com.google.common.collect.ImmutableList;
|
||||||
import com.google.common.io.CharSource;
|
import com.google.common.io.CharSource;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -54,7 +55,53 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
|
|||||||
* Extracts text from Tika supported content. Protects against Tika
|
* Extracts text from Tika supported content. Protects against Tika
|
||||||
* parser hangs (for unexpected/corrupt content) using a timeout mechanism.
|
* parser hangs (for unexpected/corrupt content) using a timeout mechanism.
|
||||||
*/
|
*/
|
||||||
final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T> {
|
final class TikaTextExtractor<T extends Content> implements TextExtractor<T> {
|
||||||
|
|
||||||
|
//Mimetype groups to assist extractor implementations in ignoring binary and
|
||||||
|
//archive files.
|
||||||
|
private static final List<String> BINARY_MIME_TYPES
|
||||||
|
= ImmutableList.of(
|
||||||
|
//ignore binary blob data, for which string extraction will be used
|
||||||
|
"application/octet-stream", //NON-NLS
|
||||||
|
"application/x-msdownload"); //NON-NLS
|
||||||
|
|
||||||
|
/** generally text extractors should ignore archives and let unpacking
|
||||||
|
* modules take care of them */
|
||||||
|
private static final List<String> ARCHIVE_MIME_TYPES
|
||||||
|
= ImmutableList.of(
|
||||||
|
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
|
||||||
|
"application/x-7z-compressed", //NON-NLS
|
||||||
|
"application/x-ace-compressed", //NON-NLS
|
||||||
|
"application/x-alz-compressed", //NON-NLS
|
||||||
|
"application/x-arj", //NON-NLS
|
||||||
|
"application/vnd.ms-cab-compressed", //NON-NLS
|
||||||
|
"application/x-cfs-compressed", //NON-NLS
|
||||||
|
"application/x-dgc-compressed", //NON-NLS
|
||||||
|
"application/x-apple-diskimage", //NON-NLS
|
||||||
|
"application/x-gca-compressed", //NON-NLS
|
||||||
|
"application/x-dar", //NON-NLS
|
||||||
|
"application/x-lzx", //NON-NLS
|
||||||
|
"application/x-lzh", //NON-NLS
|
||||||
|
"application/x-rar-compressed", //NON-NLS
|
||||||
|
"application/x-stuffit", //NON-NLS
|
||||||
|
"application/x-stuffitx", //NON-NLS
|
||||||
|
"application/x-gtar", //NON-NLS
|
||||||
|
"application/x-archive", //NON-NLS
|
||||||
|
"application/x-executable", //NON-NLS
|
||||||
|
"application/x-gzip", //NON-NLS
|
||||||
|
"application/zip", //NON-NLS
|
||||||
|
"application/x-zoo", //NON-NLS
|
||||||
|
"application/x-cpio", //NON-NLS
|
||||||
|
"application/x-shar", //NON-NLS
|
||||||
|
"application/x-tar", //NON-NLS
|
||||||
|
"application/x-bzip", //NON-NLS
|
||||||
|
"application/x-bzip2", //NON-NLS
|
||||||
|
"application/x-lzip", //NON-NLS
|
||||||
|
"application/x-lzma", //NON-NLS
|
||||||
|
"application/x-lzop", //NON-NLS
|
||||||
|
"application/x-z", //NON-NLS
|
||||||
|
"application/x-compress"); //NON-NLS
|
||||||
|
|
||||||
|
|
||||||
private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
|
private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
|
||||||
|
|
||||||
@ -74,11 +121,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
|
|||||||
.map(mt -> mt.getType() + "/" + mt.getSubtype())
|
.map(mt -> mt.getType() + "/" + mt.getSubtype())
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
@Override
|
|
||||||
public void logWarning(final String msg, Exception ex) {
|
|
||||||
tikaLogger.log(Level.WARNING, msg, ex);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a reader that will iterate over the text extracted from Apache
|
* Returns a reader that will iterate over the text extracted from Apache
|
||||||
* Tika.
|
* Tika.
|
||||||
@ -89,7 +131,7 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
|
|||||||
* @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
|
* @throws org.sleuthkit.autopsy.textextractors.TextExtractor.InitReaderException
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public Reader getReader(Content content) throws TextExtractorException {
|
public Reader getReader(Content content) throws InitReaderException {
|
||||||
ReadContentInputStream stream = new ReadContentInputStream(content);
|
ReadContentInputStream stream = new ReadContentInputStream(content);
|
||||||
|
|
||||||
Metadata metadata = new Metadata();
|
Metadata metadata = new Metadata();
|
||||||
@ -136,7 +178,7 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
|
|||||||
PushbackReader pushbackReader = new PushbackReader(tikaReader);
|
PushbackReader pushbackReader = new PushbackReader(tikaReader);
|
||||||
int read = pushbackReader.read();
|
int read = pushbackReader.read();
|
||||||
if (read == -1) {
|
if (read == -1) {
|
||||||
throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + content);
|
throw new InitReaderException("Unable to extract text: Tika returned empty reader for " + content);
|
||||||
}
|
}
|
||||||
pushbackReader.unread(read);
|
pushbackReader.unread(read);
|
||||||
|
|
||||||
@ -145,15 +187,13 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
|
|||||||
return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
|
return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
|
||||||
} catch (TimeoutException te) {
|
} catch (TimeoutException te) {
|
||||||
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
|
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
|
||||||
logWarning(msg, te);
|
throw new InitReaderException(msg, te);
|
||||||
throw new TextExtractorException(msg, te);
|
} catch (InitReaderException ex) {
|
||||||
} catch (TextExtractorException ex) {
|
|
||||||
throw ex;
|
throw ex;
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
|
tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
|
||||||
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
|
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
|
||||||
logWarning(msg, ex);
|
throw new InitReaderException(msg, ex);
|
||||||
throw new TextExtractorException(msg, ex);
|
|
||||||
} finally {
|
} finally {
|
||||||
future.cancel(true);
|
future.cancel(true);
|
||||||
}
|
}
|
||||||
@ -199,19 +239,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines if this extractor only understands a specific type of content.
|
|
||||||
*
|
|
||||||
* Although Apache Tika is defined for many input types, it is still a content
|
|
||||||
* specific approach to extraction.
|
|
||||||
*
|
|
||||||
* @return true
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean isContentTypeSpecific() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determines if Tika is supported for this content type and mimetype.
|
* Determines if Tika is supported for this content type and mimetype.
|
||||||
*
|
*
|
||||||
@ -222,8 +249,8 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
|
|||||||
@Override
|
@Override
|
||||||
public boolean isSupported(Content content, String detectedFormat) {
|
public boolean isSupported(Content content, String detectedFormat) {
|
||||||
if (detectedFormat == null
|
if (detectedFormat == null
|
||||||
|| ContentTextExtractor.BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
|
|| BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
|
||||||
|| ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
|
|| ARCHIVE_MIME_TYPES.contains(detectedFormat)
|
||||||
|| (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|
|| (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|
||||||
|| detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
|
|| detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
|
||||||
) {
|
) {
|
||||||
@ -232,19 +259,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
|
|||||||
return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
|
return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines if this extractor can be run.
|
|
||||||
*
|
|
||||||
* So long as Tika's dependencies are present, this extractor can run
|
|
||||||
* no matter the circumstance.
|
|
||||||
*
|
|
||||||
* @return true
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean isDisabled() {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return timeout that should be used to index the content.
|
* Return timeout that should be used to index the content.
|
||||||
*
|
*
|
||||||
|
@ -19,6 +19,7 @@
|
|||||||
package org.sleuthkit.autopsy.keywordsearch;
|
package org.sleuthkit.autopsy.keywordsearch;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
|
import java.io.Reader;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
@ -32,7 +33,6 @@ import org.sleuthkit.autopsy.healthmonitor.HealthMonitor;
|
|||||||
import org.sleuthkit.autopsy.healthmonitor.TimingMetric;
|
import org.sleuthkit.autopsy.healthmonitor.TimingMetric;
|
||||||
import org.sleuthkit.autopsy.ingest.IngestJobContext;
|
import org.sleuthkit.autopsy.ingest.IngestJobContext;
|
||||||
import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
|
import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
|
||||||
import org.sleuthkit.autopsy.textextractors.TextExtractor;
|
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||||
import org.sleuthkit.datamodel.Content;
|
import org.sleuthkit.datamodel.Content;
|
||||||
@ -106,8 +106,8 @@ class Ingester {
|
|||||||
* @throws IngesterException if there was an error processing a specific
|
* @throws IngesterException if there was an error processing a specific
|
||||||
* artifact, but the Solr server is probably fine.
|
* artifact, but the Solr server is probably fine.
|
||||||
*/
|
*/
|
||||||
void indexMetaDataOnly(BlackboardArtifact artifact, TextExtractor<Content> extractor) throws IngesterException {
|
void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
|
||||||
indexChunk("", extractor.getName(artifact), getContentFields(artifact));
|
indexChunk("", sourceName, getContentFields(artifact));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -142,23 +142,12 @@ class Ingester {
|
|||||||
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
|
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
|
||||||
*/
|
*/
|
||||||
// TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
|
// TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
|
||||||
< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
|
< T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
|
||||||
final long sourceID = extractor.getID(source);
|
|
||||||
final String sourceName = extractor.getName(source);
|
|
||||||
|
|
||||||
int numChunks = 0; //unknown until chunking is done
|
int numChunks = 0; //unknown until chunking is done
|
||||||
|
|
||||||
if (extractor.isDisabled()) {
|
|
||||||
/*
|
|
||||||
* some Extractors, notably the strings extractor, have options
|
|
||||||
* which can be configured such that no extraction should be done
|
|
||||||
*/
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
Map<String, String> fields = getContentFields(source);
|
Map<String, String> fields = getContentFields(source);
|
||||||
//Get a reader for the content of the given source
|
//Get a reader for the content of the given source
|
||||||
try (BufferedReader reader = new BufferedReader(extractor.getReader(source));) {
|
try (BufferedReader reader = new BufferedReader(sourceReader)) {
|
||||||
Chunker chunker = new Chunker(reader);
|
Chunker chunker = new Chunker(reader);
|
||||||
for (Chunk chunk : chunker) {
|
for (Chunk chunk : chunker) {
|
||||||
if (context != null && context.fileIngestIsCancelled()) {
|
if (context != null && context.fileIngestIsCancelled()) {
|
||||||
@ -173,18 +162,18 @@ class Ingester {
|
|||||||
indexChunk(chunk.toString(), sourceName, fields);
|
indexChunk(chunk.toString(), sourceName, fields);
|
||||||
numChunks++;
|
numChunks++;
|
||||||
} catch (Ingester.IngesterException ingEx) {
|
} catch (Ingester.IngesterException ingEx) {
|
||||||
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
|
logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
|
||||||
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
|
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
|
||||||
|
|
||||||
throw ingEx; //need to rethrow to signal error and move on
|
throw ingEx; //need to rethrow to signal error and move on
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (chunker.hasException()) {
|
if (chunker.hasException()) {
|
||||||
extractor.logWarning("Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
|
logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
||||||
return false;
|
return false;
|
||||||
} finally {
|
} finally {
|
||||||
if (context != null && context.fileIngestIsCancelled()) {
|
if (context != null && context.fileIngestIsCancelled()) {
|
||||||
|
@ -18,12 +18,14 @@
|
|||||||
*/
|
*/
|
||||||
package org.sleuthkit.autopsy.keywordsearch;
|
package org.sleuthkit.autopsy.keywordsearch;
|
||||||
|
|
||||||
import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
|
import com.google.common.collect.ImmutableList;
|
||||||
|
import java.io.Reader;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
|
import org.openide.util.Exceptions;
|
||||||
import org.openide.util.NbBundle;
|
import org.openide.util.NbBundle;
|
||||||
import org.openide.util.NbBundle.Messages;
|
import org.openide.util.NbBundle.Messages;
|
||||||
import org.sleuthkit.autopsy.casemodule.Case;
|
import org.sleuthkit.autopsy.casemodule.Case;
|
||||||
@ -37,16 +39,15 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
|
|||||||
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
|
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
|
||||||
import org.sleuthkit.autopsy.ingest.IngestServices;
|
import org.sleuthkit.autopsy.ingest.IngestServices;
|
||||||
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
||||||
|
import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
|
||||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
||||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
||||||
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
|
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
|
||||||
import org.sleuthkit.autopsy.textextractors.ExtractionContext;
|
import org.sleuthkit.autopsy.textextractors.ExtractionContext;
|
||||||
import org.sleuthkit.autopsy.textextractors.TextExtractor;
|
import org.sleuthkit.autopsy.textextractors.TextReader;
|
||||||
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
|
|
||||||
import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
|
import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
|
||||||
import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
|
import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
import org.sleuthkit.datamodel.Content;
|
|
||||||
import org.sleuthkit.datamodel.TskData;
|
import org.sleuthkit.datamodel.TskData;
|
||||||
import org.sleuthkit.datamodel.TskData.FileKnown;
|
import org.sleuthkit.datamodel.TskData.FileKnown;
|
||||||
|
|
||||||
@ -68,6 +69,43 @@ import org.sleuthkit.datamodel.TskData.FileKnown;
|
|||||||
})
|
})
|
||||||
public final class KeywordSearchIngestModule implements FileIngestModule {
|
public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||||
|
|
||||||
|
/** generally text extractors should ignore archives and let unpacking
|
||||||
|
* modules take care of them */
|
||||||
|
public static final List<String> ARCHIVE_MIME_TYPES
|
||||||
|
= ImmutableList.of(
|
||||||
|
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
|
||||||
|
"application/x-7z-compressed", //NON-NLS
|
||||||
|
"application/x-ace-compressed", //NON-NLS
|
||||||
|
"application/x-alz-compressed", //NON-NLS
|
||||||
|
"application/x-arj", //NON-NLS
|
||||||
|
"application/vnd.ms-cab-compressed", //NON-NLS
|
||||||
|
"application/x-cfs-compressed", //NON-NLS
|
||||||
|
"application/x-dgc-compressed", //NON-NLS
|
||||||
|
"application/x-apple-diskimage", //NON-NLS
|
||||||
|
"application/x-gca-compressed", //NON-NLS
|
||||||
|
"application/x-dar", //NON-NLS
|
||||||
|
"application/x-lzx", //NON-NLS
|
||||||
|
"application/x-lzh", //NON-NLS
|
||||||
|
"application/x-rar-compressed", //NON-NLS
|
||||||
|
"application/x-stuffit", //NON-NLS
|
||||||
|
"application/x-stuffitx", //NON-NLS
|
||||||
|
"application/x-gtar", //NON-NLS
|
||||||
|
"application/x-archive", //NON-NLS
|
||||||
|
"application/x-executable", //NON-NLS
|
||||||
|
"application/x-gzip", //NON-NLS
|
||||||
|
"application/zip", //NON-NLS
|
||||||
|
"application/x-zoo", //NON-NLS
|
||||||
|
"application/x-cpio", //NON-NLS
|
||||||
|
"application/x-shar", //NON-NLS
|
||||||
|
"application/x-tar", //NON-NLS
|
||||||
|
"application/x-bzip", //NON-NLS
|
||||||
|
"application/x-bzip2", //NON-NLS
|
||||||
|
"application/x-lzip", //NON-NLS
|
||||||
|
"application/x-lzma", //NON-NLS
|
||||||
|
"application/x-lzop", //NON-NLS
|
||||||
|
"application/x-z", //NON-NLS
|
||||||
|
"application/x-compress"); //NON-NLS
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Options for this extractor
|
* Options for this extractor
|
||||||
*/
|
*/
|
||||||
@ -104,7 +142,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
//accessed read-only by searcher thread
|
//accessed read-only by searcher thread
|
||||||
|
|
||||||
private boolean startedSearching = false;
|
private boolean startedSearching = false;
|
||||||
private TextExtractor<Content> stringExtractor;
|
private ExtractionContext stringsExtractionContext;
|
||||||
private final KeywordSearchJobSettings settings;
|
private final KeywordSearchJobSettings settings;
|
||||||
private boolean initialized = false;
|
private boolean initialized = false;
|
||||||
private long jobId;
|
private long jobId;
|
||||||
@ -250,7 +288,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ExtractionContext extractionContext = new ExtractionContext();
|
stringsExtractionContext = new ExtractionContext();
|
||||||
|
|
||||||
DefaultExtractionConfig stringsConfig = new DefaultExtractionConfig();
|
DefaultExtractionConfig stringsConfig = new DefaultExtractionConfig();
|
||||||
Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
|
Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
|
||||||
@ -258,9 +296,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
|
stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
|
||||||
stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts());
|
stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts());
|
||||||
|
|
||||||
extractionContext.set(DefaultExtractionConfig.class, stringsConfig);
|
stringsExtractionContext.set(DefaultExtractionConfig.class, stringsConfig);
|
||||||
|
|
||||||
stringExtractor = TextExtractorFactory.getDefaultExtractor(extractionContext);
|
|
||||||
indexer = new Indexer();
|
indexer = new Indexer();
|
||||||
initialized = true;
|
initialized = true;
|
||||||
}
|
}
|
||||||
@ -352,7 +389,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
* Common cleanup code when module stops or final searcher completes
|
* Common cleanup code when module stops or final searcher completes
|
||||||
*/
|
*/
|
||||||
private void cleanup() {
|
private void cleanup() {
|
||||||
stringExtractor = null;
|
stringsExtractionContext = null;
|
||||||
initialized = false;
|
initialized = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -440,7 +477,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
* @throws IngesterException exception thrown if indexing failed
|
* @throws IngesterException exception thrown if indexing failed
|
||||||
*/
|
*/
|
||||||
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
|
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
|
||||||
TextExtractor<Content> extractor = null;
|
|
||||||
ExtractionContext extractionContext = new ExtractionContext();
|
ExtractionContext extractionContext = new ExtractionContext();
|
||||||
|
|
||||||
ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig();
|
ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig();
|
||||||
@ -448,10 +484,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
|
extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
extractor = TextExtractorFactory.getContentSpecificExtractor(aFile,extractionContext);
|
Reader specializedReader = TextReader.getContentSpecificReader(aFile,extractionContext);
|
||||||
//divide into chunks and index
|
//divide into chunks and index
|
||||||
return Ingester.getDefault().indexText(extractor, aFile, context);
|
return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
|
||||||
} catch (TextExtractorFactory.NoContentSpecificExtractorException ex) {
|
} catch (TextReader.NoReaderFoundException ex) {
|
||||||
//No text extractor found... run the default instead
|
//No text extractor found... run the default instead
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -470,7 +506,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
if (context.fileIngestIsCancelled()) {
|
if (context.fileIngestIsCancelled()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) {
|
Reader stringsReader = TextReader.getDefaultReader(aFile, stringsExtractionContext);
|
||||||
|
if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
|
||||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
|
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
@ -530,7 +567,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
|
|
||||||
// we skip archive formats that are opened by the archive module.
|
// we skip archive formats that are opened by the archive module.
|
||||||
// @@@ We could have a check here to see if the archive module was enabled though...
|
// @@@ We could have a check here to see if the archive module was enabled though...
|
||||||
if (ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
|
if (ARCHIVE_MIME_TYPES.contains(fileType)) {
|
||||||
try {
|
try {
|
||||||
if (context.fileIngestIsCancelled()) {
|
if (context.fileIngestIsCancelled()) {
|
||||||
return;
|
return;
|
||||||
@ -579,11 +616,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
//should be ignored by the TextFileExtractor because they may contain more than one text encoding
|
//should be ignored by the TextFileExtractor because they may contain more than one text encoding
|
||||||
try {
|
try {
|
||||||
TextFileExtractor textFileExtractor = new TextFileExtractor();
|
TextFileExtractor textFileExtractor = new TextFileExtractor();
|
||||||
if (Ingester.getDefault().indexText(textFileExtractor, aFile, context)) {
|
Reader textReader = textFileExtractor.getReader(aFile);
|
||||||
|
if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
|
||||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
|
putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
|
||||||
wasTextAdded = true;
|
wasTextAdded = true;
|
||||||
}
|
}
|
||||||
} catch (IngesterException ex) {
|
} catch (IngesterException | TextFileExtractorException ex) {
|
||||||
logger.log(Level.WARNING, "Unable to index as unicode", ex);
|
logger.log(Level.WARNING, "Unable to index as unicode", ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.keywordsearch;
|
|||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
import java.lang.reflect.InvocationTargetException;
|
import java.lang.reflect.InvocationTargetException;
|
||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -33,7 +34,6 @@ import org.apache.commons.lang.math.NumberUtils;
|
|||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.solr.client.solrj.SolrServerException;
|
import org.apache.solr.client.solrj.SolrServerException;
|
||||||
import org.apache.solr.client.solrj.impl.HttpSolrServer;
|
import org.apache.solr.client.solrj.impl.HttpSolrServer;
|
||||||
import org.openide.util.Exceptions;
|
|
||||||
import org.openide.util.NbBundle;
|
import org.openide.util.NbBundle;
|
||||||
import org.openide.util.lookup.ServiceProvider;
|
import org.openide.util.lookup.ServiceProvider;
|
||||||
import org.openide.util.lookup.ServiceProviders;
|
import org.openide.util.lookup.ServiceProviders;
|
||||||
@ -46,8 +46,7 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
|
|||||||
import org.sleuthkit.autopsy.progress.ProgressIndicator;
|
import org.sleuthkit.autopsy.progress.ProgressIndicator;
|
||||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
||||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
||||||
import org.sleuthkit.autopsy.textextractors.TextExtractor;
|
import org.sleuthkit.autopsy.textextractors.TextReader;
|
||||||
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
|
|
||||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||||
import org.sleuthkit.datamodel.Content;
|
import org.sleuthkit.datamodel.Content;
|
||||||
import org.sleuthkit.datamodel.TskCoreException;
|
import org.sleuthkit.datamodel.TskCoreException;
|
||||||
@ -115,22 +114,23 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory
|
Reader blackboardReader = TextReader
|
||||||
.getContentSpecificExtractor(content, null);
|
.getContentSpecificReader(content, null);
|
||||||
ingester.indexMetaDataOnly(artifact, contentSpecificExtractor);
|
String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
|
||||||
ingester.indexText(contentSpecificExtractor, artifact, null);
|
ingester.indexMetaDataOnly(artifact, sourceName);
|
||||||
} catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) {
|
ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
|
||||||
|
} catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
|
||||||
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
try {
|
try {
|
||||||
TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory
|
Reader contentReader = TextReader
|
||||||
.getContentSpecificExtractor(content, null);
|
.getContentSpecificReader(content, null);
|
||||||
ingester.indexText(contentSpecificExtractor, content, null);
|
ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
|
||||||
} catch (TextExtractorFactory.NoContentSpecificExtractorException | Ingester.IngesterException ex) {
|
} catch (TextReader.NoReaderFoundException | Ingester.IngesterException ex) {
|
||||||
try {
|
try {
|
||||||
// Try the StringsTextExtractor if Tika extractions fails.
|
// Try the StringsTextExtractor if Tika extractions fails.
|
||||||
ingester.indexText(TextExtractorFactory.getDefaultExtractor(null), content, null);
|
ingester.indexText(TextReader.getDefaultReader(content, null),content.getId(),content.getName(), content, null);
|
||||||
} catch (Ingester.IngesterException ex1) {
|
} catch (Ingester.IngesterException ex1) {
|
||||||
throw new TskCoreException(ex.getCause().getMessage(), ex1);
|
throw new TskCoreException(ex.getCause().getMessage(), ex1);
|
||||||
}
|
}
|
||||||
@ -444,11 +444,12 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
|
|||||||
final Ingester ingester = Ingester.getDefault();
|
final Ingester ingester = Ingester.getDefault();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
TextExtractor<Content> contentSpecificExtractor =
|
String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
|
||||||
TextExtractorFactory.getContentSpecificExtractor((Content) artifact, null);
|
Reader contentSpecificReader =
|
||||||
ingester.indexMetaDataOnly(artifact, contentSpecificExtractor);
|
TextReader.getContentSpecificReader((Content) artifact, null);
|
||||||
ingester.indexText(contentSpecificExtractor, artifact, null);
|
ingester.indexMetaDataOnly(artifact, sourceName);
|
||||||
} catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) {
|
ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
|
||||||
|
} catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
|
||||||
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -21,19 +21,15 @@ import java.io.IOException;
|
|||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.logging.Level;
|
|
||||||
import org.apache.tika.parser.txt.CharsetDetector;
|
import org.apache.tika.parser.txt.CharsetDetector;
|
||||||
import org.apache.tika.parser.txt.CharsetMatch;
|
import org.apache.tika.parser.txt.CharsetMatch;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
|
||||||
import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
|
|
||||||
import org.sleuthkit.autopsy.textextractors.ExtractionContext;
|
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
import org.sleuthkit.datamodel.ReadContentInputStream;
|
import org.sleuthkit.datamodel.ReadContentInputStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract text from .txt files
|
* Extract text from .txt files
|
||||||
*/
|
*/
|
||||||
final class TextFileExtractor extends ContentTextExtractor<AbstractFile> {
|
final class TextFileExtractor {
|
||||||
|
|
||||||
//Set a Minimum confidence value to reject matches that may not have a valid text encoding
|
//Set a Minimum confidence value to reject matches that may not have a valid text encoding
|
||||||
//Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
|
//Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
|
||||||
@ -41,47 +37,30 @@ final class TextFileExtractor extends ContentTextExtractor<AbstractFile> {
|
|||||||
//This limited information was used to select the current value as one that would filter out clearly non-text
|
//This limited information was used to select the current value as one that would filter out clearly non-text
|
||||||
//files while hopefully working on all files with a valid text encoding
|
//files while hopefully working on all files with a valid text encoding
|
||||||
static final private int MIN_MATCH_CONFIDENCE = 20;
|
static final private int MIN_MATCH_CONFIDENCE = 20;
|
||||||
static final private Logger logger = Logger.getLogger(TextFileExtractor.class.getName());
|
|
||||||
|
|
||||||
@Override
|
public Reader getReader(AbstractFile source) throws TextFileExtractorException {
|
||||||
public boolean isContentTypeSpecific() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean isSupported(AbstractFile file, String detectedFormat) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Reader getReader(AbstractFile source) throws TextExtractorException {
|
|
||||||
CharsetDetector detector = new CharsetDetector();
|
CharsetDetector detector = new CharsetDetector();
|
||||||
//wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
|
//wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
|
||||||
InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
|
InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
|
||||||
try {
|
try {
|
||||||
detector.setText(stream);
|
detector.setText(stream);
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
throw new TextExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
|
throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
|
||||||
}
|
}
|
||||||
CharsetMatch match = detector.detect();
|
CharsetMatch match = detector.detect();
|
||||||
if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
|
if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
|
||||||
throw new TextExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
|
throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
|
||||||
}
|
}
|
||||||
|
|
||||||
return match.getReader();
|
return match.getReader();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
public class TextFileExtractorException extends Exception {
|
||||||
public boolean isDisabled() {
|
public TextFileExtractorException(String msg, Throwable ex) {
|
||||||
return false;
|
super(msg, ex);
|
||||||
|
}
|
||||||
|
public TextFileExtractorException(String msg) {
|
||||||
|
super(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void logWarning(String msg, Exception ex) {
|
|
||||||
logger.log(Level.WARNING, msg, ex);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setExtractionSettings(ExtractionContext context) {
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user