Complete overhaul of how extractors are currently implemented, hopefully converging to a more sensible solution

This commit is contained in:
U-BASIS\dsmyda 2018-12-07 13:26:52 -05:00
parent ce548fb978
commit ece50a3a00
12 changed files with 241 additions and 530 deletions

View File

@ -22,7 +22,6 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.logging.Level;
import org.apache.commons.io.IOUtils;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
@ -35,39 +34,27 @@ import org.sleuthkit.datamodel.TskCoreException;
* Extracts text from artifacts by concatenating the values of all of the
* artifact's attributes.
*/
class ArtifactTextExtractor<T extends Content> extends ContentTextExtractor<T> {
class ArtifactTextExtractor<T extends BlackboardArtifact> implements TextExtractor<T> {
static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
@Override
public boolean isDisabled() {
return false;
}
@Override
public void logWarning(final String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS }
}
private InputStream getInputStream(Content artifact) throws TextExtractorException {
BlackboardArtifact art = (BlackboardArtifact)artifact;
private InputStream getInputStream(BlackboardArtifact artifact) throws InitReaderException {
// Concatenate the string values of all attributes into a single
// "content" string to be indexed.
StringBuilder artifactContents = new StringBuilder();
Content dataSource = null;
try {
dataSource = art.getDataSource();
dataSource = artifact.getDataSource();
} catch (TskCoreException tskCoreException) {
throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
throw new InitReaderException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
}
if (dataSource == null) {
throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString());
throw new InitReaderException("Datasource was null for artifact: " + artifact.toString());
}
try {
for (BlackboardAttribute attribute : art.getAttributes()) {
for (BlackboardAttribute attribute : artifact.getAttributes()) {
artifactContents.append(attribute.getAttributeType().getDisplayName());
artifactContents.append(" : ");
// We have also discussed modifying BlackboardAttribute.getDisplayString()
@ -85,40 +72,31 @@ class ArtifactTextExtractor<T extends Content> extends ContentTextExtractor<T> {
artifactContents.append(System.lineSeparator());
}
} catch (TskCoreException tskCoreException) {
throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
throw new InitReaderException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
}
return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
}
@Override
public Reader getReader(Content source) throws TextExtractorException {
public Reader getReader(BlackboardArtifact source) throws InitReaderException {
return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
}
@Override
public long getID(Content source) {
BlackboardArtifact art = (BlackboardArtifact)source;
return art.getArtifactID();
}
@Override
public String getName(Content source) {
BlackboardArtifact art = (BlackboardArtifact)source;
return art.getDisplayName() + "_" + art.getArtifactID();
}
/**
* Configures this extractor using the settings stored in relevant config instances.
*
* This operation is a no-op since currently there are no configurable settings
* of the extraction process.
*
* @param context Instance containing file config settings
*/
@Override
public void setExtractionSettings(ExtractionContext context) {
}
@Override
public boolean isContentTypeSpecific() {
return true;
}
@Override
public boolean isSupported(Content file, String detectedFormat) {
public boolean isSupported(BlackboardArtifact file, String detectedFormat) {
return true;
}
}

View File

@ -1,130 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2018 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.textextractors;
import com.google.common.collect.ImmutableList;
import java.io.Reader;
import java.util.List;
import org.sleuthkit.datamodel.Content;
/**
* Common methods for utilities that extract text from content and divide it
* into chunks.
* @param <T> the subtype of Content this extractor processes
*/
public abstract class ContentTextExtractor<T extends Content> implements TextExtractor<T> {
//Mimetype groups to assist extractor implementations in ignoring binary and
//archive files.
public static final List<String> BINARY_MIME_TYPES
= ImmutableList.of(
//ignore binary blob data, for which string extraction will be used
"application/octet-stream", //NON-NLS
"application/x-msdownload"); //NON-NLS
/** generally text extractors should ignore archives and let unpacking
* modules take care of them */
public static final List<String> ARCHIVE_MIME_TYPES
= ImmutableList.of(
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
"application/x-7z-compressed", //NON-NLS
"application/x-ace-compressed", //NON-NLS
"application/x-alz-compressed", //NON-NLS
"application/x-arj", //NON-NLS
"application/vnd.ms-cab-compressed", //NON-NLS
"application/x-cfs-compressed", //NON-NLS
"application/x-dgc-compressed", //NON-NLS
"application/x-apple-diskimage", //NON-NLS
"application/x-gca-compressed", //NON-NLS
"application/x-dar", //NON-NLS
"application/x-lzx", //NON-NLS
"application/x-lzh", //NON-NLS
"application/x-rar-compressed", //NON-NLS
"application/x-stuffit", //NON-NLS
"application/x-stuffitx", //NON-NLS
"application/x-gtar", //NON-NLS
"application/x-archive", //NON-NLS
"application/x-executable", //NON-NLS
"application/x-gzip", //NON-NLS
"application/zip", //NON-NLS
"application/x-zoo", //NON-NLS
"application/x-cpio", //NON-NLS
"application/x-shar", //NON-NLS
"application/x-tar", //NON-NLS
"application/x-bzip", //NON-NLS
"application/x-bzip2", //NON-NLS
"application/x-lzip", //NON-NLS
"application/x-lzma", //NON-NLS
"application/x-lzop", //NON-NLS
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
/**
* Determines whether this extractor handles only the specific content
* types reported by isSupported(), or is a generic extractor (such as
* the strings extractor) that can run on any content type.
*
* @return true if this extractor is limited to specific content types,
* false if it is a generic content extractor
*/
public abstract boolean isContentTypeSpecific();
/**
* Determines if the file content is supported by the extractor; only
* meaningful when isContentTypeSpecific() returns true.
*
* @param file to test if its content should be supported
* @param detectedFormat mime-type with detected format (such as text/plain)
* or null if not detected
*
* @return true if the file content is supported, false otherwise
*/
public abstract boolean isSupported(T file, String detectedFormat);
/**
* Returns a reader that will iterate over the text of the source content.
*
* @param source Content source to read
* @return A reader that contains all source text
* @throws TextExtractorException Error encountered during extraction
*/
@Override
public abstract Reader getReader(T source) throws TextExtractorException;
/**
* Get the object id of the content source.
*
* @param source source content
* @return object id associated with this source content
*/
@Override
public long getID(T source) {
return source.getId();
}
/**
* Returns the human-readable name of the given content source.
*
* @param source source content
* @return name of source content
*/
@Override
public String getName(T source) {
return source.getName();
}
}

View File

@ -23,7 +23,6 @@ import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Config;
import net.htmlparser.jericho.LoggerProvider;
@ -33,13 +32,12 @@ import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ReadContentInputStream;
/**
* Extracts text from HTML content.
*/
final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T> {
final class HtmlTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
private final int MAX_SIZE;
@ -67,19 +65,6 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
MAX_SIZE = 50_000_000;
}
/**
* Determines if this extractor is responsible for extracting only a
* specific type of media.
*
* In this case, only HTML documents can be read successfully.
*
* @return true
*/
@Override
public boolean isContentTypeSpecific() {
return true;
}
/**
* Determines if this content type is supported by this extractor.
*
@ -89,7 +74,7 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
* @return flag indicating support
*/
@Override
public boolean isSupported(Content content, String detectedFormat) {
public boolean isSupported(AbstractFile content, String detectedFormat) {
return detectedFormat != null
&& WEB_MIME_TYPES.contains(detectedFormat)
&& content.getSize() <= MAX_SIZE;
@ -105,7 +90,7 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
* @throws TextExtractorException
*/
@Override
public Reader getReader(Content content) throws TextExtractorException {
public Reader getReader(AbstractFile content) throws InitReaderException {
//TODO JIRA-4467, there is only harm in excluding HTML documents greater
//than 50MB due to our troubled approach of extraction.
ReadContentInputStream stream = new ReadContentInputStream(content);
@ -201,25 +186,10 @@ final class HtmlTextExtractor<T extends Content> extends ContentTextExtractor<T>
// All done, now make it a reader
return new StringReader(stringBuilder.toString());
} catch (IOException ex) {
throw new TextExtractorException("Error extracting HTML from content.", ex);
throw new InitReaderException("Error extracting HTML from content.", ex);
}
}
/**
* Indicates if this extractor can run.
*
* @return Flag indicating if this extractor can run.
*/
@Override
public boolean isDisabled() {
return false;
}
@Override
public void logWarning(final String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS }
}
/**
* Determines how the extraction process will proceed given the settings
* stored in this context instance.

View File

@ -28,7 +28,6 @@ import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
/**
* Extracts text from SQLite database files.
@ -39,48 +38,10 @@ import org.sleuthkit.datamodel.Content;
* 2) Tables that contain spaces in their name are not extracted
* 3) Table names are not included in its output text
*/
final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<T> {
final class SqliteTextExtractor<T extends AbstractFile> implements TextExtractor<T> {
private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
private static boolean isDisabled;
static {
try {
Class.forName("org.sqlite.JDBC");
isDisabled = false;
} catch (ClassNotFoundException ex) {
logger.log(Level.SEVERE, "Sqlite JDBC class could not be found, "
+ "SqliteTextExtractor is automatically disabling.", ex); //NON-NLS
isDisabled = true;
}
}
/**
* This extractor only works for sqlite files, so it is indeed content type
* specific.
*
* @return true
*/
@Override
public boolean isContentTypeSpecific() {
return true;
}
/**
* Determines if this extractor is fit to run.
*
* @return Flag indicating if it should or shouldn't be run.
*/
@Override
public boolean isDisabled() {
return isDisabled;
}
@Override
public void logWarning(String msg, Exception exception) {
logger.log(Level.WARNING, msg, exception); //NON-NLS
}
/**
* Supports only the sqlite mimetypes
@ -91,7 +52,7 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
* @return true if x-sqlite3
*/
@Override
public boolean isSupported(Content file, String detectedFormat) {
public boolean isSupported(AbstractFile file, String detectedFormat) {
return SQLITE_MIMETYPE.equals(detectedFormat);
}
@ -105,12 +66,8 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
* @throws TextExtractorException
*/
@Override
public Reader getReader(Content source) throws TextExtractorException {
if(source instanceof AbstractFile) {
return new SQLiteStreamReader((AbstractFile)source);
}
throw new TextExtractorException(String.format("Source content with name [%s] and id=[%d] was not of type"
+ " AbstractFile.", source.getName(), source.getId()));
public Reader getReader(AbstractFile source) throws InitReaderException {
return new SQLiteStreamReader(source);
}
/**
@ -125,7 +82,7 @@ final class SqliteTextExtractor<T extends Content> extends ContentTextExtractor<
@Override
public void setExtractionSettings(ExtractionContext context) {
}
/**
* Produces a continuous stream of characters from a database file. To
achieve this, all table names are queued up and a SQLiteTableReader is

View File

@ -25,7 +25,6 @@ import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
@ -37,41 +36,12 @@ import org.sleuthkit.datamodel.TskException;
/**
* Extracts raw strings from content.
*/
final class StringsTextExtractor<T extends Content> extends ContentTextExtractor<T> {
final class StringsTextExtractor {
static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
private boolean extractUTF8;
private boolean extractUTF16;
private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
/**
* Determines if this extractor may only read particular types of content.
*
* Since Strings may be run on any content type, it is not content specific.
*
* @return false
*/
@Override
public boolean isContentTypeSpecific() {
return false;
}
/**
* Determines if this extractor can read the content type.
*
* Note: Strings can be run on any type of content, so all types will return
* true.
*
* @param file Content source to read
* @param detectedFormat Mimetype of source file.
*
* @return true
*/
@Override
public boolean isSupported(Content file, String detectedFormat) {
return true;
}
private final List<SCRIPT> extractScripts = new ArrayList<>();
/**
@ -99,33 +69,6 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
this.extractScripts.addAll(extractScripts);
}
/**
* Get the currently used scripts for extraction
*
* @return scripts currently used or null if not supported
*/
public List<SCRIPT> getScripts() {
return new ArrayList<>(extractScripts);
}
@Override
public void logWarning(final String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS }
}
/**
* Determines if this extractor should be run or not.
*
* At least one of the extraction encodings in DefaultExtractionConfig must
* be set for this extractor to run.
*
* @return Flag indicating if this extractor should be run.
*/
@Override
public boolean isDisabled() {
return extractUTF8 == false && extractUTF16 == false;
}
/**
* Returns a reader that will iterate over the text of the content source.
*
@ -136,8 +79,7 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
* @throws
* org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
*/
@Override
public InputStreamReader getReader(Content content) throws TextExtractorException {
public InputStreamReader getReader(Content content) {
InputStream stringStream = getInputStream(content);
return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
}
@ -160,7 +102,6 @@ final class StringsTextExtractor<T extends Content> extends ContentTextExtractor
*
* @param context Instance containing config classes
*/
@Override
public void setExtractionSettings(ExtractionContext context) {
if (context != null && context.contains(DefaultExtractionConfig.class)) {
DefaultExtractionConfig configInstance = context.get(DefaultExtractionConfig.class);

View File

@ -19,7 +19,6 @@
package org.sleuthkit.autopsy.textextractors;
import java.io.Reader;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
/**
* Extracts text out of a SleuthkitVisitableItem, and exposes it as a Reader.
@ -28,23 +27,19 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
* @param <T> The subtype of SleuthkitVisitableItem an implementation is able to
* process.
*/
public interface TextExtractor<T extends SleuthkitVisitableItem> {
/**
* Is this extractor configured such that no extraction will/should be done?
interface TextExtractor<T> {
/**
* Determines if the file content is supported by the extractor.
*
* @return True if this extractor will/should not perform any extraction.
*/
boolean isDisabled();
/**
* Log the given message and exception as a warning.
* @param file to test if its content should be supported
* @param detectedFormat mime-type with detected format (such as text/plain)
* or null if not detected
*
* @param msg Log message
* @param ex Exception associated with the incoming message
* @return true if the file content is supported, false otherwise
*/
void logWarning(String msg, Exception ex);
public abstract boolean isSupported(T file, String detectedFormat);
/**
* Get a reader that will iterate over the text extracted from the given
* source.
@ -53,28 +48,8 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
*
* @return Reader instance that contains the text of the source
*
* @throws TextExtractorException
*/
Reader getReader(T source) throws TextExtractorException;
/**
* Get the 'object' id of the given source.
*
* @param source Source content of type T
*
* @return Object id of the source content
*/
long getID(T source);
/**
* Get a human readable name for the given source.
*
* @param source Source content of type T
*
* @return Name of the content source
*/
String getName(T source);
Reader getReader(T source) throws InitReaderException;
/**
* Determines how the extraction process will proceed given the settings
@ -85,18 +60,18 @@ public interface TextExtractor<T extends SleuthkitVisitableItem> {
* @param context Instance containing file config classes
*/
void setExtractionSettings(ExtractionContext context);
/**
* System exception for dealing with errors encountered during extraction.
*/
class TextExtractorException extends Exception {
public TextExtractorException(String message) {
super(message);
public class InitReaderException extends Exception {
public InitReaderException(String msg, Throwable ex) {
super(msg, ex);
}
public TextExtractorException(String message, Throwable cause) {
super(message, cause);
public InitReaderException(Throwable ex) {
super(ex);
}
public InitReaderException(String msg) {
super(msg);
}
}
}

View File

@ -18,6 +18,7 @@
*/
package org.sleuthkit.autopsy.textextractors;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;
import org.sleuthkit.datamodel.AbstractFile;
@ -31,8 +32,13 @@ import org.sleuthkit.datamodel.Report;
* See ContentTextExtractor interface for the generic structure of such
* extractors.
*/
public class TextExtractorFactory {
public class TextReader {
private final static List<TextExtractor<AbstractFile>> fileExtractors = Arrays.asList(
new HtmlTextExtractor<>(),
new SqliteTextExtractor<>(),
new TikaTextExtractor<>()
);
/**
* Auto detects the correct text extractor given the file.
*
@ -41,40 +47,42 @@ public class TextExtractorFactory {
* will keep the extractors at default settings. Refer to the
* extractionconfigs package for available file configurations.
*
* @param <T> Type of source content
* @param file Content source that will be read from
* @param context Contains extraction configurations for certain file types
*
* @return A ContentTextExtractor instance that is properly configured and
* can be read from the getReader() method.
*
* @throws NoContentSpecificExtractorException In the event that the
* @throws NoReaderFoundException In the event that the
* inputted file and mimetype
* have no corresponding
* extractor
*/
public static <T extends Content> ContentTextExtractor<T> getContentSpecificExtractor(T file,
ExtractionContext context) throws NoContentSpecificExtractorException {
if (file instanceof AbstractFile) {
List<ContentTextExtractor<T>> fileExtractors = getAbstractFileExtractors();
String mimeType = ((AbstractFile) file).getMIMEType();
for (ContentTextExtractor<T> candidate : fileExtractors) {
candidate.setExtractionSettings(context);
if (candidate.isSupported(file, mimeType)) {
return candidate;
public static Reader getContentSpecificReader(Content file,
ExtractionContext context) throws NoReaderFoundException {
try {
if (file instanceof AbstractFile) {
String mimeType = ((AbstractFile) file).getMIMEType();
for (TextExtractor<AbstractFile> candidate : fileExtractors) {
candidate.setExtractionSettings(context);
if (candidate.isSupported((AbstractFile)file, mimeType)) {
return candidate.getReader((AbstractFile)file);
}
}
} else if (file instanceof BlackboardArtifact) {
TextExtractor<BlackboardArtifact> artifactExtractor = new ArtifactTextExtractor<>();
artifactExtractor.setExtractionSettings(context);
return artifactExtractor.getReader((BlackboardArtifact)file);
} else if (file instanceof Report) {
TextExtractor<Report> reportExtractor = new TikaTextExtractor<>();
reportExtractor.setExtractionSettings(context);
reportExtractor.getReader((Report)file);
}
} else if (file instanceof BlackboardArtifact) {
ContentTextExtractor<T> artifactExtractor = new ArtifactTextExtractor<>();
artifactExtractor.setExtractionSettings(context);
return artifactExtractor;
} else if (file instanceof Report) {
ContentTextExtractor<T> reportExtractor = new TikaTextExtractor<>();
reportExtractor.setExtractionSettings(context);
return reportExtractor;
} catch (TextExtractor.InitReaderException ex) {
throw new NoReaderFoundException(ex);
}
throw new NoContentSpecificExtractorException(
throw new NoReaderFoundException(
String.format("Could not find a suitable extractor for "
+ "file with name [%s] and id=[%d]. Try using the default, "
+ "non content specific extractor as an alternative.",
@ -82,43 +90,34 @@ public class TextExtractorFactory {
);
}
/**
* Instantiates and returns a list of all of the known abstract file
* extractors.
*
* @return A list of specialized ContentTextExtractors
*/
private static <T extends Content> List<ContentTextExtractor<T>> getAbstractFileExtractors() {
return Arrays.asList(
new HtmlTextExtractor<>(),
new SqliteTextExtractor<>(),
new TikaTextExtractor<>()
);
}
/**
* Returns the default extractor that can be run on any content type. This
* extractor should be used as a backup in the event that no specialized
* extractor can be found.
*
* @param source
* @param context Contains extraction configurations for certain file types
*
* @return A DefaultExtractor instance
*/
public static ContentTextExtractor<Content> getDefaultExtractor(ExtractionContext context) {
ContentTextExtractor<Content> stringsInstance = new StringsTextExtractor<>();
public static Reader getDefaultReader(Content source, ExtractionContext context) {
StringsTextExtractor stringsInstance = new StringsTextExtractor();
stringsInstance.setExtractionSettings(context);
return stringsInstance;
return stringsInstance.getReader(source);
}
/**
* System level exception for handling content types that have no specific
* strategy defined for extracting their text.
*/
public static class NoContentSpecificExtractorException extends Exception {
public static class NoReaderFoundException extends Exception {
public NoContentSpecificExtractorException(String msg) {
public NoReaderFoundException(String msg) {
super(msg);
}
public NoReaderFoundException(Throwable ex) {
super(ex);
}
}
}

View File

@ -18,6 +18,7 @@
*/
package org.sleuthkit.autopsy.textextractors;
import com.google.common.collect.ImmutableList;
import com.google.common.io.CharSource;
import java.io.File;
import java.io.IOException;
@ -54,7 +55,53 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
* Extracts text from Tika supported content. Protects against Tika
* parser hangs (for unexpected/corrupt content) using a timeout mechanism.
*/
final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T> {
final class TikaTextExtractor<T extends Content> implements TextExtractor<T> {
//Mimetype groups to assist extractor implementations in ignoring binary and
//archive files.
private static final List<String> BINARY_MIME_TYPES
= ImmutableList.of(
//ignore binary blob data, for which string extraction will be used
"application/octet-stream", //NON-NLS
"application/x-msdownload"); //NON-NLS
/** generally text extractors should ignore archives and let unpacking
* modules take care of them */
private static final List<String> ARCHIVE_MIME_TYPES
= ImmutableList.of(
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
"application/x-7z-compressed", //NON-NLS
"application/x-ace-compressed", //NON-NLS
"application/x-alz-compressed", //NON-NLS
"application/x-arj", //NON-NLS
"application/vnd.ms-cab-compressed", //NON-NLS
"application/x-cfs-compressed", //NON-NLS
"application/x-dgc-compressed", //NON-NLS
"application/x-apple-diskimage", //NON-NLS
"application/x-gca-compressed", //NON-NLS
"application/x-dar", //NON-NLS
"application/x-lzx", //NON-NLS
"application/x-lzh", //NON-NLS
"application/x-rar-compressed", //NON-NLS
"application/x-stuffit", //NON-NLS
"application/x-stuffitx", //NON-NLS
"application/x-gtar", //NON-NLS
"application/x-archive", //NON-NLS
"application/x-executable", //NON-NLS
"application/x-gzip", //NON-NLS
"application/zip", //NON-NLS
"application/x-zoo", //NON-NLS
"application/x-cpio", //NON-NLS
"application/x-shar", //NON-NLS
"application/x-tar", //NON-NLS
"application/x-bzip", //NON-NLS
"application/x-bzip2", //NON-NLS
"application/x-lzip", //NON-NLS
"application/x-lzma", //NON-NLS
"application/x-lzop", //NON-NLS
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
@ -74,11 +121,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
.map(mt -> mt.getType() + "/" + mt.getSubtype())
.collect(Collectors.toList());
@Override
public void logWarning(final String msg, Exception ex) {
tikaLogger.log(Level.WARNING, msg, ex);
}
/**
* Returns a reader that will iterate over the text extracted from Apache
* Tika.
@ -89,7 +131,7 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
* @throws org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
*/
@Override
public Reader getReader(Content content) throws TextExtractorException {
public Reader getReader(Content content) throws InitReaderException {
ReadContentInputStream stream = new ReadContentInputStream(content);
Metadata metadata = new Metadata();
@ -136,7 +178,7 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
PushbackReader pushbackReader = new PushbackReader(tikaReader);
int read = pushbackReader.read();
if (read == -1) {
throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + content);
throw new InitReaderException("Unable to extract text: Tika returned empty reader for " + content);
}
pushbackReader.unread(read);
@ -145,15 +187,13 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
} catch (TimeoutException te) {
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
logWarning(msg, te);
throw new TextExtractorException(msg, te);
} catch (TextExtractorException ex) {
throw new InitReaderException(msg, te);
} catch (InitReaderException ex) {
throw ex;
} catch (Exception ex) {
tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
logWarning(msg, ex);
throw new TextExtractorException(msg, ex);
throw new InitReaderException(msg, ex);
} finally {
future.cancel(true);
}
@ -199,19 +239,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
));
}
/**
* Determines if this extractor only understands a specific type of content.
*
* Although Apache Tika is defined for many input types, it is still a content
* specific approach to extraction.
*
* @return true
*/
@Override
public boolean isContentTypeSpecific() {
return true;
}
/**
* Determines if Tika is supported for this content type and mimetype.
*
@ -222,8 +249,8 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
@Override
public boolean isSupported(Content content, String detectedFormat) {
if (detectedFormat == null
|| ContentTextExtractor.BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
|| ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
|| BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
|| ARCHIVE_MIME_TYPES.contains(detectedFormat)
|| (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|| detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
) {
@ -232,19 +259,6 @@ final class TikaTextExtractor<T extends Content> extends ContentTextExtractor<T>
return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
}
/**
* Determines if this extractor can be run.
*
* So long as Tika's dependencies are present, this extractor can run
* no matter the circumstance.
*
* @return true
*/
@Override
public boolean isDisabled() {
return false;
}
/**
* Return timeout that should be used to index the content.
*

View File

@ -19,6 +19,7 @@
package org.sleuthkit.autopsy.keywordsearch;
import java.io.BufferedReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
@ -32,7 +33,6 @@ import org.sleuthkit.autopsy.healthmonitor.HealthMonitor;
import org.sleuthkit.autopsy.healthmonitor.TimingMetric;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content;
@ -106,8 +106,8 @@ class Ingester {
* @throws IngesterException if there was an error processing a specific
* artifact, but the Solr server is probably fine.
*/
void indexMetaDataOnly(BlackboardArtifact artifact, TextExtractor<Content> extractor) throws IngesterException {
indexChunk("", extractor.getName(artifact), getContentFields(artifact));
void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
indexChunk("", sourceName, getContentFields(artifact));
}
/**
@ -142,23 +142,12 @@ class Ingester {
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
// TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
final long sourceID = extractor.getID(source);
final String sourceName = extractor.getName(source);
< T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
int numChunks = 0; //unknown until chunking is done
if (extractor.isDisabled()) {
/*
* some Extractors, notable the strings extractor, have options
* which can be configured such that no extraction should be done
*/
return true;
}
Map<String, String> fields = getContentFields(source);
//Get a reader for the content of the given source
try (BufferedReader reader = new BufferedReader(extractor.getReader(source));) {
try (BufferedReader reader = new BufferedReader(sourceReader)) {
Chunker chunker = new Chunker(reader);
for (Chunk chunk : chunker) {
if (context != null && context.fileIngestIsCancelled()) {
@ -173,18 +162,18 @@ class Ingester {
indexChunk(chunk.toString(), sourceName, fields);
numChunks++;
} catch (Ingester.IngesterException ingEx) {
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
throw ingEx; //need to rethrow to signal error and move on
}
}
if (chunker.hasException()) {
extractor.logWarning("Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
return false;
}
} catch (Exception ex) {
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false;
} finally {
if (context != null && context.fileIngestIsCancelled()) {

View File

@ -18,12 +18,14 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
import com.google.common.collect.ImmutableList;
import java.io.Reader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import org.openide.util.Exceptions;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
import org.sleuthkit.autopsy.casemodule.Case;
@ -37,16 +39,15 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textextractors.ExtractionContext;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.autopsy.textextractors.TextReader;
import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
import org.sleuthkit.autopsy.textextractors.extractionconfigs.DefaultExtractionConfig;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown;
@ -68,6 +69,43 @@ import org.sleuthkit.datamodel.TskData.FileKnown;
})
public final class KeywordSearchIngestModule implements FileIngestModule {
/**
 * MIME types of archive/container formats that text extraction should
 * generally skip: dedicated unpacking modules are responsible for
 * expanding these so that their contents can be indexed individually.
 * String extraction or an unzipper handles such data better than a
 * text parser would.
 */
public static final List<String> ARCHIVE_MIME_TYPES
= ImmutableList.of(
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
"application/x-7z-compressed", //NON-NLS
"application/x-ace-compressed", //NON-NLS
"application/x-alz-compressed", //NON-NLS
"application/x-arj", //NON-NLS
"application/vnd.ms-cab-compressed", //NON-NLS
"application/x-cfs-compressed", //NON-NLS
"application/x-dgc-compressed", //NON-NLS
"application/x-apple-diskimage", //NON-NLS
"application/x-gca-compressed", //NON-NLS
"application/x-dar", //NON-NLS
"application/x-lzx", //NON-NLS
"application/x-lzh", //NON-NLS
"application/x-rar-compressed", //NON-NLS
"application/x-stuffit", //NON-NLS
"application/x-stuffitx", //NON-NLS
"application/x-gtar", //NON-NLS
"application/x-archive", //NON-NLS
"application/x-executable", //NON-NLS
"application/x-gzip", //NON-NLS
"application/zip", //NON-NLS
"application/x-zoo", //NON-NLS
"application/x-cpio", //NON-NLS
"application/x-shar", //NON-NLS
"application/x-tar", //NON-NLS
"application/x-bzip", //NON-NLS
"application/x-bzip2", //NON-NLS
"application/x-lzip", //NON-NLS
"application/x-lzma", //NON-NLS
"application/x-lzop", //NON-NLS
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
/**
* Options for this extractor
*/
@ -104,7 +142,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
//accessed read-only by searcher thread
private boolean startedSearching = false;
private TextExtractor<Content> stringExtractor;
private ExtractionContext stringsExtractionContext;
private final KeywordSearchJobSettings settings;
private boolean initialized = false;
private long jobId;
@ -250,7 +288,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
}
}
ExtractionContext extractionContext = new ExtractionContext();
stringsExtractionContext = new ExtractionContext();
DefaultExtractionConfig stringsConfig = new DefaultExtractionConfig();
Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
@ -258,9 +296,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
stringsConfig.setExtractScripts(KeywordSearchSettings.getStringExtractScripts());
extractionContext.set(DefaultExtractionConfig.class, stringsConfig);
stringsExtractionContext.set(DefaultExtractionConfig.class, stringsConfig);
stringExtractor = TextExtractorFactory.getDefaultExtractor(extractionContext);
indexer = new Indexer();
initialized = true;
}
@ -352,7 +389,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
* Common cleanup code when module stops or final searcher completes
*/
private void cleanup() {
stringExtractor = null;
stringsExtractionContext = null;
initialized = false;
}
@ -440,7 +477,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
* @throws IngesterException exception thrown if indexing failed
*/
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
TextExtractor<Content> extractor = null;
ExtractionContext extractionContext = new ExtractionContext();
ImageFileExtractionConfig imageConfig = new ImageFileExtractionConfig();
@ -448,10 +484,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
extractionContext.set(ImageFileExtractionConfig.class, imageConfig);
try {
extractor = TextExtractorFactory.getContentSpecificExtractor(aFile,extractionContext);
Reader specializedReader = TextReader.getContentSpecificReader(aFile,extractionContext);
//divide into chunks and index
return Ingester.getDefault().indexText(extractor, aFile, context);
} catch (TextExtractorFactory.NoContentSpecificExtractorException ex) {
return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
} catch (TextReader.NoReaderFoundException ex) {
//No text extractor found... run the default instead
return false;
}
@ -470,7 +506,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if (context.fileIngestIsCancelled()) {
return true;
}
if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) {
Reader stringsReader = TextReader.getDefaultReader(aFile, stringsExtractionContext);
if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
return true;
} else {
@ -530,7 +567,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
// we skip archive formats that are opened by the archive module.
// @@@ We could have a check here to see if the archive module was enabled though...
if (ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
if (ARCHIVE_MIME_TYPES.contains(fileType)) {
try {
if (context.fileIngestIsCancelled()) {
return;
@ -579,11 +616,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
//should be ignored by the TextFileExtractor because they may contain more than one text encoding
try {
TextFileExtractor textFileExtractor = new TextFileExtractor();
if (Ingester.getDefault().indexText(textFileExtractor, aFile, context)) {
Reader textReader = textFileExtractor.getReader(aFile);
if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
wasTextAdded = true;
}
} catch (IngesterException ex) {
} catch (IngesterException | TextFileExtractorException ex) {
logger.log(Level.WARNING, "Unable to index as unicode", ex);
}
}

View File

@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException;
import java.net.InetAddress;
import java.util.ArrayList;
@ -33,7 +34,6 @@ import org.apache.commons.lang.math.NumberUtils;
import org.apache.commons.io.FileUtils;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.openide.util.Exceptions;
import org.openide.util.NbBundle;
import org.openide.util.lookup.ServiceProvider;
import org.openide.util.lookup.ServiceProviders;
@ -46,8 +46,7 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
import org.sleuthkit.autopsy.progress.ProgressIndicator;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.autopsy.textextractors.TextReader;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
@ -115,22 +114,23 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
return;
}
try {
TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory
.getContentSpecificExtractor(content, null);
ingester.indexMetaDataOnly(artifact, contentSpecificExtractor);
ingester.indexText(contentSpecificExtractor, artifact, null);
} catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) {
Reader blackboardReader = TextReader
.getContentSpecificReader(content, null);
String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
ingester.indexMetaDataOnly(artifact, sourceName);
ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
} catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
throw new TskCoreException(ex.getCause().getMessage(), ex);
}
} else {
try {
TextExtractor<Content> contentSpecificExtractor = TextExtractorFactory
.getContentSpecificExtractor(content, null);
ingester.indexText(contentSpecificExtractor, content, null);
} catch (TextExtractorFactory.NoContentSpecificExtractorException | Ingester.IngesterException ex) {
Reader contentReader = TextReader
.getContentSpecificReader(content, null);
ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
} catch (TextReader.NoReaderFoundException | Ingester.IngesterException ex) {
try {
// Try the StringsTextExtractor if Tika extractions fails.
ingester.indexText(TextExtractorFactory.getDefaultExtractor(null), content, null);
ingester.indexText(TextReader.getDefaultReader(content, null),content.getId(),content.getName(), content, null);
} catch (Ingester.IngesterException ex1) {
throw new TskCoreException(ex.getCause().getMessage(), ex1);
}
@ -444,11 +444,12 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
final Ingester ingester = Ingester.getDefault();
try {
TextExtractor<Content> contentSpecificExtractor =
TextExtractorFactory.getContentSpecificExtractor((Content) artifact, null);
ingester.indexMetaDataOnly(artifact, contentSpecificExtractor);
ingester.indexText(contentSpecificExtractor, artifact, null);
} catch (Ingester.IngesterException | TextExtractorFactory.NoContentSpecificExtractorException ex) {
String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
Reader contentSpecificReader =
TextReader.getContentSpecificReader((Content) artifact, null);
ingester.indexMetaDataOnly(artifact, sourceName);
ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
} catch (Ingester.IngesterException | TextReader.NoReaderFoundException ex) {
throw new TskCoreException(ex.getCause().getMessage(), ex);
}
}

View File

@ -21,19 +21,15 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedInputStream;
import java.io.Reader;
import java.util.logging.Level;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.textextractors.ContentTextExtractor;
import org.sleuthkit.autopsy.textextractors.ExtractionContext;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
/**
* Extract text from .txt files
*/
final class TextFileExtractor extends ContentTextExtractor<AbstractFile> {
final class TextFileExtractor {
//Set a Minimum confidence value to reject matches that may not have a valid text encoding
//Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
@ -41,47 +37,30 @@ final class TextFileExtractor extends ContentTextExtractor<AbstractFile> {
//This limited information was used to select the current value as one that would filter out clearly non-text
//files while hopefully working on all files with a valid text encoding
static final private int MIN_MATCH_CONFIDENCE = 20;
static final private Logger logger = Logger.getLogger(TextFileExtractor.class.getName());
@Override
public boolean isContentTypeSpecific() {
// Always true: this extractor is specialized for plain-text content.
return true;
}
@Override
public boolean isSupported(AbstractFile file, String detectedFormat) {
// Unconditionally accepts the file; both parameters are ignored.
// NOTE(review): callers presumably route only text-file candidates here —
// confirm detectedFormat really never needs to be checked.
return true;
}
@Override
public Reader getReader(AbstractFile source) throws TextExtractorException {
public Reader getReader(AbstractFile source) throws TextFileExtractorException {
CharsetDetector detector = new CharsetDetector();
//wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
try {
detector.setText(stream);
} catch (IOException ex) {
throw new TextExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
}
CharsetMatch match = detector.detect();
if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
throw new TextExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
}
return match.getReader();
}
@Override
public boolean isDisabled() {
// This extractor has no configuration options that could disable it.
return false;
}
@Override
public void logWarning(String msg, Exception ex) {
// Route extractor warnings through this class's Autopsy logger at WARNING level.
logger.log(Level.WARNING, msg, ex);
}
@Override
public void setExtractionSettings(ExtractionContext context) {
/**
 * Thrown when text cannot be extracted from a file: either the content
 * stream could not be read, or no character set matched with high enough
 * confidence to decode it.
 *
 * NOTE(review): as a non-static inner class this carries an implicit
 * reference to the enclosing TextFileExtractor — consider declaring it
 * static if no outer state is ever needed.
 */
public class TextFileExtractorException extends Exception {
// Wraps a lower-level cause (e.g. an IOException from the charset detector).
public TextFileExtractorException(String msg, Throwable ex) {
super(msg, ex);
}
// Used when the failure has no underlying exception (e.g. low charset confidence).
public TextFileExtractorException(String msg) {
super(msg);
}
}
}