move all 'appendix' related code into TikaTextExtractor and simplify TextExtractor interface.

millmanorama 2016-12-16 14:24:01 +01:00
parent 8841f6e773
commit f56c2b43c8
8 changed files with 74 additions and 82 deletions
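For orientation, the net effect on the extractor contract can be summed up with a sketch. The following is reconstructed from the hunks below and is approximate (modifiers, member order, and logging helpers are not shown exactly):

import java.io.InputStream;
import java.io.Reader;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;

// Sketch only: the simplified extractor contract after this commit, per the hunks below.
abstract class TextExtractor<TextSource extends SleuthkitVisitableItem> {

    abstract boolean noExtractionOptionsAreEnabled();

    abstract InputStream getInputStream(TextSource source);

    // The AppendixProvider parameter is gone; any "appendix" (e.g. Tika metadata)
    // is now folded into the Reader by the concrete extractor.
    abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;

    abstract long getID(TextSource source);

    abstract String getName(TextSource source);
}

Everything appendix-related (newAppendixProvider, appendDataToFinalChunk, the extra getReader parameter) moves out of this contract and into TikaTextExtractor.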

ArtifactTextExtractor.java

@@ -33,7 +33,7 @@ import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;
public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifact> {
public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
@@ -70,10 +70,6 @@ public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifac
return false;
}
@Override
Void newAppendixProvider() {
return null;
}
@Override
InputStream getInputStream(BlackboardArtifact artifact) {
@@ -118,7 +114,7 @@ public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifac
}
@Override
Reader getReader(InputStream stream, BlackboardArtifact source, Void appendix) throws Ingester.IngesterException {
Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
return new InputStreamReader(stream);
}

FileTextExtractor.java

@@ -18,6 +18,8 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.InputStream;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;
import org.sleuthkit.datamodel.AbstractFile;
@@ -26,7 +28,7 @@ import org.sleuthkit.datamodel.AbstractFile;
* Common methods for utilities that extract text and content and divide into
* chunks
*/
abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {
abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
static final List<String> BLOB_MIME_TYPES
@@ -93,6 +95,9 @@ abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<Appendi
*/
abstract boolean isSupported(AbstractFile file, String detectedFormat);
@Override
abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;
@Override
long getID(AbstractFile source) {
return source.getId();
@@ -103,4 +108,5 @@ abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<Appendi
String getName(AbstractFile source) {
return source.getName();
}
}

HtmlTextExtractor.java

@@ -37,7 +37,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
* divided into chunks and indexed with Solr. If HTML extraction succeeds,
* chunks are indexed with Solr.
*/
class HtmlTextExtractor extends FileTextExtractor<Void> {
class HtmlTextExtractor extends FileTextExtractor {
static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
private static final int MAX_SIZE = 50000000;
@@ -54,7 +54,6 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
HtmlTextExtractor() {
}
@Override
boolean isContentTypeSpecific() {
return true;
@@ -76,7 +75,7 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
* @throws IOException if There is an IOException parsing the input stream.
*/
@Override
Reader getReader(InputStream in, AbstractFile sourceFile, Void v) throws Ingester.IngesterException {
Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
try {
StringBuilder scripts = new StringBuilder();
StringBuilder links = new StringBuilder();
@@ -172,10 +171,6 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
}
@Override
Void newAppendixProvider() {
return null;
}
InputStream getInputStream(AbstractFile sourceFile1) {
return new ReadContentInputStream(sourceFile1);
}

Ingester.java

@@ -242,7 +242,7 @@ class Ingester {
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
<A, T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<A, T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
final long sourceID = extractor.getID(source);
final String sourceName = extractor.getName(source);
@@ -255,18 +255,9 @@ class Ingester {
}
Map<String, String> fields = getContentFields(source);
// the appendix will be used to add "meta data" to the end of the last chunk
/* JMTODO: we need to figure out how to account for this so the last
* chunk doesn't go past 32K
*
* JM: one idea: push the appendix into the stream that the text
* extractor provides so it is automatically chunked with the rest of
* the content JMTODO: should this really be in the index at all?
*/
A appendix = extractor.newAppendixProvider();
//Get a stream and a reader for that stream
try (final InputStream stream = extractor.getInputStream(source);
Reader reader = extractor.getReader(stream, source, appendix);) {
Reader reader = extractor.getReader(stream, source);) {
//we read max 1024 chars at time, this seems to max what some Readers would return
char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
@@ -303,16 +294,8 @@ class Ingester {
}
}
StringBuilder sb;
if (eof) {
//1000 char buffer is to allow for appendix data without needing to resize the string builder.
sb = new StringBuilder(chunkSizeInChars + 1000)
StringBuilder sb = new StringBuilder(chunkSizeInChars)
.append(textChunkBuf, 0, chunkSizeInChars);
extractor.appendDataToFinalChunk(sb, appendix);
} else {
sb = new StringBuilder(chunkSizeInChars)
.append(textChunkBuf, 0, chunkSizeInChars);
}
sanitizeToUTF8(sb); //replace non UTF8 chars with '^'
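
With the appendix hooks gone, indexText() no longer special-cases the final chunk. A rough sketch of the post-change flow, with the read loop, chunk IDs, and the Solr calls elided:

// Simplified sketch of Ingester.indexText() after this change; details elided.
try (final InputStream stream = extractor.getInputStream(source);
        Reader reader = extractor.getReader(stream, source)) {
    char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
    // ... read up to chunkSizeInChars characters into textChunkBuf ...
    StringBuilder sb = new StringBuilder(chunkSizeInChars)
            .append(textChunkBuf, 0, chunkSizeInChars);
    sanitizeToUTF8(sb); // replace non-UTF8 chars with '^'
    // ... index the chunk with Solr ...
    // Any trailing "appendix" (e.g. Tika metadata) now arrives through the
    // reader itself, so it is chunked like ordinary content.
}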

KeywordSearchIngestModule.java

@@ -89,7 +89,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
//accessed read-only by searcher thread
private boolean startedSearching = false;
private List<FileTextExtractor<?>> textExtractors;
private List<FileTextExtractor> textExtractors;
private StringsTextExtractor stringExtractor;
private final KeywordSearchJobSettings settings;
private boolean initialized = false;
@@ -415,10 +415,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
* @throws IngesterException exception thrown if indexing failed
*/
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
FileTextExtractor<?> extractor = null;
FileTextExtractor extractor = null;
//go over available text extractors in order, and pick the first one (most specific one)
for (FileTextExtractor<?> fe : textExtractors) {
for (FileTextExtractor fe : textExtractors) {
if (fe.isSupported(aFile, detectedFormat)) {
extractor = fe;
break;

StringsTextExtractor.java

@@ -37,7 +37,7 @@ import org.sleuthkit.datamodel.TskException;
* with the original source file) up to 1MB then and indexes chunks as text with
* Solr.
*/
class StringsTextExtractor extends FileTextExtractor<Void> {
class StringsTextExtractor extends FileTextExtractor {
/**
* Common options that can be used by some extractors
*/
@@ -105,7 +105,7 @@ class StringsTextExtractor extends FileTextExtractor<Void> {
}
@Override
InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile, Void appendix) throws Ingester.IngesterException {
InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
}
@@ -145,12 +145,6 @@ class StringsTextExtractor extends FileTextExtractor<Void> {
return true;
}
@Override
Void newAppendixProvider() {
return null;
}
/**
* AbstractFile input string stream reader/converter - given AbstractFile,
* extract strings from it and return encoded bytes via read()

TextExtractor.java

@@ -24,7 +24,7 @@ import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisitableItem> {
abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());
abstract boolean noExtractionOptionsAreEnabled();
@@ -33,15 +33,9 @@ abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisit
logger.log(Level.WARNING, msg, ex); //NON-NLS }
}
void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider) {
//no-op
}
abstract AppendixProvider newAppendixProvider();
abstract InputStream getInputStream(TextSource source);
abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;
abstract long getID(TextSource source);

TikaTextExtractor.java

@@ -18,6 +18,8 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.List;
@@ -39,16 +41,15 @@ import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
/**
* Extractor of text from TIKA supported AbstractFile content. Extracted text is
* divided into chunks and indexed with Solr. Protects against Tika parser hangs
* (for unexpected/corrupt content) using a timeout mechanism. If Tika
* extraction succeeds, chunks are indexed with Solr.
* Extractor of text from TIKA supported AbstractFile content. Extracted text
* will be divided into chunks and indexed with Solr. Protects against Tika
* parser hangs (for unexpected/corrupt content) using a timeout mechanism. If
* Tika extraction succeeds, chunks are indexed with Solr.
*
* This Tika extraction/chunking utility is useful for large files of Tika
* parsers-supported content type.
*
*/
class TikaTextExtractor extends FileTextExtractor<Metadata> {
class TikaTextExtractor extends FileTextExtractor {
private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
@@ -66,27 +67,15 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
}
@Override
Metadata newAppendixProvider() {
return new Metadata();
}
@Override
public void appendDataToFinalChunk(StringBuilder sb, Metadata meta) {
//TODO: How do we account for this in chunking algorithm...
//JM: what if we always append it as a separate chunk?
sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
Stream.of(meta.names()).sorted().forEach(key -> {
sb.append(key).append(": ").append(meta.get(key)).append("\n");
});
}
@Override
Reader getReader(final InputStream stream, AbstractFile sourceFile, Metadata meta) throws IngesterException, MissingResourceException {
//Parse the file in a task
final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, meta));
Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
Metadata metadata = new Metadata();
//Parse the file in a task, a convenient way to have a timeout...
final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
try {
return future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
CharSource metaDataCharSource = getMetaDataCharSource(metadata);
//concatenate parsed content and meta data into a single reader.
return CharSource.concat(new ReaderCharSource(tikaReader), metaDataCharSource).openStream();
} catch (TimeoutException te) {
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
logWarning(msg, te);
@@ -99,8 +88,24 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
}
}
@Override
/**
* Get a CharSource that wraps a formatted representation of the given
* Metadata.
*
* @param metadata The Metadata to wrap as a CharSource
*
* @return a CharSource for the given Metadata
*/
static private CharSource getMetaDataCharSource(Metadata metadata) {
return CharSource.wrap(
new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
.append(Stream.of(metadata.names()).sorted()
.map(key -> key + ": " + metadata.get(key))
.collect(Collectors.joining("\n"))
));
}
@Override
public boolean isContentTypeSpecific() {
return true;
}
@@ -130,8 +135,9 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
boolean noExtractionOptionsAreEnabled() {
return false;
}
/**
* return timeout that should be used to index the content
* Return timeout that should be used to index the content.
*
* @param size size of the content
*
@@ -152,4 +158,22 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
}
}
/**
* An implementation of CharSource that just wraps an existing reader and
* returns it in openStream().
*/
private static class ReaderCharSource extends CharSource {
private final Reader reader;
public ReaderCharSource(Reader reader) {
this.reader = reader;
}
@Override
public Reader openStream() throws IOException {
return reader;
}
}
}
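
The metadata concatenation above leans on Guava's CharSource. A minimal standalone sketch of the same pattern, using hypothetical class and string names rather than Autopsy code:

import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

public class CharSourceConcatSketch {
    public static void main(String[] args) throws IOException {
        // Stand-ins for the Tika parse result and the formatted metadata block.
        Reader parsedContent = new StringReader("extracted document text...");
        CharSource metadataBlock = CharSource.wrap("\n\n---METADATA---\nAuthor: example\n");

        // Wrap the live reader in a CharSource (same idea as ReaderCharSource above),
        // then splice content and metadata into one character stream.
        CharSource contentSource = new CharSource() {
            @Override
            public Reader openStream() {
                return parsedContent;
            }
        };

        try (Reader combined = CharSource.concat(contentSource, metadataBlock).openStream()) {
            char[] buf = new char[1024];
            int n;
            while ((n = combined.read(buf)) != -1) {
                System.out.print(new String(buf, 0, n));
            }
        }
    }
}

Note that wrapping a live Reader this way means the resulting CharSource can only be opened once, which matches how indexText() consumes the combined stream.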