Mirror of https://github.com/overcuriousity/autopsy-flatpak.git, synced 2025-07-17 10:17:41 +00:00
move all 'appendix' related code into TikaTextExtractor and simplify TextExtractor interface.
This commit is contained in:
parent 8841f6e773
commit f56c2b43c8
ArtifactTextExtractor.java

@@ -33,7 +33,7 @@ import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.SleuthkitCase;
 import org.sleuthkit.datamodel.TskCoreException;

-public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifact> {
+public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
     static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());

     static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
@@ -70,10 +70,6 @@ public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifac
         return false;
     }

-    @Override
-    Void newAppendixProvider() {
-        return null;
-    }

     @Override
     InputStream getInputStream(BlackboardArtifact artifact) {
@@ -118,7 +114,7 @@ public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifac
     }

     @Override
-    Reader getReader(InputStream stream, BlackboardArtifact source, Void appendix) throws Ingester.IngesterException {
+    Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
         return new InputStreamReader(stream);
     }

FileTextExtractor.java

@@ -18,6 +18,8 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;

+import java.io.InputStream;
+import java.io.Reader;
 import java.util.Arrays;
 import java.util.List;
 import org.sleuthkit.datamodel.AbstractFile;
@@ -26,7 +28,7 @@ import org.sleuthkit.datamodel.AbstractFile;
  * Common methods for utilities that extract text and content and divide into
  * chunks
  */
-abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {
+abstract class FileTextExtractor extends TextExtractor< AbstractFile> {


     static final List<String> BLOB_MIME_TYPES
@@ -93,6 +95,9 @@ abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<Appendi
      */
     abstract boolean isSupported(AbstractFile file, String detectedFormat);

+    @Override
+    abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;
+
     @Override
     long getID(AbstractFile source) {
         return source.getId();
@@ -103,4 +108,5 @@ abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<Appendi
     String getName(AbstractFile source) {
         return source.getName();
     }
+
 }
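The @Override abstract getReader added above is a re-declaration: FileTextExtractor pins the base class's source type to AbstractFile, so concrete file extractors only ever implement the narrowed signature. A minimal sketch of that pattern, with hypothetical names (Extractor and FileExtractor; String stands in for AbstractFile, Exception for Ingester.IngesterException):

import java.io.InputStream;
import java.io.Reader;

abstract class Extractor<S> {
    abstract Reader getReader(InputStream stream, S source) throws Exception;
}

abstract class FileExtractor extends Extractor<String> {

    // Re-declared with the type parameter bound; concrete subclasses now
    // implement getReader against the specific source type.
    @Override
    abstract Reader getReader(InputStream stream, String source) throws Exception;
}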
HtmlTextExtractor.java

@@ -37,7 +37,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * divided into chunks and indexed with Solr. If HTML extraction succeeds,
  * chunks are indexed with Solr.
  */
-class HtmlTextExtractor extends FileTextExtractor<Void> {
+class HtmlTextExtractor extends FileTextExtractor {

     static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
     private static final int MAX_SIZE = 50000000;
@@ -54,7 +54,6 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
     HtmlTextExtractor() {
     }

-
     @Override
     boolean isContentTypeSpecific() {
         return true;
@@ -76,7 +75,7 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
      * @throws IOException if There is an IOException parsing the input stream.
      */
     @Override
-    Reader getReader(InputStream in, AbstractFile sourceFile, Void v) throws Ingester.IngesterException {
+    Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
         try {
             StringBuilder scripts = new StringBuilder();
             StringBuilder links = new StringBuilder();
@@ -172,10 +171,6 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
     }

-    @Override
-    Void newAppendixProvider() {
-        return null;
-    }

     InputStream getInputStream(AbstractFile sourceFile1) {
         return new ReadContentInputStream(sourceFile1);
     }

Ingester.java

@@ -242,7 +242,7 @@ class Ingester {
      *
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
-    <A, T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<A, T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
+    < T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
         final long sourceID = extractor.getID(source);
         final String sourceName = extractor.getName(source);

@@ -255,18 +255,9 @@
         }

         Map<String, String> fields = getContentFields(source);
-        // the appendix will be used to add "meta data" to the end of the last chunk
-        /* JMTODO: we need to figure out how to account for this so the last
-         * chunk doesn't go past 32K
-         *
-         * JM: one idea: push the appendix into the stream that the text
-         * extractor provides so it is automatically chunked with the rest of
-         * the content JMTODO: should this really be in the index at all?
-         */ A appendix = extractor.newAppendixProvider();
-
         //Get a stream and a reader for that stream
         try (final InputStream stream = extractor.getInputStream(source);
-                Reader reader = extractor.getReader(stream, source, appendix);) {
+                Reader reader = extractor.getReader(stream, source);) {

             //we read max 1024 chars at time, this seems to max what some Readers would return
             char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
@@ -303,16 +294,8 @@
                     }
                 }

-                StringBuilder sb;
-                if (eof) {
-                    //1000 char buffer is to allow for appendix data with out needing to resize the string builder.
-                    sb = new StringBuilder(chunkSizeInChars + 1000)
-                            .append(textChunkBuf, 0, chunkSizeInChars);
-                    extractor.appendDataToFinalChunk(sb, appendix);
-                } else {
-                    sb = new StringBuilder(chunkSizeInChars)
-                            .append(textChunkBuf, 0, chunkSizeInChars);
-                }
+                StringBuilder sb = new StringBuilder(chunkSizeInChars)
+                        .append(textChunkBuf, 0, chunkSizeInChars);

                 sanitizeToUTF8(sb); //replace non UTF8 chars with '^'

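With the appendix gone, indexText builds every chunk the same way. A simplified sketch of that reader-driven chunking loop, using hypothetical names (MAX_CHARS, indexChunk) and leaving out the real Ingester's incremental 1024-char reads, UTF-8 sanitizing, and Solr upload:

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

class ChunkingSketch {

    static final int MAX_CHARS = 512 * 1024;

    static void chunk(Reader reader) throws IOException {
        char[] textChunkBuf = new char[MAX_CHARS];
        int read;
        while ((read = reader.read(textChunkBuf, 0, textChunkBuf.length)) != -1) {
            // Every chunk is built the same way; no appendix is bolted onto
            // the final chunk anymore.
            StringBuilder sb = new StringBuilder(read).append(textChunkBuf, 0, read);
            indexChunk(sb.toString());
        }
    }

    static void indexChunk(String chunk) {
        System.out.println("chunk of " + chunk.length() + " chars");
    }

    public static void main(String[] args) throws IOException {
        chunk(new StringReader("example content to be chunked"));
    }
}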
KeywordSearchIngestModule.java

@@ -89,7 +89,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     //accessed read-only by searcher thread

     private boolean startedSearching = false;
-    private List<FileTextExtractor<?>> textExtractors;
+    private List<FileTextExtractor> textExtractors;
     private StringsTextExtractor stringExtractor;
     private final KeywordSearchJobSettings settings;
     private boolean initialized = false;
@@ -415,10 +415,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * @throws IngesterException exception thrown if indexing failed
      */
     private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-        FileTextExtractor<?> extractor = null;
+        FileTextExtractor extractor = null;

         //go over available text extractors in order, and pick the first one (most specific one)
-        for (FileTextExtractor<?> fe : textExtractors) {
+        for (FileTextExtractor fe : textExtractors) {
             if (fe.isSupported(aFile, detectedFormat)) {
                 extractor = fe;
                 break;
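extractTextAndIndex keeps its first-match selection: extractors are tried in order and the first one whose isSupported accepts the file wins. A hypothetical sketch of that pattern; the strings-based fallback is an assumption suggested by the stringExtractor field above, not something shown in this hunk:

import java.util.List;

class SelectionSketch {

    interface Extractor {
        boolean supports(String mimeType);
    }

    static Extractor pick(List<Extractor> extractors, Extractor fallback, String mimeType) {
        for (Extractor fe : extractors) {
            if (fe.supports(mimeType)) {
                return fe; // first (most specific) supported extractor wins
            }
        }
        return fallback; // e.g. a strings-based extractor (assumed)
    }

    public static void main(String[] args) {
        Extractor html = mimeType -> mimeType.contains("html");
        Extractor anything = mimeType -> true;
        System.out.println(pick(List.of(html), anything, "text/html") == html);
    }
}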
StringsTextExtractor.java

@@ -37,7 +37,7 @@ import org.sleuthkit.datamodel.TskException;
  * with the original source file) up to 1MB then and indexes chunks as text with
  * Solr.
  */
-class StringsTextExtractor extends FileTextExtractor<Void> {
+class StringsTextExtractor extends FileTextExtractor {
     /**
      * Common options that can be used by some extractors
      */
@@ -105,7 +105,7 @@ class StringsTextExtractor extends FileTextExtractor<Void> {
     }

     @Override
-    InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile, Void appendix) throws Ingester.IngesterException {
+    InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
         return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
     }

@@ -145,12 +145,6 @@ class StringsTextExtractor extends FileTextExtractor<Void> {
         return true;
     }

-
-    @Override
-    Void newAppendixProvider() {
-        return null;
-    }
-
     /**
      * AbstractFile input string stream reader/converter - given AbstractFile,
      * extract strings from it and return encoded bytes via read()
TextExtractor.java

@@ -24,7 +24,7 @@ import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.SleuthkitVisitableItem;

-abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisitableItem> {
+abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {

     static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());
     abstract boolean noExtractionOptionsAreEnabled();
@@ -33,15 +33,9 @@ abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisit
         logger.log(Level.WARNING, msg, ex); //NON-NLS }
     }

-    void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider) {
-        //no-op
-    }
-
-    abstract AppendixProvider newAppendixProvider();
-
     abstract InputStream getInputStream(TextSource source);

-    abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
+    abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;

     abstract long getID(TextSource source);

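Pulling the hunks above together, the simplified contract is roughly the following. Method names are kept, but the class is renamed TextExtractorSketch to mark it as a sketch, and the SleuthkitVisitableItem bound, the logWarning helper, and Ingester.IngesterException (Exception here) are omitted:

import java.io.InputStream;
import java.io.Reader;

abstract class TextExtractorSketch<TextSource> {

    abstract boolean noExtractionOptionsAreEnabled();

    abstract InputStream getInputStream(TextSource source);

    abstract Reader getReader(InputStream stream, TextSource source) throws Exception;

    abstract long getID(TextSource source);

    abstract String getName(TextSource source);
}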
TikaTextExtractor.java

@@ -18,6 +18,8 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;

+import com.google.common.io.CharSource;
+import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
 import java.util.List;
@@ -39,16 +41,15 @@ import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;

 /**
- * Extractor of text from TIKA supported AbstractFile content. Extracted text is
- * divided into chunks and indexed with Solr. Protects against Tika parser hangs
- * (for unexpected/corrupt content) using a timeout mechanism. If Tika
- * extraction succeeds, chunks are indexed with Solr.
+ * Extractor of text from TIKA supported AbstractFile content. Extracted text
+ * will be divided into chunks and indexed with Solr. Protects against Tika
+ * parser hangs (for unexpected/corrupt content) using a timeout mechanism. If
+ * Tika extraction succeeds, chunks are indexed with Solr.
  *
  * This Tika extraction/chunking utility is useful for large files of Tika
  * parsers-supported content type.
  *
  */
-class TikaTextExtractor extends FileTextExtractor<Metadata> {
+class TikaTextExtractor extends FileTextExtractor {

     private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
@@ -66,27 +67,15 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
     }

-    @Override
-    Metadata newAppendixProvider() {
-        return new Metadata();
-    }
-
-    @Override
-    public void appendDataToFinalChunk(StringBuilder sb, Metadata meta) {
-
-        //TODO: How do we account for this in chunking algorithm...
-        //JM: what if we always append it as a separate chunk?
-        sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
-        Stream.of(meta.names()).sorted().forEach(key -> {
-            sb.append(key).append(": ").append(meta.get(key)).append("\n");
-        });
-    }
-
     @Override
-    Reader getReader(final InputStream stream, AbstractFile sourceFile, Metadata meta) throws IngesterException, MissingResourceException {
-        //Parse the file in a task
-        final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, meta));
+    Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
+        Metadata metadata = new Metadata();
+        //Parse the file in a task, a convenient way to have a timeout...
+        final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
         try {
-            return future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            CharSource metaDataCharSource = getMetaDataCharSource(metadata);
+            //concatenate parsed content and meta data into a single reader.
+            return CharSource.concat(new ReaderCharSource(tikaReader), metaDataCharSource).openStream();
         } catch (TimeoutException te) {
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
             logWarning(msg, te);
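The parse runs on the single-thread tikaParseExecutor and the caller bounds the wait with Future.get, which is what protects against parser hangs. A self-contained sketch of just that mechanism, with a placeholder parse() and a fixed timeout instead of the size-based getTimeout():

import java.io.Reader;
import java.io.StringReader;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

class TimeoutSketch {

    private static final ExecutorService executor = Executors.newSingleThreadExecutor();

    static Reader parseWithTimeout(long timeoutSeconds) throws Exception {
        // Run the (potentially hanging) parse off-thread...
        Future<Reader> future = executor.submit(() -> parse());
        try {
            // ...and bound how long we are willing to wait for it.
            return future.get(timeoutSeconds, TimeUnit.SECONDS);
        } catch (TimeoutException te) {
            future.cancel(true); // give up on a hung parser
            throw new Exception("parse timed out", te);
        }
    }

    static Reader parse() {
        return new StringReader("parsed text"); // stand-in for new Tika().parse(stream, metadata)
    }

    public static void main(String[] args) throws Exception {
        System.out.println((char) parseWithTimeout(30).read());
        executor.shutdown();
    }
}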
@@ -99,8 +88,24 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
         }
     }

-    @Override
+    /**
+     * Get a CharSource that wraps a formated representation of the given
+     * Metadata.
+     *
+     * @param metadata The Metadata to wrap as a CharSource
+     *
+     * @returna CharSource for the given MetaData
+     */
+    static private CharSource getMetaDataCharSource(Metadata metadata) {
+        return CharSource.wrap(
+                new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
+                .append(Stream.of(metadata.names()).sorted()
+                        .map(key -> key + ": " + metadata.get(key))
+                        .collect(Collectors.joining("\n"))
+                ));
+    }
+
+    @Override
     public boolean isContentTypeSpecific() {
         return true;
     }
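getMetaDataCharSource produces a banner followed by sorted key: value lines, one per metadata entry. A small illustration using Tika's Metadata directly; the sample keys and values are made up:

import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.tika.metadata.Metadata;

class MetadataFormatSketch {

    public static void main(String[] args) {
        Metadata metadata = new Metadata();
        metadata.add("Content-Type", "application/pdf");
        metadata.add("Author", "example");

        // Same formatting as getMetaDataCharSource: banner, then sorted entries.
        String formatted = "\n\n------------------------------METADATA------------------------------\n\n"
                + Stream.of(metadata.names()).sorted()
                        .map(key -> key + ": " + metadata.get(key))
                        .collect(Collectors.joining("\n"));
        System.out.println(formatted);
        // Author: example
        // Content-Type: application/pdf
    }
}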
@@ -130,8 +135,9 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
     boolean noExtractionOptionsAreEnabled() {
         return false;
     }
+
     /**
-     * return timeout that should be used to index the content
+     * Return timeout that should be used to index the content.
      *
      * @param size size of the content
      *
@@ -152,4 +158,22 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
         }
     }

+    /**
+     * An implementation of CharSource that just wraps an existing reader and
+     * returns it in openStream().
+     */
+    private static class ReaderCharSource extends CharSource {
+
+        private final Reader reader;
+
+        public ReaderCharSource(Reader reader) {
+            this.reader = reader;
+        }
+
+        @Override
+        public Reader openStream() throws IOException {
+            return reader;
+        }
+    }
+
 }
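Taken together with getReader above: wrap the live Tika Reader in a CharSource, concatenate the metadata CharSource after it, and hand back one Reader over both, so the Ingester chunks and indexes the metadata like any other text. A sketch under those assumptions, using Guava's CharSource with placeholder content:

import com.google.common.io.CharSource;
import com.google.common.io.CharStreams;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

class ConcatSketch {

    public static void main(String[] args) throws IOException {
        Reader tikaReader = new StringReader("extracted document text");
        CharSource metaDataCharSource = CharSource.wrap("\n\n--METADATA--\nAuthor: example");

        // One reader over content followed by metadata.
        Reader combined = CharSource.concat(
                new ReaderCharSource(tikaReader), metaDataCharSource).openStream();

        // The Ingester chunks this single reader, so the metadata is chunked
        // and indexed like any other text.
        System.out.println(CharStreams.toString(combined));
    }

    // Same idea as the private class added above: a CharSource view of an
    // already-open Reader.
    private static class ReaderCharSource extends CharSource {

        private final Reader reader;

        ReaderCharSource(Reader reader) {
            this.reader = reader;
        }

        @Override
        public Reader openStream() {
            return reader;
        }
    }
}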