move all 'appendix' related code into TikaTextExtractor and simplify TextExtractor interface.

parent 8841f6e773
commit f56c2b43c8
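Every subclass change below follows from one decision: instead of each TextExtractor handing Ingester an "appendix" object to bolt onto the final chunk, TikaTextExtractor now splices its Metadata directly into the Reader it returns, so the appendix machinery can be deleted everywhere else. The following condensed sketch of the resulting contract is an illustration assembled from the hunks below, not a verbatim copy of the file (the real class bounds TextSource to SleuthkitVisitableItem and getReader throws Ingester.IngesterException):

import java.io.InputStream;
import java.io.Reader;

// Illustrative condensation of the post-change TextExtractor contract:
// one type parameter, no newAppendixProvider()/appendDataToFinalChunk()
// hooks, and getReader() takes only the stream and its source.
abstract class TextExtractorSketch<TextSource> {

    abstract InputStream getInputStream(TextSource source);

    // Any metadata "appendix" must already be inside the returned Reader.
    abstract Reader getReader(InputStream stream, TextSource source) throws Exception;

    abstract long getID(TextSource source);

    abstract String getName(TextSource source);
}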
ArtifactTextExtractor.java

@@ -33,7 +33,7 @@ import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.SleuthkitCase;
 import org.sleuthkit.datamodel.TskCoreException;
 
-public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifact> {
+public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
 
     static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
 
     static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
@@ -70,10 +70,6 @@ public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifact> {
         return false;
     }
 
-    @Override
-    Void newAppendixProvider() {
-        return null;
-    }
 
     @Override
     InputStream getInputStream(BlackboardArtifact artifact) {
@@ -118,7 +114,7 @@ public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifact> {
     }
 
     @Override
-    Reader getReader(InputStream stream, BlackboardArtifact source, Void appendix) throws Ingester.IngesterException {
+    Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
         return new InputStreamReader(stream);
     }
 
FileTextExtractor.java

@@ -18,6 +18,8 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
+import java.io.InputStream;
+import java.io.Reader;
 import java.util.Arrays;
 import java.util.List;
 import org.sleuthkit.datamodel.AbstractFile;
@@ -26,7 +28,7 @@ import org.sleuthkit.datamodel.AbstractFile;
  * Common methods for utilities that extract text and content and divide into
  * chunks
  */
-abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {
+abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
 
 
     static final List<String> BLOB_MIME_TYPES
@@ -93,6 +95,9 @@ abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {
      */
     abstract boolean isSupported(AbstractFile file, String detectedFormat);
 
+    @Override
+    abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;
+
     @Override
     long getID(AbstractFile source) {
         return source.getId();
@@ -103,4 +108,5 @@ abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {
     String getName(AbstractFile source) {
         return source.getName();
     }
+
 }
HtmlTextExtractor.java

@@ -37,7 +37,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * divided into chunks and indexed with Solr. If HTML extraction succeeds,
  * chunks are indexed with Solr.
  */
-class HtmlTextExtractor extends FileTextExtractor<Void> {
+class HtmlTextExtractor extends FileTextExtractor {
 
     static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
     private static final int MAX_SIZE = 50000000;
@@ -54,7 +54,6 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
     HtmlTextExtractor() {
     }
 
-
     @Override
     boolean isContentTypeSpecific() {
         return true;
@@ -76,7 +75,7 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
     * @throws IOException if There is an IOException parsing the input stream.
     */
     @Override
-    Reader getReader(InputStream in, AbstractFile sourceFile, Void v) throws Ingester.IngesterException {
+    Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
         try {
             StringBuilder scripts = new StringBuilder();
             StringBuilder links = new StringBuilder();
@@ -172,10 +171,6 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
     }
 
     @Override
-    Void newAppendixProvider() {
-        return null;
-    }
-
     InputStream getInputStream(AbstractFile sourceFile1) {
         return new ReadContentInputStream(sourceFile1);
     }
Ingester.java

@@ -242,7 +242,7 @@ class Ingester {
      *
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
-    <A, T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<A, T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
+    < T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
         final long sourceID = extractor.getID(source);
         final String sourceName = extractor.getName(source);
 
@@ -255,18 +255,9 @@ class Ingester {
         }
 
         Map<String, String> fields = getContentFields(source);
-        // the appendix will be used to add "meta data" to the end of the last chunk
-        /* JMTODO: we need to figure out how to account for this so the last
-         * chunk doesn't go past 32K
-         *
-         * JM: one idea: push the appendix into the stream that the text
-         * extractor provides so it is automatically chunked with the rest of
-         * the content JMTODO: should this really be in the index at all?
-         */ A appendix = extractor.newAppendixProvider();
-
         //Get a stream and a reader for that stream
         try (final InputStream stream = extractor.getInputStream(source);
-                Reader reader = extractor.getReader(stream, source, appendix);) {
+                Reader reader = extractor.getReader(stream, source);) {
 
             //we read max 1024 chars at time, this seems to max what some Readers would return
             char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
@@ -303,16 +294,8 @@ class Ingester {
                     }
                 }
 
-                StringBuilder sb;
-                if (eof) {
-                    //1000 char buffer is to allow for appendix data with out needing to resize the string builder.
-                    sb = new StringBuilder(chunkSizeInChars + 1000)
-                            .append(textChunkBuf, 0, chunkSizeInChars);
-                    extractor.appendDataToFinalChunk(sb, appendix);
-                } else {
-                    sb = new StringBuilder(chunkSizeInChars)
-                            .append(textChunkBuf, 0, chunkSizeInChars);
-                }
+                StringBuilder sb = new StringBuilder(chunkSizeInChars)
+                        .append(textChunkBuf, 0, chunkSizeInChars);
 
                 sanitizeToUTF8(sb); //replace non UTF8 chars with '^'
 
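The deleted JMTODO block had already suggested the fix this commit implements: push the appendix into the stream the extractor provides so it is chunked with the rest of the content. With that done, indexText() no longer needs the eof special case or the extra 1000-char headroom. A minimal stand-alone sketch of the simplified caller side, with hypothetical stand-in types rather than Autopsy's real classes:

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

// Hypothetical mirror of the simplified indexText() loop: one Reader, one
// StringBuilder sized exactly to the chunk, no appendix bookkeeping.
class ChunkingSketch {

    interface Extractor<T> {
        InputStream getInputStream(T source);

        Reader getReader(InputStream stream, T source) throws Exception;
    }

    static <T> void indexText(Extractor<T> extractor, T source) throws Exception {
        try (InputStream stream = extractor.getInputStream(source);
                Reader reader = extractor.getReader(stream, source)) {
            char[] textChunkBuf = new char[512 * 1024];
            int chunkSizeInChars;
            while ((chunkSizeInChars = reader.read(textChunkBuf)) != -1) {
                // Metadata, if any, is already in the stream, so every chunk
                // is built the same way, including the last one.
                StringBuilder sb = new StringBuilder(chunkSizeInChars)
                        .append(textChunkBuf, 0, chunkSizeInChars);
                System.out.println("indexed chunk of " + sb.length() + " chars");
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Extractor<String> extractor = new Extractor<String>() {
            public InputStream getInputStream(String s) {
                return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8));
            }

            public Reader getReader(InputStream stream, String s) {
                return new InputStreamReader(stream, StandardCharsets.UTF_8);
            }
        };
        indexText(extractor, "some extracted text plus inlined metadata");
    }
}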
KeywordSearchIngestModule.java

@@ -89,7 +89,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     //accessed read-only by searcher thread
 
     private boolean startedSearching = false;
-    private List<FileTextExtractor<?>> textExtractors;
+    private List<FileTextExtractor> textExtractors;
     private StringsTextExtractor stringExtractor;
     private final KeywordSearchJobSettings settings;
     private boolean initialized = false;
@@ -415,10 +415,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * @throws IngesterException exception thrown if indexing failed
      */
     private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-        FileTextExtractor<?> extractor = null;
+        FileTextExtractor extractor = null;
 
         //go over available text extractors in order, and pick the first one (most specific one)
-        for (FileTextExtractor<?> fe : textExtractors) {
+        for (FileTextExtractor fe : textExtractors) {
             if (fe.isSupported(aFile, detectedFormat)) {
                 extractor = fe;
                 break;
StringsTextExtractor.java

@@ -37,7 +37,7 @@ import org.sleuthkit.datamodel.TskException;
 * with the original source file) up to 1MB then and indexes chunks as text with
 * Solr.
 */
-class StringsTextExtractor extends FileTextExtractor<Void> {
+class StringsTextExtractor extends FileTextExtractor {
     /**
      * Common options that can be used by some extractors
      */
@@ -105,7 +105,7 @@ class StringsTextExtractor extends FileTextExtractor<Void> {
     }
 
     @Override
-    InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile, Void appendix) throws Ingester.IngesterException {
+    InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
         return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
     }
 
@@ -145,12 +145,6 @@ class StringsTextExtractor extends FileTextExtractor<Void> {
         return true;
     }
 
-
-    @Override
-    Void newAppendixProvider() {
-        return null;
-    }
-
     /**
      * AbstractFile input string stream reader/converter - given AbstractFile,
      * extract strings from it and return encoded bytes via read()
TextExtractor.java

@@ -24,7 +24,7 @@ import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 
-abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisitableItem> {
+abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
 
     static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());
     abstract boolean noExtractionOptionsAreEnabled();
@@ -33,15 +33,9 @@ abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisitableItem> {
         logger.log(Level.WARNING, msg, ex); //NON-NLS
     }
 
-    void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider) {
-        //no-op
-    }
-
-    abstract AppendixProvider newAppendixProvider();
-
     abstract InputStream getInputStream(TextSource source);
 
-    abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
+    abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;
 
     abstract long getID(TextSource source);
 
TikaTextExtractor.java

@@ -18,6 +18,8 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
+import com.google.common.io.CharSource;
+import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
 import java.util.List;
@@ -39,16 +41,15 @@ import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
- * Extractor of text from TIKA supported AbstractFile content. Extracted text is
- * divided into chunks and indexed with Solr. Protects against Tika parser hangs
- * (for unexpected/corrupt content) using a timeout mechanism. If Tika
- * extraction succeeds, chunks are indexed with Solr.
+ * Extractor of text from TIKA supported AbstractFile content. Extracted text
+ * will be divided into chunks and indexed with Solr. Protects against Tika
+ * parser hangs (for unexpected/corrupt content) using a timeout mechanism. If
+ * Tika extraction succeeds, chunks are indexed with Solr.
  *
  * This Tika extraction/chunking utility is useful for large files of Tika
 * parsers-supported content type.
- *
 */
-class TikaTextExtractor extends FileTextExtractor<Metadata> {
+class TikaTextExtractor extends FileTextExtractor {
 
     private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
@@ -66,27 +67,15 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
     }
 
     @Override
-    Metadata newAppendixProvider() {
-        return new Metadata();
-    }
-
-    @Override
-    public void appendDataToFinalChunk(StringBuilder sb, Metadata meta) {
-
-        //TODO: How do we account for this in chunking algorithm...
-        //JM: what if we always append it as a separate chunk?
-        sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
-        Stream.of(meta.names()).sorted().forEach(key -> {
-            sb.append(key).append(": ").append(meta.get(key)).append("\n");
-        });
-    }
-
-    @Override
-    Reader getReader(final InputStream stream, AbstractFile sourceFile, Metadata meta) throws IngesterException, MissingResourceException {
-        //Parse the file in a task
-        final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, meta));
+    Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
+        Metadata metadata = new Metadata();
+        //Parse the file in a task, a convenient way to have a timeout...
+        final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
         try {
-            return future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            CharSource metaDataCharSource = getMetaDataCharSource(metadata);
+            //concatenate parsed content and meta data into a single reader.
+            return CharSource.concat(new ReaderCharSource(tikaReader), metaDataCharSource).openStream();
        } catch (TimeoutException te) {
            final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
            logWarning(msg, te);
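Two things happen in the new getReader(): the parse runs on tikaParseExecutor so future.get(timeout, ...) can abandon a hung parser, and the resulting Reader is concatenated with the metadata before anything downstream sees it. A stand-alone sketch of the timeout half follows, with a sleeping stand-in for new Tika().parse(stream, metadata); the concatenation half is sketched after the ReaderCharSource hunk further below.

import java.io.Reader;
import java.io.StringReader;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

class ParseTimeoutSketch {
    public static void main(String[] args) throws Exception {
        ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
        // Stand-in for new Tika().parse(stream, metadata); a real parse of
        // corrupt content can hang, which is what the Future guards against.
        Future<Reader> future = tikaParseExecutor.submit(() -> {
            Thread.sleep(100); // simulate parsing work
            return new StringReader("parsed text");
        });
        try {
            Reader tikaReader = future.get(60, TimeUnit.SECONDS);
            System.out.println("parser returned in time: " + tikaReader.ready());
        } catch (TimeoutException te) {
            future.cancel(true); // give up on the hung parse
        } finally {
            tikaParseExecutor.shutdownNow();
        }
    }
}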
@@ -99,8 +88,24 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
         }
     }
 
-    @Override
+    /**
+     * Get a CharSource that wraps a formated representation of the given
+     * Metadata.
+     *
+     * @param metadata The Metadata to wrap as a CharSource
+     *
+     * @return a CharSource for the given MetaData
+     */
+    static private CharSource getMetaDataCharSource(Metadata metadata) {
+        return CharSource.wrap(
+                new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
+                .append(Stream.of(metadata.names()).sorted()
+                        .map(key -> key + ": " + metadata.get(key))
+                        .collect(Collectors.joining("\n"))
+                ));
+    }
 
+    @Override
     public boolean isContentTypeSpecific() {
         return true;
     }
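The formatting itself is just a sorted key/value join; the CharSource.wrap() call around it only defers materialization until openStream(). A tiny illustration of the same pipeline, with made-up metadata keys standing in for Tika's Metadata.names()/get():

import java.util.Map;
import java.util.stream.Collectors;

class MetadataBlockSketch {
    public static void main(String[] args) {
        // Stand-in for Tika's Metadata; keys and values here are made up.
        Map<String, String> metadata = Map.of("Content-Type", "text/html", "Author", "jdoe");
        String block = "\n\n------------------------------METADATA------------------------------\n\n"
                + metadata.keySet().stream().sorted()
                        .map(key -> key + ": " + metadata.get(key))
                        .collect(Collectors.joining("\n"));
        System.out.println(block);
    }
}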
@@ -130,8 +135,9 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
     boolean noExtractionOptionsAreEnabled() {
         return false;
     }
+
     /**
-     * return timeout that should be used to index the content
+     * Return timeout that should be used to index the content.
      *
      * @param size size of the content
      *
@@ -152,4 +158,22 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
         }
 
     }
+
+    /**
+     * An implementation of CharSource that just wraps an existing reader and
+     * returns it in openStream().
+     */
+    private static class ReaderCharSource extends CharSource {
+
+        private final Reader reader;
+
+        public ReaderCharSource(Reader reader) {
+            this.reader = reader;
+        }
+
+        @Override
+        public Reader openStream() throws IOException {
+            return reader;
+        }
+    }
 }
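ReaderCharSource exists because Guava's CharSource.wrap() only accepts an in-memory CharSequence, while Tika hands back a streaming Reader; wrapping the Reader lets CharSource.concat() splice the two sources lazily. Note the wrapper is effectively single-use, since openStream() always returns the same Reader. A short usage sketch under those assumptions:

import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

class ConcatSketch {

    // Same shape as the commit's ReaderCharSource: openStream() hands back the
    // one wrapped Reader, so this CharSource can only be consumed once.
    static CharSource readerCharSource(Reader reader) {
        return new CharSource() {
            @Override
            public Reader openStream() {
                return reader;
            }
        };
    }

    public static void main(String[] args) throws IOException {
        Reader parsed = new StringReader("parsed document text");
        CharSource combined = CharSource.concat(
                readerCharSource(parsed),
                CharSource.wrap("\n\n[metadata would follow here]"));
        System.out.println(combined.read()); // both parts read as one stream
    }
}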