diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
index 0c1caeebe2..501971e2e6 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
@@ -33,7 +33,7 @@ import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.SleuthkitCase;
 import org.sleuthkit.datamodel.TskCoreException;

-public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifact> {
+public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {

     static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());

     static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
@@ -70,10 +70,6 @@ public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifact> {
     }

     @Override
-    Void newAppendixProvider() {
-        return null;
-    }
-
     InputStream getInputStream(BlackboardArtifact artifact) {
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java
@@ … @@
-abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {
+abstract class FileTextExtractor extends TextExtractor< AbstractFile> {

     static final List<String> BLOB_MIME_TYPES
@@ -93,6 +95,9 @@ abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {

     abstract boolean isSupported(AbstractFile file, String detectedFormat);

+    @Override
+    abstract Reader getReader(InputStream stream, AbstractFile sourceFile) throws Ingester.IngesterException;
+
     @Override
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
@@ … @@
-class HtmlTextExtractor extends FileTextExtractor<Void> {
+class HtmlTextExtractor extends FileTextExtractor {

     static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
     private static final int MAX_SIZE = 50000000;
@@ -54,7 +54,6 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {

     HtmlTextExtractor() {
     }

-    @Override
     boolean isContentTypeSpecific() {
         return true;
     }
@@ -76,7 +75,7 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
      * @throws IOException if There is an IOException parsing the input stream.
      */
     @Override
-    Reader getReader(InputStream in, AbstractFile sourceFile, Void v) throws Ingester.IngesterException {
+    Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
         try {
             StringBuilder scripts = new StringBuilder();
             StringBuilder links = new StringBuilder();
@@ -172,10 +171,6 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
     }

     @Override
-    Void newAppendixProvider() {
-        return null;
-    }
-
     InputStream getInputStream(AbstractFile sourceFile1) {
         return new ReadContentInputStream(sourceFile1);
     }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index 229a751f76..2bb20c6060 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -242,7 +242,7 @@ class Ingester {
      *
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
-    <A, T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<A, T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
+    < T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
         final long sourceID = extractor.getID(source);
         final String sourceName = extractor.getName(source);
@@ -255,18 +255,9 @@ class Ingester {
         }

         Map<String, String> fields = getContentFields(source);
-        // the appendix will be used to add "meta data" to the end of the last chunk
-        /* JMTODO: we need to figure out how to account for this so the last
-         * chunk doesn't go past 32K
-         *
-         * JM: one idea: push the appendix into the stream that the text
-         * extractor provides so it is automatically chunked with the rest of
-         * the content JMTODO: should this really be in the index at all?
-         */
-        A appendix = extractor.newAppendixProvider();
-
         //Get a stream and a reader for that stream
         try (final InputStream stream = extractor.getInputStream(source);
-                Reader reader = extractor.getReader(stream, source, appendix);) {
+                Reader reader = extractor.getReader(stream, source);) {

             //we read max 1024 chars at time, this seems to max what some Readers would return
             char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
@@ -303,16 +294,8 @@ class Ingester {
                 }
             }

-            StringBuilder sb;
-            if (eof) {
-                //1000 char buffer is to allow for appendix data with out needing to resize the string builder.
-                sb = new StringBuilder(chunkSizeInChars + 1000)
-                        .append(textChunkBuf, 0, chunkSizeInChars);
-                extractor.appendDataToFinalChunk(sb, appendix);
-            } else {
-                sb = new StringBuilder(chunkSizeInChars)
-                        .append(textChunkBuf, 0, chunkSizeInChars);
-            }
+            StringBuilder sb = new StringBuilder(chunkSizeInChars)
+                    .append(textChunkBuf, 0, chunkSizeInChars);

             sanitizeToUTF8(sb); //replace non UTF8 chars with '^'
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index ceb810c444..ad70144aab 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -89,7 +89,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {

     //accessed read-only by searcher thread
     private boolean startedSearching = false;
-    private List<FileTextExtractor<?>> textExtractors;
+    private List<FileTextExtractor> textExtractors;
     private StringsTextExtractor stringExtractor;
     private final KeywordSearchJobSettings settings;
     private boolean initialized = false;
@@ -415,10 +415,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * @throws IngesterException exception thrown if indexing failed
      */
     private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-        FileTextExtractor<?> extractor = null;
+        FileTextExtractor extractor = null;

         //go over available text extractors in order, and pick the first one (most specific one)
-        for (FileTextExtractor<?> fe : textExtractors) {
+        for (FileTextExtractor fe : textExtractors) {
             if (fe.isSupported(aFile, detectedFormat)) {
                 extractor = fe;
                 break;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
index 8bf8d21910..97c5307138 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
@@ -37,7 +37,7 @@ import org.sleuthkit.datamodel.TskException;
 * with the original source file) up to 1MB then and indexes chunks as text with
 * Solr.
 */
-class StringsTextExtractor extends FileTextExtractor<Void> {
+class StringsTextExtractor extends FileTextExtractor {

    /**
     * Common options that can be used by some extractors
     */
@@ -105,7 +105,7 @@ class StringsTextExtractor extends FileTextExtractor<Void> {
    }

    @Override
-   InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile, Void appendix) throws Ingester.IngesterException {
+   InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
        return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
    }
@@ -145,12 +145,6 @@ class StringsTextExtractor extends FileTextExtractor<Void> {
        return true;
    }
-
-   @Override
-   Void newAppendixProvider() {
-       return null;
-   }
-
    /**
     * AbstractFile input string stream reader/converter - given AbstractFile,
     * extract strings from it and return encoded bytes via read()
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java
index 2e1d3280bd..c4d808cdcd 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java
@@ -24,7 +24,7 @@
 import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.SleuthkitVisitableItem;

-abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisitableItem> {
+abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {

    static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());

    abstract boolean noExtractionOptionsAreEnabled();
@@ -33,15 +33,9 @@ abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisitableItem> {

    abstract boolean isDisabled();

-   abstract void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider);
-
-   abstract AppendixProvider newAppendixProvider();
-
-   abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
+   abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;

    abstract InputStream getInputStream(TextSource source);
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
@@ … @@
-class TikaTextExtractor extends FileTextExtractor<Metadata> {
+class TikaTextExtractor extends FileTextExtractor {

    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;

    private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
@@ -66,27 +67,15 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
    }

    @Override
-   Metadata newAppendixProvider() {
-       return new Metadata();
-   }
-
-   @Override
-   public void appendDataToFinalChunk(StringBuilder sb, Metadata meta) {
-
-       //TODO: How do we account for this in chunking algorithm...
-       //JM: what if we always append it as a separate chunk?
-       sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
-       Stream.of(meta.names()).sorted().forEach(key -> {
-           sb.append(key).append(": ").append(meta.get(key)).append("\n");
-       });
-   }
-
-   @Override
-   Reader getReader(final InputStream stream, AbstractFile sourceFile, Metadata meta) throws IngesterException, MissingResourceException {
-       //Parse the file in a task
-       final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, meta));
+   Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
+       Metadata metadata = new Metadata();
+       //Parse the file in a task, a convenient way to have a timeout...
+       final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
        try {
-           return future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+           final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+           CharSource metaDataCharSource = getMetaDataCharSource(metadata);
+           //concatenate parsed content and meta data into a single reader.
+           return CharSource.concat(new ReaderCharSource(tikaReader), metaDataCharSource).openStream();
        } catch (TimeoutException te) {
            final String msg = NbBundle.getMessage(this.getClass(),
                    "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
                    sourceFile.getId(), sourceFile.getName());
            logWarning(msg, te);
@@ -99,8 +88,24 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
        }
    }

-   @Override
+   /**
+    * Get a CharSource that wraps a formatted representation of the given
+    * Metadata.
+    *
+    * @param metadata The Metadata to wrap as a CharSource
+    *
+    * @return a CharSource for the given Metadata
+    */
+   static private CharSource getMetaDataCharSource(Metadata metadata) {
+       return CharSource.wrap(
+               new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
+                       .append(Stream.of(metadata.names()).sorted()
+                               .map(key -> key + ": " + metadata.get(key))
+                               .collect(Collectors.joining("\n"))
+                       ));
+   }
+   @Override
    public boolean isContentTypeSpecific() {
        return true;
    }
@@ -130,8 +135,9 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
    boolean noExtractionOptionsAreEnabled() {
        return false;
    }
+
    /**
-    * return timeout that should be used to index the content
+    * Return timeout that should be used to index the content.
     *
     * @param size size of the content
     *
@@ -152,4 +158,22 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
        }
    }
+
+   /**
+    * An implementation of CharSource that just wraps an existing reader and
+    * returns it in openStream().
+    */
+   private static class ReaderCharSource extends CharSource {
+
+       private final Reader reader;
+
+       public ReaderCharSource(Reader reader) {
+           this.reader = reader;
+       }
+
+       @Override
+       public Reader openStream() throws IOException {
+           return reader;
+       }
+   }
 }
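
Reviewer note: the heart of this change is that Tika metadata is no longer glued onto the last chunk through the appendix mechanism; it is concatenated into the extracted-text stream itself, so the ordinary chunker processes it and the final chunk can no longer run past the 32K limit the removed JMTODO comment worried about. Below is a minimal, self-contained sketch of the same Guava CharSource idiom, outside the patch; the class name and sample values are hypothetical, and only CharSource.wrap, CharSource.concat, and the Reader-wrapping trick mirror the code above.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import com.google.common.io.CharSource;
import com.google.common.io.CharStreams;

public class CharSourceConcatDemo {

    // Wrap an already-open Reader as a CharSource, like the patch's
    // ReaderCharSource: every call to openStream() hands back the same
    // underlying Reader, so the resulting source is single-use.
    static CharSource asCharSource(Reader reader) {
        return new CharSource() {
            @Override
            public Reader openStream() throws IOException {
                return reader;
            }
        };
    }

    public static void main(String[] args) throws IOException {
        // Stand-ins for the Tika content reader and the formatted metadata block.
        Reader parsedContent = new StringReader("Text that Tika extracted from the file.");
        CharSource metadata = CharSource.wrap(
                "\n\n------------------------------METADATA------------------------------\n\n"
                + "Content-Type: text/plain");

        // Concatenate content and metadata into one logical stream: downstream
        // chunking code sees a single Reader and needs no special case for the
        // final chunk.
        try (Reader combined = CharSource.concat(asCharSource(parsedContent), metadata).openStream()) {
            System.out.println(CharStreams.toString(combined));
        }
    }
}

One design point worth flagging: a CharSource is normally a reopenable supplier of fresh Readers, and ReaderCharSource bends that contract by returning the same one-shot Reader every time. That is acceptable here because Ingester.indexText opens and drains the concatenated stream exactly once inside its try-with-resources block.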