Mirror of https://github.com/overcuriousity/autopsy-flatpak.git, synced 2025-07-17 10:17:41 +00:00
move all 'appendix' related code into TikaTextExtractor and simplify TextExtractor interface.
This commit is contained in:
parent 8841f6e773
commit f56c2b43c8
ArtifactTextExtractor.java

@@ -33,7 +33,7 @@ import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.SleuthkitCase;
 import org.sleuthkit.datamodel.TskCoreException;

-public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifact> {
+public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
     static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());

     static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
@@ -70,10 +70,6 @@ public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifac
         return false;
     }

-    @Override
-    Void newAppendixProvider() {
-        return null;
-    }

     @Override
     InputStream getInputStream(BlackboardArtifact artifact) {
@@ -118,7 +114,7 @@ public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifac
     }

     @Override
-    Reader getReader(InputStream stream, BlackboardArtifact source, Void appendix) throws Ingester.IngesterException {
+    Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
         return new InputStreamReader(stream);
     }

FileTextExtractor.java

@@ -18,6 +18,8 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;

+import java.io.InputStream;
+import java.io.Reader;
 import java.util.Arrays;
 import java.util.List;
 import org.sleuthkit.datamodel.AbstractFile;
@@ -26,7 +28,7 @@ import org.sleuthkit.datamodel.AbstractFile;
  * Common methods for utilities that extract text and content and divide into
  * chunks
  */
-abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {
+abstract class FileTextExtractor extends TextExtractor< AbstractFile> {


     static final List<String> BLOB_MIME_TYPES
@@ -93,6 +95,9 @@ abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<Appendi
      */
     abstract boolean isSupported(AbstractFile file, String detectedFormat);

+    @Override
+    abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;
+
     @Override
     long getID(AbstractFile source) {
         return source.getId();
@@ -103,4 +108,5 @@ abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<Appendi
     String getName(AbstractFile source) {
         return source.getName();
     }
+
 }
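The @Override abstract getReader added above is a re-declaration: FileTextExtractor pins the base class's source type to AbstractFile, so concrete file extractors only ever implement the narrowed signature. A minimal sketch of that pattern, with hypothetical names (Extractor and FileExtractor; String stands in for AbstractFile, Exception for Ingester.IngesterException):

import java.io.InputStream;
import java.io.Reader;

abstract class Extractor<S> {
    abstract Reader getReader(InputStream stream, S source) throws Exception;
}

abstract class FileExtractor extends Extractor<String> {

    // Re-declared with the type parameter bound; concrete subclasses now
    // implement getReader against the specific source type.
    @Override
    abstract Reader getReader(InputStream stream, String source) throws Exception;
}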
HtmlTextExtractor.java

@@ -37,7 +37,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * divided into chunks and indexed with Solr. If HTML extraction succeeds,
  * chunks are indexed with Solr.
  */
-class HtmlTextExtractor extends FileTextExtractor<Void> {
+class HtmlTextExtractor extends FileTextExtractor {

     static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
     private static final int MAX_SIZE = 50000000;
@@ -54,7 +54,6 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
     HtmlTextExtractor() {
     }

-
     @Override
     boolean isContentTypeSpecific() {
         return true;
@@ -76,7 +75,7 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
      * @throws IOException if There is an IOException parsing the input stream.
      */
     @Override
-    Reader getReader(InputStream in, AbstractFile sourceFile, Void v) throws Ingester.IngesterException {
+    Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
         try {
             StringBuilder scripts = new StringBuilder();
             StringBuilder links = new StringBuilder();
@@ -172,10 +171,6 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
     }

-    @Override
-    Void newAppendixProvider() {
-        return null;
-    }

     InputStream getInputStream(AbstractFile sourceFile1) {
         return new ReadContentInputStream(sourceFile1);
     }

Ingester.java

@@ -242,7 +242,7 @@ class Ingester {
      *
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
-    <A, T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<A, T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
+    < T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
         final long sourceID = extractor.getID(source);
         final String sourceName = extractor.getName(source);

@@ -255,18 +255,9 @@
         }

         Map<String, String> fields = getContentFields(source);
-        // the appendix will be used to add "meta data" to the end of the last chunk
-        /* JMTODO: we need to figure out how to account for this so the last
-         * chunk doesn't go past 32K
-         *
-         * JM: one idea: push the appendix into the stream that the text
-         * extractor provides so it is automatically chunked with the rest of
-         * the content JMTODO: should this really be in the index at all?
-         */ A appendix = extractor.newAppendixProvider();
-
         //Get a stream and a reader for that stream
         try (final InputStream stream = extractor.getInputStream(source);
-                Reader reader = extractor.getReader(stream, source, appendix);) {
+                Reader reader = extractor.getReader(stream, source);) {

             //we read max 1024 chars at time, this seems to max what some Readers would return
             char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
@@ -303,16 +294,8 @@
                     }
                 }

-                StringBuilder sb;
-                if (eof) {
-                    //1000 char buffer is to allow for appendix data with out needing to resize the string builder.
-                    sb = new StringBuilder(chunkSizeInChars + 1000)
-                            .append(textChunkBuf, 0, chunkSizeInChars);
-                    extractor.appendDataToFinalChunk(sb, appendix);
-                } else {
-                    sb = new StringBuilder(chunkSizeInChars)
-                            .append(textChunkBuf, 0, chunkSizeInChars);
-                }
+                StringBuilder sb = new StringBuilder(chunkSizeInChars)
+                        .append(textChunkBuf, 0, chunkSizeInChars);

                 sanitizeToUTF8(sb); //replace non UTF8 chars with '^'

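With the appendix gone, indexText builds every chunk the same way. A simplified sketch of that reader-driven chunking loop, using hypothetical names (MAX_CHARS, indexChunk) and leaving out the real Ingester's incremental 1024-char reads, UTF-8 sanitizing, and Solr upload:

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

class ChunkingSketch {

    static final int MAX_CHARS = 512 * 1024;

    static void chunk(Reader reader) throws IOException {
        char[] textChunkBuf = new char[MAX_CHARS];
        int read;
        while ((read = reader.read(textChunkBuf, 0, textChunkBuf.length)) != -1) {
            // Every chunk is built the same way; no appendix is bolted onto
            // the final chunk anymore.
            StringBuilder sb = new StringBuilder(read).append(textChunkBuf, 0, read);
            indexChunk(sb.toString());
        }
    }

    static void indexChunk(String chunk) {
        System.out.println("chunk of " + chunk.length() + " chars");
    }

    public static void main(String[] args) throws IOException {
        chunk(new StringReader("example content to be chunked"));
    }
}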
KeywordSearchIngestModule.java

@@ -89,7 +89,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     //accessed read-only by searcher thread

     private boolean startedSearching = false;
-    private List<FileTextExtractor<?>> textExtractors;
+    private List<FileTextExtractor> textExtractors;
     private StringsTextExtractor stringExtractor;
     private final KeywordSearchJobSettings settings;
     private boolean initialized = false;
@@ -415,10 +415,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * @throws IngesterException exception thrown if indexing failed
      */
     private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-        FileTextExtractor<?> extractor = null;
+        FileTextExtractor extractor = null;

         //go over available text extractors in order, and pick the first one (most specific one)
-        for (FileTextExtractor<?> fe : textExtractors) {
+        for (FileTextExtractor fe : textExtractors) {
             if (fe.isSupported(aFile, detectedFormat)) {
                 extractor = fe;
                 break;
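extractTextAndIndex keeps its first-match selection: extractors are tried in order and the first one whose isSupported accepts the file wins. A hypothetical sketch of that pattern; the strings-based fallback is an assumption suggested by the stringExtractor field above, not something shown in this hunk:

import java.util.List;

class SelectionSketch {

    interface Extractor {
        boolean supports(String mimeType);
    }

    static Extractor pick(List<Extractor> extractors, Extractor fallback, String mimeType) {
        for (Extractor fe : extractors) {
            if (fe.supports(mimeType)) {
                return fe; // first (most specific) supported extractor wins
            }
        }
        return fallback; // e.g. a strings-based extractor (assumed)
    }

    public static void main(String[] args) {
        Extractor html = mimeType -> mimeType.contains("html");
        Extractor anything = mimeType -> true;
        System.out.println(pick(List.of(html), anything, "text/html") == html);
    }
}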
StringsTextExtractor.java

@@ -37,7 +37,7 @@ import org.sleuthkit.datamodel.TskException;
  * with the original source file) up to 1MB then and indexes chunks as text with
  * Solr.
  */
-class StringsTextExtractor extends FileTextExtractor<Void> {
+class StringsTextExtractor extends FileTextExtractor {
     /**
      * Common options that can be used by some extractors
      */
@@ -105,7 +105,7 @@ class StringsTextExtractor extends FileTextExtractor<Void> {
     }

     @Override
-    InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile, Void appendix) throws Ingester.IngesterException {
+    InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
         return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
     }

@@ -145,12 +145,6 @@ class StringsTextExtractor extends FileTextExtractor<Void> {
         return true;
     }

-
-    @Override
-    Void newAppendixProvider() {
-        return null;
-    }
-
     /**
      * AbstractFile input string stream reader/converter - given AbstractFile,
      * extract strings from it and return encoded bytes via read()
TextExtractor.java

@@ -24,7 +24,7 @@ import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.SleuthkitVisitableItem;

-abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisitableItem> {
+abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {

     static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());
     abstract boolean noExtractionOptionsAreEnabled();
@@ -33,15 +33,9 @@ abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisit
         logger.log(Level.WARNING, msg, ex); //NON-NLS }
     }

-    void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider) {
-        //no-op
-    }
-
-    abstract AppendixProvider newAppendixProvider();
-
     abstract InputStream getInputStream(TextSource source);

-    abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
+    abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;

     abstract long getID(TextSource source);

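Pulling the hunks above together, the simplified contract is roughly the following. Method names are kept, but the class is renamed TextExtractorSketch to mark it as a sketch, and the SleuthkitVisitableItem bound, the logWarning helper, and Ingester.IngesterException (Exception here) are omitted:

import java.io.InputStream;
import java.io.Reader;

abstract class TextExtractorSketch<TextSource> {

    abstract boolean noExtractionOptionsAreEnabled();

    abstract InputStream getInputStream(TextSource source);

    abstract Reader getReader(InputStream stream, TextSource source) throws Exception;

    abstract long getID(TextSource source);

    abstract String getName(TextSource source);
}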
TikaTextExtractor.java

@@ -18,6 +18,8 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;

+import com.google.common.io.CharSource;
+import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
 import java.util.List;
@@ -39,16 +41,15 @@ import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;

 /**
- * Extractor of text from TIKA supported AbstractFile content. Extracted text is
- * divided into chunks and indexed with Solr. Protects against Tika parser hangs
- * (for unexpected/corrupt content) using a timeout mechanism. If Tika
- * extraction succeeds, chunks are indexed with Solr.
+ * Extractor of text from TIKA supported AbstractFile content. Extracted text
+ * will be divided into chunks and indexed with Solr. Protects against Tika
+ * parser hangs (for unexpected/corrupt content) using a timeout mechanism. If
+ * Tika extraction succeeds, chunks are indexed with Solr.
  *
  * This Tika extraction/chunking utility is useful for large files of Tika
  * parsers-supported content type.
  *
  */
-class TikaTextExtractor extends FileTextExtractor<Metadata> {
+class TikaTextExtractor extends FileTextExtractor {

     private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
@@ -66,27 +67,15 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
     }

-    @Override
-    Metadata newAppendixProvider() {
-        return new Metadata();
-    }
-
-    @Override
-    public void appendDataToFinalChunk(StringBuilder sb, Metadata meta) {
-
-        //TODO: How do we account for this in chunking algorithm...
-        //JM: what if we always append it as a separate chunk?
-        sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
-        Stream.of(meta.names()).sorted().forEach(key -> {
-            sb.append(key).append(": ").append(meta.get(key)).append("\n");
-        });
-    }
-
     @Override
-    Reader getReader(final InputStream stream, AbstractFile sourceFile, Metadata meta) throws IngesterException, MissingResourceException {
-        //Parse the file in a task
-        final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, meta));
+    Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
+        Metadata metadata = new Metadata();
+        //Parse the file in a task, a convenient way to have a timeout...
+        final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
         try {
-            return future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            CharSource metaDataCharSource = getMetaDataCharSource(metadata);
+            //concatenate parsed content and meta data into a single reader.
+            return CharSource.concat(new ReaderCharSource(tikaReader), metaDataCharSource).openStream();
         } catch (TimeoutException te) {
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
             logWarning(msg, te);
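The parse runs on the single-thread tikaParseExecutor and the caller bounds the wait with Future.get, which is what protects against parser hangs. A self-contained sketch of just that mechanism, with a placeholder parse() and a fixed timeout instead of the size-based getTimeout():

import java.io.Reader;
import java.io.StringReader;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

class TimeoutSketch {

    private static final ExecutorService executor = Executors.newSingleThreadExecutor();

    static Reader parseWithTimeout(long timeoutSeconds) throws Exception {
        // Run the (potentially hanging) parse off-thread...
        Future<Reader> future = executor.submit(() -> parse());
        try {
            // ...and bound how long we are willing to wait for it.
            return future.get(timeoutSeconds, TimeUnit.SECONDS);
        } catch (TimeoutException te) {
            future.cancel(true); // give up on a hung parser
            throw new Exception("parse timed out", te);
        }
    }

    static Reader parse() {
        return new StringReader("parsed text"); // stand-in for new Tika().parse(stream, metadata)
    }

    public static void main(String[] args) throws Exception {
        System.out.println((char) parseWithTimeout(30).read());
        executor.shutdown();
    }
}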
@@ -99,8 +88,24 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
         }
     }

-    @Override
+    /**
+     * Get a CharSource that wraps a formated representation of the given
+     * Metadata.
+     *
+     * @param metadata The Metadata to wrap as a CharSource
+     *
+     * @returna CharSource for the given MetaData
+     */
+    static private CharSource getMetaDataCharSource(Metadata metadata) {
+        return CharSource.wrap(
+                new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
+                .append(Stream.of(metadata.names()).sorted()
+                        .map(key -> key + ": " + metadata.get(key))
+                        .collect(Collectors.joining("\n"))
+                ));
+    }
+
+    @Override
     public boolean isContentTypeSpecific() {
         return true;
     }
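getMetaDataCharSource produces a banner followed by sorted key: value lines, one per metadata entry. A small illustration using Tika's Metadata directly; the sample keys and values are made up:

import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.tika.metadata.Metadata;

class MetadataFormatSketch {

    public static void main(String[] args) {
        Metadata metadata = new Metadata();
        metadata.add("Content-Type", "application/pdf");
        metadata.add("Author", "example");

        // Same formatting as getMetaDataCharSource: banner, then sorted entries.
        String formatted = "\n\n------------------------------METADATA------------------------------\n\n"
                + Stream.of(metadata.names()).sorted()
                        .map(key -> key + ": " + metadata.get(key))
                        .collect(Collectors.joining("\n"));
        System.out.println(formatted);
        // Author: example
        // Content-Type: application/pdf
    }
}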
@@ -130,8 +135,9 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
     boolean noExtractionOptionsAreEnabled() {
         return false;
     }
+
     /**
-     * return timeout that should be used to index the content
+     * Return timeout that should be used to index the content.
      *
      * @param size size of the content
      *
@@ -152,4 +158,22 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
         }
     }

+    /**
+     * An implementation of CharSource that just wraps an existing reader and
+     * returns it in openStream().
+     */
+    private static class ReaderCharSource extends CharSource {
+
+        private final Reader reader;
+
+        public ReaderCharSource(Reader reader) {
+            this.reader = reader;
+        }
+
+        @Override
+        public Reader openStream() throws IOException {
+            return reader;
+        }
+    }
+
 }
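Taken together with getReader above: wrap the live Tika Reader in a CharSource, concatenate the metadata CharSource after it, and hand back one Reader over both, so the Ingester chunks and indexes the metadata like any other text. A sketch under those assumptions, using Guava's CharSource with placeholder content:

import com.google.common.io.CharSource;
import com.google.common.io.CharStreams;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

class ConcatSketch {

    public static void main(String[] args) throws IOException {
        Reader tikaReader = new StringReader("extracted document text");
        CharSource metaDataCharSource = CharSource.wrap("\n\n--METADATA--\nAuthor: example");

        // One reader over content followed by metadata.
        Reader combined = CharSource.concat(
                new ReaderCharSource(tikaReader), metaDataCharSource).openStream();

        // The Ingester chunks this single reader, so the metadata is chunked
        // and indexed like any other text.
        System.out.println(CharStreams.toString(combined));
    }

    // Same idea as the private class added above: a CharSource view of an
    // already-open Reader.
    private static class ReaderCharSource extends CharSource {

        private final Reader reader;

        ReaderCharSource(Reader reader) {
            this.reader = reader;
        }

        @Override
        public Reader openStream() {
            return reader;
        }
    }
}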