move all 'appendix' related code into TikaTextExtractor and simplify TextExtractor interface.

parent 8841f6e773
commit f56c2b43c8
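Every subclass change below follows from one decision: instead of each TextExtractor handing Ingester an "appendix" object to bolt onto the final chunk, TikaTextExtractor now splices its Metadata directly into the Reader it returns, so the appendix machinery can be deleted everywhere else. The following condensed sketch of the resulting contract is an illustration assembled from the hunks below, not a verbatim copy of the file (the real class bounds TextSource to SleuthkitVisitableItem and getReader throws Ingester.IngesterException):

import java.io.InputStream;
import java.io.Reader;

// Illustrative condensation of the post-change TextExtractor contract:
// one type parameter, no newAppendixProvider()/appendDataToFinalChunk()
// hooks, and getReader() takes only the stream and its source.
abstract class TextExtractorSketch<TextSource> {

    abstract InputStream getInputStream(TextSource source);

    // Any metadata "appendix" must already be inside the returned Reader.
    abstract Reader getReader(InputStream stream, TextSource source) throws Exception;

    abstract long getID(TextSource source);

    abstract String getName(TextSource source);
}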
ArtifactTextExtractor.java

@@ -33,7 +33,7 @@ import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.SleuthkitCase;
 import org.sleuthkit.datamodel.TskCoreException;
 
-public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifact> {
+public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
 
     static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
 
     static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
@@ -70,10 +70,6 @@ public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifact> {
         return false;
     }
 
-    @Override
-    Void newAppendixProvider() {
-        return null;
-    }
 
     @Override
     InputStream getInputStream(BlackboardArtifact artifact) {
@@ -118,7 +114,7 @@ public class ArtifactTextExtractor extends TextExtractor<Void, BlackboardArtifact> {
     }
 
     @Override
-    Reader getReader(InputStream stream, BlackboardArtifact source, Void appendix) throws Ingester.IngesterException {
+    Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
         return new InputStreamReader(stream);
     }
 
FileTextExtractor.java

@@ -18,6 +18,8 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
+import java.io.InputStream;
+import java.io.Reader;
 import java.util.Arrays;
 import java.util.List;
 import org.sleuthkit.datamodel.AbstractFile;
@@ -26,7 +28,7 @@ import org.sleuthkit.datamodel.AbstractFile;
  * Common methods for utilities that extract text and content and divide into
  * chunks
  */
-abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {
+abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
 
 
     static final List<String> BLOB_MIME_TYPES
@@ -93,6 +95,9 @@ abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {
      */
     abstract boolean isSupported(AbstractFile file, String detectedFormat);
 
+    @Override
+    abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;
+
     @Override
     long getID(AbstractFile source) {
         return source.getId();
@@ -103,4 +108,5 @@ abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {
     String getName(AbstractFile source) {
         return source.getName();
     }
+
 }
HtmlTextExtractor.java

@@ -37,7 +37,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * divided into chunks and indexed with Solr. If HTML extraction succeeds,
  * chunks are indexed with Solr.
  */
-class HtmlTextExtractor extends FileTextExtractor<Void> {
+class HtmlTextExtractor extends FileTextExtractor {
 
     static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
     private static final int MAX_SIZE = 50000000;
@@ -54,7 +54,6 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
     HtmlTextExtractor() {
     }
 
-
     @Override
     boolean isContentTypeSpecific() {
         return true;
@@ -76,7 +75,7 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
     * @throws IOException if There is an IOException parsing the input stream.
     */
     @Override
-    Reader getReader(InputStream in, AbstractFile sourceFile, Void v) throws Ingester.IngesterException {
+    Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
         try {
             StringBuilder scripts = new StringBuilder();
             StringBuilder links = new StringBuilder();
@@ -172,10 +171,6 @@ class HtmlTextExtractor extends FileTextExtractor<Void> {
     }
 
     @Override
-    Void newAppendixProvider() {
-        return null;
-    }
-
     InputStream getInputStream(AbstractFile sourceFile1) {
         return new ReadContentInputStream(sourceFile1);
     }
Ingester.java

@@ -242,7 +242,7 @@ class Ingester {
      *
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
-    <A, T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<A, T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
+    < T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
         final long sourceID = extractor.getID(source);
         final String sourceName = extractor.getName(source);
 
@@ -255,18 +255,9 @@ class Ingester {
         }
 
         Map<String, String> fields = getContentFields(source);
-        // the appendix will be used to add "meta data" to the end of the last chunk
-        /* JMTODO: we need to figure out how to account for this so the last
-         * chunk doesn't go past 32K
-         *
-         * JM: one idea: push the appendix into the stream that the text
-         * extractor provides so it is automatically chunked with the rest of
-         * the content JMTODO: should this really be in the index at all?
-         */ A appendix = extractor.newAppendixProvider();
-
         //Get a stream and a reader for that stream
         try (final InputStream stream = extractor.getInputStream(source);
-                Reader reader = extractor.getReader(stream, source, appendix);) {
+                Reader reader = extractor.getReader(stream, source);) {
 
             //we read max 1024 chars at time, this seems to max what some Readers would return
             char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
@@ -303,16 +294,8 @@ class Ingester {
                     }
                 }
 
-                StringBuilder sb;
-                if (eof) {
-                    //1000 char buffer is to allow for appendix data with out needing to resize the string builder.
-                    sb = new StringBuilder(chunkSizeInChars + 1000)
-                            .append(textChunkBuf, 0, chunkSizeInChars);
-                    extractor.appendDataToFinalChunk(sb, appendix);
-                } else {
-                    sb = new StringBuilder(chunkSizeInChars)
-                            .append(textChunkBuf, 0, chunkSizeInChars);
-                }
+                StringBuilder sb = new StringBuilder(chunkSizeInChars)
+                        .append(textChunkBuf, 0, chunkSizeInChars);
 
                 sanitizeToUTF8(sb); //replace non UTF8 chars with '^'
 
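The deleted JMTODO block had already suggested the fix this commit implements: push the appendix into the stream the extractor provides so it is chunked with the rest of the content. With that done, indexText() no longer needs the eof special case or the extra 1000-char headroom. A minimal stand-alone sketch of the simplified caller side, with hypothetical stand-in types rather than Autopsy's real classes:

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

// Hypothetical mirror of the simplified indexText() loop: one Reader, one
// StringBuilder sized exactly to the chunk, no appendix bookkeeping.
class ChunkingSketch {

    interface Extractor<T> {
        InputStream getInputStream(T source);

        Reader getReader(InputStream stream, T source) throws Exception;
    }

    static <T> void indexText(Extractor<T> extractor, T source) throws Exception {
        try (InputStream stream = extractor.getInputStream(source);
                Reader reader = extractor.getReader(stream, source)) {
            char[] textChunkBuf = new char[512 * 1024];
            int chunkSizeInChars;
            while ((chunkSizeInChars = reader.read(textChunkBuf)) != -1) {
                // Metadata, if any, is already in the stream, so every chunk
                // is built the same way, including the last one.
                StringBuilder sb = new StringBuilder(chunkSizeInChars)
                        .append(textChunkBuf, 0, chunkSizeInChars);
                System.out.println("indexed chunk of " + sb.length() + " chars");
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Extractor<String> extractor = new Extractor<String>() {
            public InputStream getInputStream(String s) {
                return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8));
            }

            public Reader getReader(InputStream stream, String s) {
                return new InputStreamReader(stream, StandardCharsets.UTF_8);
            }
        };
        indexText(extractor, "some extracted text plus inlined metadata");
    }
}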
KeywordSearchIngestModule.java

@@ -89,7 +89,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     //accessed read-only by searcher thread
 
     private boolean startedSearching = false;
-    private List<FileTextExtractor<?>> textExtractors;
+    private List<FileTextExtractor> textExtractors;
     private StringsTextExtractor stringExtractor;
     private final KeywordSearchJobSettings settings;
     private boolean initialized = false;
@@ -415,10 +415,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * @throws IngesterException exception thrown if indexing failed
      */
     private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-        FileTextExtractor<?> extractor = null;
+        FileTextExtractor extractor = null;
 
         //go over available text extractors in order, and pick the first one (most specific one)
-        for (FileTextExtractor<?> fe : textExtractors) {
+        for (FileTextExtractor fe : textExtractors) {
             if (fe.isSupported(aFile, detectedFormat)) {
                 extractor = fe;
                 break;
StringsTextExtractor.java

@@ -37,7 +37,7 @@ import org.sleuthkit.datamodel.TskException;
 * with the original source file) up to 1MB then and indexes chunks as text with
 * Solr.
 */
-class StringsTextExtractor extends FileTextExtractor<Void> {
+class StringsTextExtractor extends FileTextExtractor {
     /**
      * Common options that can be used by some extractors
      */
@@ -105,7 +105,7 @@ class StringsTextExtractor extends FileTextExtractor<Void> {
     }
 
     @Override
-    InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile, Void appendix) throws Ingester.IngesterException {
+    InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
         return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
     }
 
@@ -145,12 +145,6 @@ class StringsTextExtractor extends FileTextExtractor<Void> {
         return true;
     }
 
-
-    @Override
-    Void newAppendixProvider() {
-        return null;
-    }
-
     /**
      * AbstractFile input string stream reader/converter - given AbstractFile,
      * extract strings from it and return encoded bytes via read()
TextExtractor.java

@@ -24,7 +24,7 @@ import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 
-abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisitableItem> {
+abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
 
     static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());
     abstract boolean noExtractionOptionsAreEnabled();
@@ -33,15 +33,9 @@ abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisitableItem> {
         logger.log(Level.WARNING, msg, ex); //NON-NLS
     }
 
-    void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider) {
-        //no-op
-    }
-
-    abstract AppendixProvider newAppendixProvider();
-
     abstract InputStream getInputStream(TextSource source);
 
-    abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
+    abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;
 
     abstract long getID(TextSource source);
 
TikaTextExtractor.java

@@ -18,6 +18,8 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
+import com.google.common.io.CharSource;
+import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
 import java.util.List;
@@ -39,16 +41,15 @@ import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
- * Extractor of text from TIKA supported AbstractFile content. Extracted text is
- * divided into chunks and indexed with Solr. Protects against Tika parser hangs
- * (for unexpected/corrupt content) using a timeout mechanism. If Tika
- * extraction succeeds, chunks are indexed with Solr.
+ * Extractor of text from TIKA supported AbstractFile content. Extracted text
+ * will be divided into chunks and indexed with Solr. Protects against Tika
+ * parser hangs (for unexpected/corrupt content) using a timeout mechanism. If
+ * Tika extraction succeeds, chunks are indexed with Solr.
  *
  * This Tika extraction/chunking utility is useful for large files of Tika
 * parsers-supported content type.
- *
 */
-class TikaTextExtractor extends FileTextExtractor<Metadata> {
+class TikaTextExtractor extends FileTextExtractor {
 
     private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
@@ -66,27 +67,15 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
     }
 
     @Override
-    Metadata newAppendixProvider() {
-        return new Metadata();
-    }
-
-    @Override
-    public void appendDataToFinalChunk(StringBuilder sb, Metadata meta) {
-
-        //TODO: How do we account for this in chunking algorithm...
-        //JM: what if we always append it as a separate chunk?
-        sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
-        Stream.of(meta.names()).sorted().forEach(key -> {
-            sb.append(key).append(": ").append(meta.get(key)).append("\n");
-        });
-    }
-
-    @Override
-    Reader getReader(final InputStream stream, AbstractFile sourceFile, Metadata meta) throws IngesterException, MissingResourceException {
-        //Parse the file in a task
-        final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, meta));
+    Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
+        Metadata metadata = new Metadata();
+        //Parse the file in a task, a convenient way to have a timeout...
+        final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
         try {
-            return future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            CharSource metaDataCharSource = getMetaDataCharSource(metadata);
+            //concatenate parsed content and meta data into a single reader.
+            return CharSource.concat(new ReaderCharSource(tikaReader), metaDataCharSource).openStream();
        } catch (TimeoutException te) {
            final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
            logWarning(msg, te);
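Two things happen in the new getReader(): the parse runs on tikaParseExecutor so future.get(timeout, ...) can abandon a hung parser, and the resulting Reader is concatenated with the metadata before anything downstream sees it. A stand-alone sketch of the timeout half follows, with a sleeping stand-in for new Tika().parse(stream, metadata); the concatenation half is sketched after the ReaderCharSource hunk further below.

import java.io.Reader;
import java.io.StringReader;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

class ParseTimeoutSketch {
    public static void main(String[] args) throws Exception {
        ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
        // Stand-in for new Tika().parse(stream, metadata); a real parse of
        // corrupt content can hang, which is what the Future guards against.
        Future<Reader> future = tikaParseExecutor.submit(() -> {
            Thread.sleep(100); // simulate parsing work
            return new StringReader("parsed text");
        });
        try {
            Reader tikaReader = future.get(60, TimeUnit.SECONDS);
            System.out.println("parser returned in time: " + tikaReader.ready());
        } catch (TimeoutException te) {
            future.cancel(true); // give up on the hung parse
        } finally {
            tikaParseExecutor.shutdownNow();
        }
    }
}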
@@ -99,8 +88,24 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
         }
     }
 
-    @Override
+    /**
+     * Get a CharSource that wraps a formated representation of the given
+     * Metadata.
+     *
+     * @param metadata The Metadata to wrap as a CharSource
+     *
+     * @return a CharSource for the given MetaData
+     */
+    static private CharSource getMetaDataCharSource(Metadata metadata) {
+        return CharSource.wrap(
+                new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
+                .append(Stream.of(metadata.names()).sorted()
+                        .map(key -> key + ": " + metadata.get(key))
+                        .collect(Collectors.joining("\n"))
+                ));
+    }
 
+    @Override
     public boolean isContentTypeSpecific() {
         return true;
     }
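The formatting itself is just a sorted key/value join; the CharSource.wrap() call around it only defers materialization until openStream(). A tiny illustration of the same pipeline, with made-up metadata keys standing in for Tika's Metadata.names()/get():

import java.util.Map;
import java.util.stream.Collectors;

class MetadataBlockSketch {
    public static void main(String[] args) {
        // Stand-in for Tika's Metadata; keys and values here are made up.
        Map<String, String> metadata = Map.of("Content-Type", "text/html", "Author", "jdoe");
        String block = "\n\n------------------------------METADATA------------------------------\n\n"
                + metadata.keySet().stream().sorted()
                        .map(key -> key + ": " + metadata.get(key))
                        .collect(Collectors.joining("\n"));
        System.out.println(block);
    }
}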
@@ -130,8 +135,9 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
     boolean noExtractionOptionsAreEnabled() {
         return false;
     }
+
     /**
-     * return timeout that should be used to index the content
+     * Return timeout that should be used to index the content.
      *
      * @param size size of the content
      *
@@ -152,4 +158,22 @@ class TikaTextExtractor extends FileTextExtractor<Metadata> {
         }
 
     }
+
+    /**
+     * An implementation of CharSource that just wraps an existing reader and
+     * returns it in openStream().
+     */
+    private static class ReaderCharSource extends CharSource {
+
+        private final Reader reader;
+
+        public ReaderCharSource(Reader reader) {
+            this.reader = reader;
+        }
+
+        @Override
+        public Reader openStream() throws IOException {
+            return reader;
+        }
+    }
 }
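ReaderCharSource exists because Guava's CharSource.wrap() only accepts an in-memory CharSequence, while Tika hands back a streaming Reader; wrapping the Reader lets CharSource.concat() splice the two sources lazily. Note the wrapper is effectively single-use, since openStream() always returns the same Reader. A short usage sketch under those assumptions:

import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

class ConcatSketch {

    // Same shape as the commit's ReaderCharSource: openStream() hands back the
    // one wrapped Reader, so this CharSource can only be consumed once.
    static CharSource readerCharSource(Reader reader) {
        return new CharSource() {
            @Override
            public Reader openStream() {
                return reader;
            }
        };
    }

    public static void main(String[] args) throws IOException {
        Reader parsed = new StringReader("parsed document text");
        CharSource combined = CharSource.concat(
                readerCharSource(parsed),
                CharSource.wrap("\n\n[metadata would follow here]"));
        System.out.println(combined.read()); // both parts read as one stream
    }
}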