Mirror of https://github.com/overcuriousity/autopsy-flatpak.git, synced 2025-07-17 10:17:41 +00:00

Commit eb8422ca1e: Merge develop into search_improvements with 32K chunks

ArtifactTextExtractor.java

@@ -38,7 +38,7 @@ import org.sleuthkit.datamodel.TskCoreException;
 * Extracts text from artifacts by concatenating the values of all of the
 * artifact's attributes.
 */
-public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
+class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
 static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());

 /**

@@ -82,13 +82,16 @@ public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
 }

 @Override
-boolean isDisabled() {
+public boolean isDisabled() {
 return false;
 }
+
+@Override
+public void logWarning(final String msg, Exception ex) {
+logger.log(Level.WARNING, msg, ex); //NON-NLS }
+}

-@Override
-InputStream getInputStream(BlackboardArtifact artifact) {
+private InputStream getInputStream(BlackboardArtifact artifact) {
 // Concatenate the string values of all attributes into a single
 // "content" string to be indexed.
 StringBuilder artifactContents = new StringBuilder();

@@ -127,17 +130,17 @@ public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
 }

 @Override
-Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
+public Reader getReader(BlackboardArtifact source) throws Ingester.IngesterException {
-return new InputStreamReader(stream, StandardCharsets.UTF_8);
+return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
 }

 @Override
-long getID(BlackboardArtifact source) {
+public long getID(BlackboardArtifact source) {
 return source.getArtifactID();
 }

 @Override
-String getName(BlackboardArtifact source) {
+public String getName(BlackboardArtifact source) {
 return source.getDisplayName() + "_" + source.getArtifactID();
 }
 }

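A note on the API change visible in this hunk: getReader() no longer takes a caller-supplied InputStream; the extractor now opens its own stream internally (here via getInputStream(source)). A minimal caller-side sketch of the new contract (the artifact variable and the plain read loop are illustrative only, not part of this commit):

    ArtifactTextExtractor extractor = new ArtifactTextExtractor();
    try (Reader reader = extractor.getReader(artifact)) { // stream creation now happens inside the extractor
        char[] buf = new char[1024];
        int n;
        while ((n = reader.read(buf)) != -1) {
            // consume the extracted text, e.g. feed it to the Chunker shown in Ingester.java below
        }
    } catch (Ingester.IngesterException | IOException ex) {
        // extraction or read failure; real callers report this via the extractor's logWarning()
    }
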
FileTextExtractor.java

@@ -18,7 +18,6 @@
 */
 package org.sleuthkit.autopsy.keywordsearch;

-import java.io.InputStream;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.List;

@@ -28,7 +27,7 @@ import org.sleuthkit.datamodel.AbstractFile;
 * Common methods for utilities that extract text and content and divide into
 * chunks
 */
-abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
+abstract class FileTextExtractor implements TextExtractor< AbstractFile> {


 static final List<String> BLOB_MIME_TYPES

@@ -96,17 +95,16 @@ abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
 abstract boolean isSupported(AbstractFile file, String detectedFormat);

 @Override
-abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;
+public abstract Reader getReader(AbstractFile source) throws Ingester.IngesterException;

 @Override
-long getID(AbstractFile source) {
+public long getID(AbstractFile source) {
 return source.getId();
 }

-
 @Override
-String getName(AbstractFile source) {
+public String getName(AbstractFile source) {
 return source.getName();
 }

 }

HtmlTextExtractor.java

@@ -19,16 +19,17 @@
 package org.sleuthkit.autopsy.keywordsearch;

 import java.io.IOException;
-import java.io.InputStream;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 import java.util.List;
+import java.util.logging.Level;
 import net.htmlparser.jericho.Attributes;
 import net.htmlparser.jericho.Renderer;
 import net.htmlparser.jericho.Source;
 import net.htmlparser.jericho.StartTag;
 import net.htmlparser.jericho.StartTagType;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;

@@ -37,6 +38,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 */
 class HtmlTextExtractor extends FileTextExtractor {

+static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
 private static final int MAX_SIZE = 50_000_000; //50MB

 static final List<String> WEB_MIME_TYPES = Arrays.asList(

@@ -61,7 +63,9 @@ class HtmlTextExtractor extends FileTextExtractor {
 }

 @Override
-Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
+public Reader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
+ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
+
 //Parse the stream with Jericho and put the results in a Reader
 try {
 StringBuilder scripts = new StringBuilder();

@@ -75,7 +79,7 @@ class HtmlTextExtractor extends FileTextExtractor {
 int numComments = 0;
 int numOthers = 0;

-Source source = new Source(in);
+Source source = new Source(stream);
 source.fullSequentialParse();
 Renderer renderer = source.getRenderer();
 renderer.setNewLine("\n");

@@ -158,12 +162,11 @@ class HtmlTextExtractor extends FileTextExtractor {
 }

 @Override
-InputStream getInputStream(AbstractFile sourceFile1) {
-return new ReadContentInputStream(sourceFile1);
-}
-
-@Override
-boolean isDisabled() {
+public boolean isDisabled() {
 return false;
 }

+public void logWarning(final String msg, Exception ex) {
+logger.log(Level.WARNING, msg, ex); //NON-NLS }
+}
 }

Ingester.java

@@ -18,11 +18,13 @@
 */
 package org.sleuthkit.autopsy.keywordsearch;

+import com.google.common.base.Utf8;
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.Reader;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.Map;
+import java.util.NoSuchElementException;
 import java.util.logging.Level;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.common.SolrInputDocument;

@@ -54,9 +56,6 @@ class Ingester {
 private final Server solrServer = KeywordSearch.getServer();
 private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
 private static Ingester instance;
-private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
-private static final int SINGLE_READ_CHARS = 1024;
-private static final int EXTRA_CHARS = 128;

 private Ingester() {
 }

@@ -120,6 +119,136 @@ class Ingester {
 return item.accept(SOLR_FIELDS_VISITOR);
 }

+/**
+* Use the given TextExtractor to extract text from the given source. The
+* text will be chunked and each chunk passed to Solr to add to the index.
+*
+*
+* @param <A> The type of the Appendix provider that provides
+* additional text to append to the final chunk.
+* @param <T> A subclass of SleuthkitVisibleItem.
+* @param extractor The TextExtractor that will be used to extract text from
+* the given source.
+* @param source The source from which text will be extracted, chunked,
+* and indexed.
+* @param context The ingest job context that can be used to cancel this
+* process.
+*
+* @return True if this method executed normally. or False if there was an
+* unexpected exception. //JMTODO: This policy needs to be reviewed.
+*
+* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
+*/
+< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
+final long sourceID = extractor.getID(source);
+final String sourceName = extractor.getName(source);
+
+int numChunks = 0; //unknown until chunking is done
+
+if (extractor.isDisabled()) {
+/* some Extrctors, notable the strings extractor, have options which
+* can be configured such that no extraction should be done */
+return true;
+}
+
+Map<String, String> fields = getContentFields(source);
+//Get a reader for the content of the given source
+try (Reader reader = extractor.getReader(source);) {
+Chunker chunker = new Chunker(reader);
+
+for (Chunk chunk : chunker) {
+String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
+fields.put(Server.Schema.ID.toString(), chunkId);
+try {
+//add the chunk text to Solr index
+indexChunk(chunk.getText().toString(), sourceName, fields);
+numChunks++;
+} catch (Ingester.IngesterException ingEx) {
+extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
++ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
+
+throw ingEx; //need to rethrow to signal error and move on
+} catch (Exception ex) {
+throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
+}
+}
+} catch (IOException ex) {
+extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
+return false;
+} catch (Exception ex) {
+extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
+return false;
+} finally {
+//after all chunks, index just the meta data, including the numChunks, of the parent file
+fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
+fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
+indexChunk(null, sourceName, fields);
+}
+
+return true;
+}
+
+/**
+* Add one chunk as to the Solr index as a seperate sold document.
+*
+* TODO see if can use a byte or string streaming way to add content to
+* /update handler e.g. with XMLUpdateRequestHandler (deprecated in SOlr
+* 4.0.0), see if possible to stream with UpdateRequestHandler
+*
+* @param chunk The chunk content as a string
+* @param fields
+* @param size
+*
+* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
+*/
+private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
+if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
+//JMTODO: actually if the we couldn't get the image id it is set to -1,
+// but does this really mean we don't want to index it?
+
+//skip the file, image id unknown
+//JMTODO: does this need to ne internationalized?
+String msg = NbBundle.getMessage(Ingester.class,
+"Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
+logger.log(Level.SEVERE, msg);
+throw new IngesterException(msg);
+}
+
+//Make a SolrInputDocument out of the field map
+SolrInputDocument updateDoc = new SolrInputDocument();
+for (String key : fields.keySet()) {
+updateDoc.addField(key, fields.get(key));
+}
+//add the content to the SolrInputDocument
+//JMTODO: can we just add it to the field map before passing that in?
+updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
+
+try {
+//TODO: consider timeout thread, or vary socket timeout based on size of indexed content
+solrServer.addDocument(updateDoc);
+uncommitedIngests = true;
+
+} catch (KeywordSearchModuleException ex) {
+//JMTODO: does this need to ne internationalized?
+throw new IngesterException(
+NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
+}
+}
+
+/**
+* Tells Solr to commit (necessary before ingested files will appear in
+* searches)
+*/
+void commit() {
+try {
+solrServer.commit();
+uncommitedIngests = false;
+} catch (NoOpenCoreException | SolrServerException ex) {
+logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
+
+}
+}
+
 /**
 * Visitor used to create fields to send to SOLR index.
 */

@@ -221,192 +350,6 @@ class Ingester {
 }
 }

-/**
-* Use the given TextExtractor to extract text from the given source. The
-* text will be chunked and each chunk passed to Solr to add to the index.
-*
-*
-* @param <A> The type of the Appendix provider that provides
-* additional text to append to the final chunk.
-* @param <T> A subclass of SleuthkitVisibleItem.
-* @param extractor The TextExtractor that will be used to extract text from
-* the given source.
-* @param source The source from which text will be extracted, chunked,
-* and indexed.
-* @param context The ingest job context that can be used to cancel this
-* process.
-*
-* @return True if this method executed normally. or False if there was an
-* unexpected exception. //JMTODO: This policy needs to be reviewed.
-*
-* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
-*/
-< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
-final long sourceID = extractor.getID(source);
-final String sourceName = extractor.getName(source);
-
-int numChunks = 0; //unknown until chunking is done
-
-if (extractor.isDisabled()) {
-/* some Extrctors, notable the strings extractor, have options which
-* can be configured such that no extraction should be done */
-return true;
-}
-
-Map<String, String> fields = getContentFields(source);
-//Get a stream and a reader for that stream
-try (final InputStream stream = extractor.getInputStream(source);
-Reader reader = extractor.getReader(stream, source);) {
-
-//we read max 1024 chars at time, this seems to max what some Readers would return
-char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
-
-boolean eof = false; //have we read until the end of the file yet
-while (!eof) {
-int chunkSizeInChars = 0; // the size in chars of the chunk (so far)
-if (context != null && context.fileIngestIsCancelled()) {
-return true;
-}
-long charsRead = 0; // number of chars read in the most recent read operation
-//consume bytes to fill entire chunk (but leave EXTRA_CHARS to end the word)
-while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
-&& (charsRead = reader.read(textChunkBuf, chunkSizeInChars, SINGLE_READ_CHARS)) != -1) {
-chunkSizeInChars += charsRead;
-}
-
-if (charsRead == -1) {
-//this is the last chunk
-eof = true;
-} else {
-chunkSizeInChars += charsRead;
-
-//if we haven't reached the end of the file,
-//try to read char-by-char until whitespace to not break words
-while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - 1)
-&& (Character.isWhitespace(textChunkBuf[chunkSizeInChars - 1]) == false)
-&& (charsRead = reader.read(textChunkBuf, chunkSizeInChars, 1)) != -1) {
-chunkSizeInChars += charsRead;
-}
-if (charsRead == -1) {
-//this is the last chunk
-eof = true;
-}
-}
-
-StringBuilder sb = new StringBuilder(chunkSizeInChars)
-.append(textChunkBuf, 0, chunkSizeInChars);
-
-sanitizeToUTF8(sb); //replace non UTF8 chars with '^'
-
-String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
-fields.put(Server.Schema.ID.toString(), chunkId);
-try {
-//pass the chunk to method that adds it to Solr index
-indexChunk(sb.toString(), sourceName, fields);
-numChunks++;
-} catch (Ingester.IngesterException ingEx) {
-extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
-+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
-
-throw ingEx; //need to rethrow to signal error and move on
-} catch (Exception ex) {
-throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
-}
-}
-} catch (IOException ex) {
-extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
-return false;
-} catch (Exception ex) {
-extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
-return false;
-} finally {
-//after all chunks, index just the meta data, including the numChunks, of the parent file
-fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
-fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
-indexChunk(null, sourceName, fields);
-}
-return true;
-}
-
-/**
-* Sanitize the given StringBuilder by replacing non-UTF-8 characters with
-* caret '^'
-*
-* @param sb the StringBuilder to sanitize
-*
-* //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
-* function?
-*/
-private static void sanitizeToUTF8(StringBuilder sb) {
-final int length = sb.length();
-
-// Sanitize by replacing non-UTF-8 characters with caret '^'
-for (int i = 0; i < length; i++) {
-if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
-sb.replace(i, i + 1, "^");
-}
-}
-}
-
-/**
-* Add one chunk as to the Solr index as a seperate sold document.
-*
-* TODO see if can use a byte or string streaming way to add content to
-* /update handler e.g. with XMLUpdateRequestHandler (deprecated in SOlr
-* 4.0.0), see if possible to stream with UpdateRequestHandler
-*
-* @param chunk The chunk content as a string
-* @param fields
-* @param size
-*
-* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
-*/
-private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
-if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
-//JMTODO: actually if the we couldn't get the image id it is set to -1,
-// but does this really mean we don't want to index it?
-
-//skip the file, image id unknown
-//JMTODO: does this need to ne internationalized?
-String msg = NbBundle.getMessage(Ingester.class,
-"Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
-logger.log(Level.SEVERE, msg);
-throw new IngesterException(msg);
-}
-
-//Make a SolrInputDocument out of the field map
-SolrInputDocument updateDoc = new SolrInputDocument();
-for (String key : fields.keySet()) {
-updateDoc.addField(key, fields.get(key));
-}
-//add the content to the SolrInputDocument
-//JMTODO: can we just add it to the field map before passing that in?
-updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
-
-try {
-//TODO: consider timeout thread, or vary socket timeout based on size of indexed content
-solrServer.addDocument(updateDoc);
-uncommitedIngests = true;
-} catch (KeywordSearchModuleException ex) {
-//JMTODO: does this need to ne internationalized?
-throw new IngesterException(
-NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
-}
-}
-
-/**
-* Tells Solr to commit (necessary before ingested files will appear in
-* searches)
-*/
-void commit() {
-try {
-solrServer.commit();
-uncommitedIngests = false;
-} catch (NoOpenCoreException | SolrServerException ex) {
-logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
-}
-}
-
 /**
 * Indicates that there was an error with the specific ingest operation, but
 * it's still okay to continue ingesting files.

@@ -423,4 +366,146 @@ class Ingester {
 super(message);
 }
 }
+
+}
+
+class Chunk {
+private final StringBuilder sb;
+private final int chunksize;
+
+Chunk(StringBuilder sb, int chunksize) {
+this.sb = sb;
+this.chunksize = chunksize;
+}
+
+StringBuilder getText() {
+return sb;
+}
+
+int getSize() {
+return chunksize;
+}
+}
+
+/**
+* Encapsulates the content chunking algorithm in implementation of the Iterator
+* interface.
+*/
+class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
+
+private static final int INITIAL_CHUNK_SIZE = 32 * 1024; //bytes
+private static final int SINGLE_READ_CHARS = 1024;
+
+private int chunkSizeBytes = 0; // the size in bytes of chunk (so far)
+private int charsRead = 0; // number of chars read in the most recent read operation
+private boolean whitespace = false;
+private char[] tempChunkBuf;
+private StringBuilder chunkText;
+private boolean endOfContent = false;
+private final Reader reader;
+
+/**
+* Create a Chunker that will chunk the content of the given Reader.
+*
+* @param reader The content to chunk.
+*/
+Chunker(Reader reader) {
+this.reader = reader;
+}
+
+@Override
+public Iterator<Chunk> iterator() {
+return this;
+}
+
+/**
+* Are there any more chunks available from this chunker?
+*
+*
+* @return true if there are more chunks available.
+*/
+@Override
+public boolean hasNext() {
+return endOfContent == false;
+}
+
+@Override
+public Chunk next() {
+if (hasNext()) {
+chunkText = new StringBuilder();
+tempChunkBuf = new char[SINGLE_READ_CHARS];
+chunkSizeBytes = 0;
+//read chars up to initial chunk size
+while (chunkSizeBytes < INITIAL_CHUNK_SIZE && endOfContent == false) {
+try {
+charsRead = reader.read(tempChunkBuf, 0, SINGLE_READ_CHARS);
+} catch (IOException ex) {
+throw new RuntimeException("IOException while attempting to read chunk.", ex);
+}
+if (-1 == charsRead) {
+//this is the last chunk
+endOfContent = true;
+} else {
+String chunkSegment = new String(tempChunkBuf, 0, charsRead);
+chunkSizeBytes += Utf8.encodedLength(chunkSegment);
+chunkText.append(chunkSegment);
+}
+
+}
+if (false == endOfContent) {
+endOfContent = readChunkUntilWhiteSpace();
+}
+return new Chunk(sanitizeToUTF8(chunkText), chunkSizeBytes);
+} else {
+throw new NoSuchElementException("There are no more chunks.");
+}
+}
+
+
+private boolean readChunkUntilWhiteSpace() {
+charsRead = 0;
+whitespace = false;
+//if we haven't reached the end of the file,
+//try to read char-by-char until whitespace to not break words
+while ((chunkSizeBytes < INITIAL_CHUNK_SIZE)
+&& (false == whitespace)) {
+try {
+charsRead = reader.read(tempChunkBuf, 0, 1);
+} catch (IOException ex) {
+throw new RuntimeException("IOException while attempting to read chunk until whitespace.", ex);
+}
+if (-1 == charsRead) {
+//this is the last chunk
+return true;
+} else {
+whitespace = Character.isWhitespace(tempChunkBuf[0]);
+String chunkSegment = new String(tempChunkBuf, 0, 1);
+chunkSizeBytes += Utf8.encodedLength(chunkSegment);
+chunkText.append(chunkSegment);
+}
+}
+return false;
+}
+
+/**
+* Sanitize the given StringBuilder by replacing non-UTF-8 characters with
+* caret '^'
+*
+* @param sb the StringBuilder to sanitize
+*
+* //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
+* function?
+*/
+private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
+final int length = sb.length();
+
+// Sanitize by replacing non-UTF-8 characters with caret '^'
+for (int i = 0; i < length; i++) {
+if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
+sb.replace(i, i + 1, "^");
+
+}
+}
+return sb;
+}
+}

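The Chunker introduced above is the "32K chunks" of the commit message: it accumulates roughly 32 KB of UTF-8 encoded bytes per chunk (INITIAL_CHUNK_SIZE, tracked with Guava's Utf8.encodedLength), then reads one character at a time until the next whitespace so words are not split, and sanitizes the buffer before returning it. Because it implements Iterable<Chunk>, the Ingester can drive it with a plain for-each loop, as indexText() does above. A standalone usage sketch, separate from this commit (assumes the same package as the classes above; the StringReader input and the printout are illustrative only):

    import java.io.Reader;
    import java.io.StringReader;

    public class ChunkerDemo {
        public static void main(String[] args) {
            Reader reader = new StringReader("some long run of extracted text ...");
            int i = 0;
            // Chunker is Iterable<Chunk>, so it can drive a for-each loop directly.
            for (Chunk chunk : new Chunker(reader)) {
                // getSize() reports the UTF-8 byte count of the chunk, getText() the sanitized text.
                System.out.println("chunk " + (++i) + ": " + chunk.getSize() + " bytes, "
                        + chunk.getText().length() + " chars");
            }
        }
    }
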
Server.java

@@ -157,7 +157,7 @@ public class Server {
 public String toString() {
 return "num_chunks"; //NON-NLS
 }
-},
+}
 };

 public static final String HL_ANALYZE_CHARS_UNLIMITED = "500000"; //max 1MB in a chunk. use -1 for unlimited, but -1 option may not be supported (not documented)

SolrSearchService.java

@@ -19,21 +19,21 @@
 package org.sleuthkit.autopsy.keywordsearch;

 import java.io.IOException;
-import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.impl.HttpSolrClient;
-import org.openide.util.NbBundle;
 import java.net.InetAddress;
 import java.util.List;
 import java.util.MissingResourceException;
-import org.sleuthkit.autopsy.core.RuntimeProperties;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.impl.HttpSolrClient;
+import org.openide.util.NbBundle;
 import org.openide.util.lookup.ServiceProvider;
 import org.openide.util.lookup.ServiceProviders;
 import org.sleuthkit.autopsy.casemodule.Case;
+import org.sleuthkit.autopsy.core.RuntimeProperties;
+import org.sleuthkit.autopsy.corecomponentinterfaces.AutopsyService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.TskCoreException;
-import org.sleuthkit.autopsy.corecomponentinterfaces.AutopsyService;

 /**
 * An implementation of the KeywordSearchService interface that uses Solr for

@@ -48,6 +48,7 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService
 private static final String BAD_IP_ADDRESS_FORMAT = "ioexception occurred when talking to server"; //NON-NLS
 private static final String SERVER_REFUSED_CONNECTION = "server refused connection"; //NON-NLS
 private static final int IS_REACHABLE_TIMEOUT_MS = 1000;
+private static final String SERVICE_NAME = "Solr Keyword Search Service";

 ArtifactTextExtractor extractor = new ArtifactTextExtractor();

@@ -210,4 +211,9 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService
 * Autopsy service providers may not have case-level resources.
 */
 }
+
+@Override
+public String getServiceName() {
+return SERVICE_NAME;
+}
 }

StringsTextExtractor.java

@@ -25,6 +25,7 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;

@@ -37,6 +38,8 @@ import org.sleuthkit.datamodel.TskException;
 */
 class StringsTextExtractor extends FileTextExtractor {

+static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
+
 /**
 * Options for this extractor
 */

@@ -92,7 +95,12 @@ class StringsTextExtractor extends FileTextExtractor {
 }

 @Override
-boolean isDisabled() {
+public void logWarning(final String msg, Exception ex) {
+logger.log(Level.WARNING, msg, ex); //NON-NLS }
+}
+
+@Override
+public boolean isDisabled() {
 boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
 boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));

@@ -100,11 +108,11 @@ class StringsTextExtractor extends FileTextExtractor {
 }

 @Override
-InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
+public InputStreamReader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
+InputStream stringStream = getInputStream(sourceFile);
 return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
 }

-@Override
 InputStream getInputStream(AbstractFile sourceFile) {
 //check which extract stream to use
 if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {

TextExtractor.java

@@ -18,10 +18,7 @@
 */
 package org.sleuthkit.autopsy.keywordsearch;

-import java.io.InputStream;
 import java.io.Reader;
-import java.util.logging.Level;
-import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.SleuthkitVisitableItem;

 /**

@@ -31,9 +28,8 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 * @param <TextSource> The subtype of SleuthkitVisitableItem an implementation
 * is able to process.
 */
-abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
+interface TextExtractor< TextSource extends SleuthkitVisitableItem> {

-static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());

 /**
 * Is this extractor configured such that no extraction will/should be done?

@@ -48,18 +44,8 @@ abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
 * @param msg
 * @param ex
 */
-void logWarning(String msg, Exception ex) {
-logger.log(Level.WARNING, msg, ex); //NON-NLS }
-}
+abstract void logWarning(String msg, Exception ex);

-/**
-* Get an input stream over the content of the given source.
-*
-* @param source
-*
-* @return
-*/
-abstract InputStream getInputStream(TextSource source);

 /**
 * Get a reader that over the text extracted from the given source.

@@ -71,7 +57,7 @@ abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
 *
 * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
 */
-abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;
+abstract Reader getReader(TextSource source) throws Ingester.IngesterException;

 /**
 * Get the 'object' id of the given source.

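TextExtractor is now an interface, so each concrete extractor supplies its own Logger, its own stream handling, and public implementations of the methods visible in these hunks. A hypothetical skeleton showing the shape of the contract (class name is invented; imports as in the extractor files above; the interface may declare members not shown in this diff):

    class ExampleFileTextExtractor implements TextExtractor<AbstractFile> {

        private static final Logger logger = Logger.getLogger(ExampleFileTextExtractor.class.getName());

        @Override
        public boolean isDisabled() {
            return false; // nothing configurable disables this example
        }

        @Override
        public void logWarning(String msg, Exception ex) {
            logger.log(Level.WARNING, msg, ex);
        }

        @Override
        public Reader getReader(AbstractFile source) throws Ingester.IngesterException {
            // the extractor opens its own stream now, instead of receiving one from the caller
            return new InputStreamReader(new ReadContentInputStream(source), StandardCharsets.UTF_8);
        }

        @Override
        public long getID(AbstractFile source) {
            return source.getId();
        }

        @Override
        public String getName(AbstractFile source) {
            return source.getName();
        }
    }
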
TikaTextExtractor.java

@@ -20,7 +20,6 @@ package org.sleuthkit.autopsy.keywordsearch;

 import com.google.common.io.CharSource;
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.Reader;
 import java.util.List;
 import java.util.MissingResourceException;

@@ -36,6 +35,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.openide.util.NbBundle;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;

@@ -46,6 +46,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 */
 class TikaTextExtractor extends FileTextExtractor {

+static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
 private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();

 private static final List<String> TIKA_SUPPORTED_TYPES

@@ -55,13 +56,15 @@ class TikaTextExtractor extends FileTextExtractor {
 .collect(Collectors.toList());

 @Override
-void logWarning(final String msg, Exception ex) {
+public void logWarning(final String msg, Exception ex) {
 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
-super.logWarning(msg, ex);
+logger.log(Level.WARNING, msg, ex); //NON-NLS }
 }

 @Override
-Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
+public Reader getReader(AbstractFile sourceFile) throws IngesterException, MissingResourceException {
+ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
+
 Metadata metadata = new Metadata();
 //Parse the file in a task, a convenient way to have a timeout...
 final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));

@@ -117,13 +120,9 @@ class TikaTextExtractor extends FileTextExtractor {
 return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
 }

-@Override
-InputStream getInputStream(AbstractFile sourceFile1) {
-return new ReadContentInputStream(sourceFile1);
-}
-
 @Override
-boolean isDisabled() {
+public boolean isDisabled() {
 return false;
 }

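As before the refactor, the Tika parse in getReader() is submitted to a single-thread executor so the parse can be bounded in time; only the submit call appears in this hunk. A hedged sketch of how such a Future is typically consumed (the timeout value and the error handling here are illustrative, not taken from this commit):

    try {
        Reader tikaReader = future.get(120, TimeUnit.SECONDS); // give the parse a fixed time budget
        // hand tikaReader back as the extractor's result
    } catch (TimeoutException te) {
        future.cancel(true); // abandon a parse that runs too long
        logWarning("Tika parse timed out for " + sourceFile.getName(), te);
    } catch (InterruptedException | ExecutionException ex) {
        logWarning("Tika parse failed for " + sourceFile.getName(), ex);
    }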