Generalize text extractors so that keyword search supports multiple extractors, ordered from most to least specific.
Integrate the HTML text extractor into keyword search.
commit 27e04f16d1 (parent ca87852431)
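In effect, the commit replaces per-file extractor construction with a fixed, ordered list of reusable extractor instances that are probed in order. A condensed sketch of the pattern (class and method names come from the diff below; the wrapper class and the omission of Solr/ingest plumbing are illustrative assumptions, not the actual implementation — the real loop appears in KeywordSearchIngestService.extractIndex() further down):

    // Condensed sketch of the selection pattern this commit introduces.
    // Assumes the same package as AbstractFileExtract and friends; the
    // wrapper class is hypothetical, error handling and Solr plumbing omitted.
    import java.util.ArrayList;
    import java.util.List;

    class ExtractorSelectionSketch {

        private final List<AbstractFileExtract> textExtractors = new ArrayList<AbstractFileExtract>();

        ExtractorSelectionSketch() {
            // Order matters: more specific extractors come first.
            textExtractors.add(new AbstractFileHtmlExtract());     // htm/html/xhtml/css/js only
            textExtractors.add(new AbstractFileTikaTextExtract()); // broad Tika-supported set
        }

        boolean extractAndIndex(AbstractFile aFile) throws Ingester.IngesterException {
            // Probe in order and take the first (most specific) extractor that
            // claims the file; extractors are reused, the file is passed per call.
            for (AbstractFileExtract fe : textExtractors) {
                if (fe.isSupported(aFile)) {
                    return fe.index(aFile);
                }
            }
            throw new Ingester.IngesterException(
                    "No supported text extractor for file: " + aFile.getName());
        }
    }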
AbstractFileExtract.java:

@@ -41,8 +41,24 @@ interface AbstractFileExtract {
     /**
      * Index the Abstract File
+     * @param sourceFile file to index
      * @return true if indexed successfully, false otherwise
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
-    boolean index() throws Ingester.IngesterException;
+    boolean index(AbstractFile sourceFile) throws Ingester.IngesterException;
 
+    /**
+     * Determines if the extractor works only for specified types
+     * is supportedTypes() or whether is a generic content extractor (such as string extractor)
+     * @return
+     */
+    boolean isContentTypeSpecific();
+
+    /**
+     * Determines if the file content is supported by the extractor,
+     * if isContentTypeSpecific() returns true.
+     * @param file to test if its content should be supported
+     * @return true if the file content is supported, false otherwise
+     */
+    boolean isSupported(AbstractFile file);
 }
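To make the revised contract concrete, here is a minimal hypothetical implementer (illustrative only: the real implementations below also track chunking state and expose a getSourceFile() accessor that this hunk does not show):

    // Hypothetical minimal implementer of the revised interface, for
    // illustration only; not part of the commit.
    class PlainTextFileExtract implements AbstractFileExtract {

        private static final String SUPPORTED_EXTENSION = ".txt";

        @Override
        public boolean isContentTypeSpecific() {
            return true; // handles specific extensions, unlike the string extractor
        }

        @Override
        public boolean isSupported(AbstractFile file) {
            return file.getName().toLowerCase().endsWith(SUPPORTED_EXTENSION);
        }

        @Override
        public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
            // a real extractor reads sourceFile, chunks the text, and hands
            // each chunk to the Ingester; omitted in this sketch
            return true;
        }
    }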
AbstractFileHtmlExtract.java:

@@ -46,12 +46,11 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
     private AbstractFile sourceFile;
     private int numChunks = 0;
     private static final String UTF16BOM = "\uFEFF";
+    private static final String [] SUPPORTED_EXTENSIONS = {"htm", "html", "xhtml", "css", "js"};
 
-    AbstractFileHtmlExtract(AbstractFile sourceFile) {
-        this.sourceFile = sourceFile;
+    AbstractFileHtmlExtract() {
         this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
     }
 
     @Override
@@ -65,9 +64,13 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
     }
 
     @Override
-    public boolean index() throws IngesterException {
+    public boolean index(AbstractFile sourceFile) throws IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
+
         boolean success = false;
         Reader reader = null;
 
         final InputStream stream = new ReadContentInputStream(sourceFile);
 
         try {
@@ -173,4 +176,20 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
         return success;
     }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return true;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        String fileNameLower = file.getName().toLowerCase();
+        for (int i = 0; i< SUPPORTED_EXTENSIONS.length; ++i) {
+            if (fileNameLower.endsWith(SUPPORTED_EXTENSIONS[i])) {
+                return true;
+            }
+        }
+        return false;
+    }
 }
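One design note on the extension-based isSupported() checks in the HTML extractor above and the Tika extractor further below: the match is a bare endsWith() against extension strings that carry no leading dot, so a name like "report-html" would also match the "html" extension. A stricter hypothetical variant (not part of the commit) would anchor the suffix on the dot:

    // Hypothetical stricter extension check (not in this commit): anchoring on
    // the dot means "report-html" no longer matches the "html" extension.
    static boolean hasSupportedExtension(String fileName, String[] supportedExtensions) {
        final String nameLower = fileName.toLowerCase();
        for (String ext : supportedExtensions) {
            if (nameLower.endsWith("." + ext)) {
                return true;
            }
        }
        return false;
    }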
AbstractFileStringExtract.java:

@@ -40,24 +40,27 @@ class AbstractFileStringExtract implements AbstractFileExtract {
     private int numChunks;
     private static final Logger logger = Logger.getLogger(AbstractFileStringExtract.class.getName());
     static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
-    private AbstractFile aFile;
+    private AbstractFile sourceFile;
     //single static buffer for all extractions. Safe, indexing can only happen in one thread
     private static final byte[] STRING_CHUNK_BUF = new byte[(int) MAX_STRING_CHUNK_SIZE];
     private static final int BOM_LEN = 3;
+    //private static final StringExtract se = new StringExtract();
 
     static {
         //prepend UTF-8 BOM to start of the buffer
         STRING_CHUNK_BUF[0] = (byte) 0xEF;
         STRING_CHUNK_BUF[1] = (byte) 0xBB;
         STRING_CHUNK_BUF[2] = (byte) 0xBF;
+
+        //se.init();
     }
 
-    public AbstractFileStringExtract(AbstractFile aFile) {
-        this.aFile = aFile;
-        numChunks = 0; //unknown until indexing is done
+    public AbstractFileStringExtract() {
         this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
     }
 
     @Override
@@ -67,15 +70,17 @@ class AbstractFileStringExtract implements AbstractFileExtract {
 
     @Override
     public AbstractFile getSourceFile() {
-        return aFile;
+        return sourceFile;
     }
 
     @Override
-    public boolean index() throws IngesterException {
+    public boolean index(AbstractFile sourceFile) throws IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
         boolean success = false;
 
         //construct stream that extracts text as we read it
-        final InputStream stringStream = new AbstractFileStringStream(aFile, ByteContentStream.Encoding.UTF8);
+        final InputStream stringStream = new AbstractFileStringStream(sourceFile, ByteContentStream.Encoding.UTF8);
 
         try {
             success = true;
@@ -93,7 +98,7 @@ class AbstractFileStringExtract implements AbstractFileExtract {
                 ++this.numChunks;
             } catch (IngesterException ingEx) {
                 success = false;
-                logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ingEx);
+                logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);
                 throw ingEx; //need to rethrow/return to signal error and move on
             }
 
@@ -109,19 +114,31 @@ class AbstractFileStringExtract implements AbstractFileExtract {
             ingester.ingest(this);
 
         } catch (IOException ex) {
-            logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + aFile.getName(), ex);
+            logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex);
             success = false;
         } finally {
             try {
                 stringStream.close();
             } catch (IOException ex) {
-                logger.log(Level.WARNING, "Error closing input stream stream, file: " + aFile.getName(), ex);
+                logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex);
             }
         }
 
         return success;
     }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return false;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        return true;
+    }
 }
AbstractFileTikaTextExtract.java:

@@ -42,11 +42,12 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 
 /**
  * Extractor of text from TIKA supported AbstractFile content. Extracted text is
- * divided into chunks and indexed with Solr.
- * Protects against Tika parser hangs (for unexpected/corrupt content) using a timeout mechanism.
- * If Tika extraction succeeds, chunks are indexed with Solr.
+ * divided into chunks and indexed with Solr. Protects against Tika parser hangs
+ * (for unexpected/corrupt content) using a timeout mechanism. If Tika
+ * extraction succeeds, chunks are indexed with Solr.
  *
- * This Tika extraction/chunking utility is useful for large files of Tika parsers-supported content type.
+ * This Tika extraction/chunking utility is useful for large files of Tika
+ * parsers-supported content type.
  *
  */
 public class AbstractFileTikaTextExtract implements AbstractFileExtract {
@@ -58,19 +59,24 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
     private static final int SINGLE_READ_CHARS = 1024;
     private static final int EXTRA_CHARS = 128; //for whitespace
     private static final char[] TEXT_CHUNK_BUF = new char[MAX_EXTR_TEXT_CHARS];
-    private static final Tika tika = new Tika();
+    private Tika tika;
     private KeywordSearchIngestService service;
-    private Ingester ingester;
-    private AbstractFile sourceFile;
+    private static Ingester ingester;
+    private AbstractFile sourceFile; //currently processed file
     private int numChunks = 0;
     private static final String UTF16BOM = "\uFEFF";
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
+    // TODO: use a more robust method than checking file extension
+    // supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
+    static final String[] SUPPORTED_EXTENSIONS = {"tar", "jar", "zip", "gzip", "bzip2",
+        "gz", "tgz", "odf", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt", "log", "manifest",
+        "bmp", "gif", "png", "jpeg", "jpg", "tiff", "mp3", "aiff", "au", "midi", "wav",
+        "pst", "xml", "class", "dwg", "eml", "emlx", "mbox", "mht"};
 
-    AbstractFileTikaTextExtract(AbstractFile sourceFile) {
-        this.sourceFile = sourceFile;
+    AbstractFileTikaTextExtract() {
         this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
+        tika = new Tika();
         //tika.setMaxStringLength(MAX_EXTR_TEXT_CHARS); //for getting back string only
     }
 
@@ -85,7 +91,10 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
     }
 
     @Override
-    public boolean index() throws Ingester.IngesterException {
+    public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
+
         boolean success = false;
         Reader reader = null;
 
@@ -94,30 +103,30 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
         try {
             Metadata meta = new Metadata();
             /* Tika parse request with timeout -- disabled for now
             ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
             final Future<?> future = tikaParseExecutor.submit(parseTask);
-            try {
-                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
-            } catch (TimeoutException te) {
-                final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
-                logger.log(Level.WARNING, msg);
-                throw new IngesterException(msg);
-            }
-            catch (Exception ex) {
-                final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
-                logger.log(Level.WARNING, msg, ex);
-                throw new IngesterException(msg);
-            }
-
-            reader = parseTask.getReader();
-            */
             try {
+                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            } catch (TimeoutException te) {
+                final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
+                logger.log(Level.WARNING, msg);
+                throw new IngesterException(msg);
+            }
+            catch (Exception ex) {
+                final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
+                logger.log(Level.WARNING, msg, ex);
+                throw new IngesterException(msg);
+            }
+
+            reader = parseTask.getReader();
+            */
+            try {
                 reader = tika.parse(stream, meta);
             } catch (IOException ex) {
                 logger.log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
                 reader = null;
             }
 
             if (reader == null) {
                 //likely due to exception in parse()
                 logger.log(Level.WARNING, "No reader available from Tika parse");
@@ -230,8 +239,25 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
         return success;
     }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return true;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        String fileNameLower = file.getName().toLowerCase();
+        for (int i = 0; i < SUPPORTED_EXTENSIONS.length; ++i) {
+            if (fileNameLower.endsWith(SUPPORTED_EXTENSIONS[i])) {
+                return true;
+            }
+        }
+        return false;
+    }
 
     /**
-     * Runnable and timeable task that calls tika to parse the content using streaming
+     * Runnable and timeable task that calls tika to parse the content using
+     * streaming
      */
     private static class ParseRequestTask implements Runnable {
 
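The timeout protection described in the class javadoc is currently disabled (the ParseRequestTask block above is commented out), but the underlying mechanism is the standard Future-with-timeout idiom from java.util.concurrent. A self-contained sketch of that idiom, with a generic Callable standing in for the Tika parse task:

    // Self-contained sketch of the Future-based timeout idiom used by the
    // disabled ParseRequestTask block above; the task and exception type here
    // are placeholders, not the project's real ones.
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.TimeoutException;

    class ParseTimeoutSketch {

        private final ExecutorService executor = Executors.newSingleThreadExecutor();

        // Run a potentially hanging task, but give up after timeoutSeconds.
        <T> T runWithTimeout(Callable<T> task, long timeoutSeconds) throws Exception {
            final Future<T> future = executor.submit(task);
            try {
                return future.get(timeoutSeconds, TimeUnit.SECONDS);
            } catch (TimeoutException te) {
                future.cancel(true); // interrupt the hung worker thread
                throw new Exception("task timed out after " + timeoutSeconds + " s", te);
            }
        }
    }

In the disabled block, the submitted task is the ParseRequestTask and the timeout scales with file size via Ingester.getTimeout(sourceFile.getSize()).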
Ingester.java:

@@ -65,12 +65,6 @@ public class Ingester {
     private final ExecutorService upRequestExecutor = Executors.newSingleThreadExecutor();
     private final Server solrServer = KeywordSearch.getServer();
     private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV();
-    // TODO: use a more robust method than checking file extension
-    // supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
-    static final String[] ingestibleExtensions = {"tar", "jar", "zip", "gzip", "bzip2",
-        "gz", "tgz", "odf", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt", "log", "manifest",
-        "bmp", "gif", "png", "jpeg", "jpg", "tiff", "mp3", "aiff", "au", "midi", "wav",
-        "pst", "xml", "class", "dwg", "eml", "emlx", "mbox", "mht"};
 
     private static Ingester instance;
@@ -452,29 +446,4 @@ public class Ingester {
         }
     }
 
-    /**
-     * Determine if the file content is ingestible/indexable by keyword search
-     * Ingestible abstract file is either a directory, or an allocated file with supported extensions.
-     * Note: currently only checks by extension and abstract type, it does not check actual file content.
-     * @param aFile
-     * @return true if it is ingestible, false otherwise
-     */
-    static boolean isIngestible(AbstractFile aFile) {
-        TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
-        if (! aType.equals(TSK_DB_FILES_TYPE_ENUM.FS) ) {
-            return false;
-        }
-
-        FsContent fsContent = (FsContent) aFile;
-
-        boolean isIngestible = false;
-        final String fileName = fsContent.getName();
-        for (final String ext : ingestibleExtensions) {
-            if (fileName.toLowerCase().endsWith(ext)) {
-                isIngestible = true;
-                break;
-            }
-        }
-        return isIngestible;
-    }
 }
KeywordSearchIngestService.java:

@@ -95,6 +95,8 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
     private SleuthkitCase caseHandle = null;
     private boolean skipKnown = true;
     private boolean initialized = false;
+    private List<AbstractFileExtract> textExtractors;
+    private AbstractFileStringExtract stringExtractor;
 
     private enum IngestStatus {
 
@@ -256,9 +258,15 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
 
         this.managerProxy = managerProxy;
 
-        Server solrServer = KeywordSearch.getServer();
+        ingester = Server.getIngester();
+
+        //initialize extractors
+        stringExtractor = new AbstractFileStringExtract();
+        textExtractors = new ArrayList<AbstractFileExtract>();
+        //order matters, more specific extractors first
+        textExtractors.add(new AbstractFileHtmlExtract());
+        textExtractors.add(new AbstractFileTikaTextExtract());
 
-        ingester = solrServer.getIngester();
 
         ingestStatus = new HashMap<Long, IngestStatus>();
 
@@ -505,21 +513,44 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
      *
      * @param aFile file to extract strings from, divide into chunks and
      * index
-     * @param stringsOnly true if use stinrg extraction, false if use Tika
-     * text extractor
+     * @param stringsOnly true if use string extraction, false if to use a
+     * content-type specific text extractor
      * @return true if the file was indexed, false otherwise
+     * @throws IngesterException exception thrown if indexing failed
      */
     private boolean extractIndex(AbstractFile aFile, boolean stringsOnly) throws IngesterException {
-        AbstractFileExtract fileExtract;
+        AbstractFileExtract fileExtract = null;
 
         if (stringsOnly) {
-            fileExtract = new AbstractFileStringExtract(aFile);
+            fileExtract = stringExtractor;
         } else {
-            fileExtract = new AbstractFileTikaTextExtract(aFile);
+            //go over available text extractors and pick the first one (most specific one)
+            for (AbstractFileExtract fe : textExtractors) {
+                if (fe.isSupported(aFile)) {
+                    fileExtract = fe;
+                    break;
+                }
+            }
         }
 
+        if (fileExtract == null) {
+            throw new IngesterException("No supported file extractor found for file: " + aFile.getId() + " " + aFile.getName());
+        }
+
+        //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
+
         //divide into chunks and index
-        return fileExtract.index();
+        return fileExtract.index(aFile);
+    }
+
+    private boolean isTextExtractSupported(AbstractFile aFile) {
+        for (AbstractFileExtract extractor : textExtractors) {
+            if (extractor.isContentTypeSpecific() == true
+                    && extractor.isSupported(aFile)) {
+                return true;
+            }
+        }
+        return false;
     }
 
     private void indexFile(AbstractFile aFile, boolean indexContent) {
@@ -547,11 +578,10 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
             return;
         }
 
-        boolean ingestibleFile = Ingester.isIngestible(aFile);
-
-        if (fsContent != null && ingestibleFile == true) {
-            //we know it's an allocated fs file (FsContent) with supported content
-            //extract text with Tika, divide into chunks and index with Solr
+        boolean extractTextSupported = isTextExtractSupported(aFile);
+        if (fsContent != null && extractTextSupported) {
+            //we know it's an allocated FS file (since it's FsContent)
+            //extract text with one of the extractors, divide into chunks and index with Solr
             try {
                 //logger.log(Level.INFO, "indexing: " + fsContent.getName());
                 if (!extractIndex(aFile, false)) {
@@ -564,7 +594,6 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
 
             } else {
                 ingestStatus.put(aFile.getId(), IngestStatus.INGESTED);
-
             }
 
         } catch (IngesterException e) {
@@ -715,13 +744,13 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
 
         for (String termResult : queryResult.keySet()) {
             List<ContentHit> queryTermResults = queryResult.get(termResult);
 
             //translate to list of IDs that we keep track of
             List<Long> queryTermResultsIDs = new ArrayList<Long>();
             for (ContentHit ch : queryTermResults) {
                 queryTermResultsIDs.add(ch.getId());
             }
 
             Keyword termResultK = new Keyword(termResult, !isRegex);
             List<Long> curTermResults = currentResults.get(termResultK);
             if (curTermResults == null) {
@@ -938,7 +967,6 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
         }
     }
 
-
     /**
      * Set the skip known files setting on the service
      *
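Putting the last two hunks together: indexFile() now asks isTextExtractSupported() before choosing a path. The else branch falls outside this diff's context window, so the sketch below reconstructs the presumable flow (the stringsOnly fallback is inferred from extractIndex()'s parameter, not quoted from the commit):

    // Reconstructed sketch of the decision in indexFile(); the fallback branch
    // is an assumption based on extractIndex()'s stringsOnly parameter.
    private void indexFileSketch(AbstractFile aFile, FsContent fsContent) throws IngesterException {
        if (fsContent != null && isTextExtractSupported(aFile)) {
            // a content-type-specific extractor (HTML first, then Tika) claims
            // the file: extract real text, divide into chunks, index with Solr
            extractIndex(aFile, false);
        } else {
            // presumed fallback: raw string extraction, whose isSupported()
            // accepts any content
            extractIndex(aFile, true);
        }
    }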