Mirror of https://github.com/overcuriousity/autopsy-flatpak.git, synced 2025-07-12 16:06:15 +00:00
Generalize text extractors further so keyword search supports multiple extractors, ordered from most to least specific.
Integrate the HTML text extractor into keyword search.
commit 27e04f16d1
parent ca87852431
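In essence, the commit replaces the hard-wired choice between the string extractor and the Tika extractor with a small interface plus an ordered extractor list that is scanned for the first match. The following is a condensed sketch of that pattern, distilled from the diff below (simplified, not the complete classes):

    // Extractors are now reusable across files: the file is passed to
    // index() instead of the constructor, and each extractor advertises
    // what it supports.
    interface AbstractFileExtract {
        boolean index(AbstractFile sourceFile) throws Ingester.IngesterException;
        boolean isContentTypeSpecific();
        boolean isSupported(AbstractFile file);
    }

    // Registration order matters: more specific extractors come first,
    // so HTML files are claimed before the generic Tika extractor sees them.
    List<AbstractFileExtract> textExtractors = new ArrayList<AbstractFileExtract>();
    textExtractors.add(new AbstractFileHtmlExtract());
    textExtractors.add(new AbstractFileTikaTextExtract());

    // Pick the first extractor that supports the file; none found is an error.
    AbstractFileExtract fileExtract = null;
    for (AbstractFileExtract fe : textExtractors) {
        if (fe.isSupported(aFile)) {
            fileExtract = fe;
            break;
        }
    }
    if (fileExtract == null) {
        throw new Ingester.IngesterException("No supported file extractor found for file: " + aFile.getName());
    }
    return fileExtract.index(aFile);

Because files are passed to index() per call, each extractor is constructed once at service initialization and reused, rather than instantiated per file as before.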
@@ -41,8 +41,24 @@ interface AbstractFileExtract {
     /**
      * Index the Abstract File
+     * @param sourceFile file to index
      * @return true if indexed successfully, false otherwise
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
-    boolean index() throws Ingester.IngesterException;
+    boolean index(AbstractFile sourceFile) throws Ingester.IngesterException;
 
+    /**
+     * Determines if the extractor works only for specified types
+     * is supportedTypes() or whether is a generic content extractor (such as string extractor)
+     * @return
+     */
+    boolean isContentTypeSpecific();
+
+    /**
+     * Determines if the file content is supported by the extractor,
+     * if isContentTypeSpecific() returns true.
+     * @param file to test if its content should be supported
+     * @return true if the file content is supported, false otherwise
+     */
+    boolean isSupported(AbstractFile file);
+
 }
@@ -46,12 +46,11 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
     private AbstractFile sourceFile;
     private int numChunks = 0;
     private static final String UTF16BOM = "\uFEFF";
     private static final String [] SUPPORTED_EXTENSIONS = {"htm", "html", "xhtml", "css", "js"};
 
-    AbstractFileHtmlExtract(AbstractFile sourceFile) {
-        this.sourceFile = sourceFile;
+    AbstractFileHtmlExtract() {
+        this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
     }
 
     @Override
@@ -65,9 +64,13 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
     }
 
     @Override
-    public boolean index() throws IngesterException {
+    public boolean index(AbstractFile sourceFile) throws IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
+
         boolean success = false;
         Reader reader = null;
 
         final InputStream stream = new ReadContentInputStream(sourceFile);
 
         try {
@@ -173,4 +176,20 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
         return success;
     }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return true;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        String fileNameLower = file.getName().toLowerCase();
+        for (int i = 0; i< SUPPORTED_EXTENSIONS.length; ++i) {
+            if (fileNameLower.endsWith(SUPPORTED_EXTENSIONS[i])) {
+                return true;
+            }
+        }
+        return false;
+    }
+
 }
@@ -40,24 +40,27 @@ class AbstractFileStringExtract implements AbstractFileExtract {
     private int numChunks;
     private static final Logger logger = Logger.getLogger(AbstractFileStringExtract.class.getName());
     static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
-    private AbstractFile aFile;
+    private AbstractFile sourceFile;
     //single static buffer for all extractions. Safe, indexing can only happen in one thread
     private static final byte[] STRING_CHUNK_BUF = new byte[(int) MAX_STRING_CHUNK_SIZE];
     private static final int BOM_LEN = 3;
 
+    //private static final StringExtract se = new StringExtract();
+
     static {
         //prepend UTF-8 BOM to start of the buffer
         STRING_CHUNK_BUF[0] = (byte) 0xEF;
         STRING_CHUNK_BUF[1] = (byte) 0xBB;
         STRING_CHUNK_BUF[2] = (byte) 0xBF;
 
+        //se.init();
+
+
     }
 
-    public AbstractFileStringExtract(AbstractFile aFile) {
-        this.aFile = aFile;
-        numChunks = 0; //unknown until indexing is done
+    public AbstractFileStringExtract() {
+        this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
     }
 
     @Override
@@ -67,15 +70,17 @@ class AbstractFileStringExtract implements AbstractFileExtract {
 
     @Override
     public AbstractFile getSourceFile() {
-        return aFile;
+        return sourceFile;
     }
 
     @Override
-    public boolean index() throws IngesterException {
+    public boolean index(AbstractFile sourceFile) throws IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
         boolean success = false;
 
 
         //construct stream that extracts text as we read it
-        final InputStream stringStream = new AbstractFileStringStream(aFile, ByteContentStream.Encoding.UTF8);
+        final InputStream stringStream = new AbstractFileStringStream(sourceFile, ByteContentStream.Encoding.UTF8);
 
         try {
             success = true;
@@ -93,7 +98,7 @@ class AbstractFileStringExtract implements AbstractFileExtract {
             ++this.numChunks;
         } catch (IngesterException ingEx) {
             success = false;
-            logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ingEx);
+            logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);
             throw ingEx; //need to rethrow/return to signal error and move on
         }
 
@@ -109,19 +114,31 @@ class AbstractFileStringExtract implements AbstractFileExtract {
             ingester.ingest(this);
 
         } catch (IOException ex) {
-            logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + aFile.getName(), ex);
+            logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex);
             success = false;
         } finally {
             try {
                 stringStream.close();
             } catch (IOException ex) {
-                logger.log(Level.WARNING, "Error closing input stream stream, file: " + aFile.getName(), ex);
+                logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex);
             }
         }
 
 
        return success;
    }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return false;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        return true;
+    }
+
+
+
+
 }
@@ -42,11 +42,12 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 
 /**
  * Extractor of text from TIKA supported AbstractFile content. Extracted text is
- * divided into chunks and indexed with Solr.
- * Protects against Tika parser hangs (for unexpected/corrupt content) using a timeout mechanism.
- * If Tika extraction succeeds, chunks are indexed with Solr.
+ * divided into chunks and indexed with Solr. Protects against Tika parser hangs
+ * (for unexpected/corrupt content) using a timeout mechanism. If Tika
+ * extraction succeeds, chunks are indexed with Solr.
  *
- * This Tika extraction/chunking utility is useful for large files of Tika parsers-supported content type.
+ * This Tika extraction/chunking utility is useful for large files of Tika
+ * parsers-supported content type.
  *
  */
 public class AbstractFileTikaTextExtract implements AbstractFileExtract {
@@ -58,19 +59,24 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
     private static final int SINGLE_READ_CHARS = 1024;
     private static final int EXTRA_CHARS = 128; //for whitespace
     private static final char[] TEXT_CHUNK_BUF = new char[MAX_EXTR_TEXT_CHARS];
-    private static final Tika tika = new Tika();
+    private Tika tika;
     private KeywordSearchIngestService service;
-    private Ingester ingester;
-    private AbstractFile sourceFile;
+    private static Ingester ingester;
+    private AbstractFile sourceFile; //currently processed file
     private int numChunks = 0;
     private static final String UTF16BOM = "\uFEFF";
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
     // TODO: use a more robust method than checking file extension
     // supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
     static final String[] SUPPORTED_EXTENSIONS = {"tar", "jar", "zip", "gzip", "bzip2",
         "gz", "tgz", "odf", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt", "log", "manifest",
         "bmp", "gif", "png", "jpeg", "jpg", "tiff", "mp3", "aiff", "au", "midi", "wav",
         "pst", "xml", "class", "dwg", "eml", "emlx", "mbox", "mht"};
 
-    AbstractFileTikaTextExtract(AbstractFile sourceFile) {
-        this.sourceFile = sourceFile;
+    AbstractFileTikaTextExtract() {
+        this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
+        tika = new Tika();
+        //tika.setMaxStringLength(MAX_EXTR_TEXT_CHARS); //for getting back string only
     }
 
@@ -85,7 +91,10 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
     }
 
     @Override
-    public boolean index() throws Ingester.IngesterException {
+    public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
+
         boolean success = false;
         Reader reader = null;
 
@@ -94,30 +103,30 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
         try {
             Metadata meta = new Metadata();
             /* Tika parse request with timeout -- disabled for now
-            ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
-            final Future<?> future = tikaParseExecutor.submit(parseTask);
-            try {
-                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
-            } catch (TimeoutException te) {
-                final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
-                logger.log(Level.WARNING, msg);
-                throw new IngesterException(msg);
-            }
-            catch (Exception ex) {
-                final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
-                logger.log(Level.WARNING, msg, ex);
-                throw new IngesterException(msg);
-            }
-
-            reader = parseTask.getReader();
-            */
+            ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
+            final Future<?> future = tikaParseExecutor.submit(parseTask);
+            try {
+                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            } catch (TimeoutException te) {
+                final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
+                logger.log(Level.WARNING, msg);
+                throw new IngesterException(msg);
+            }
+            catch (Exception ex) {
+                final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
+                logger.log(Level.WARNING, msg, ex);
+                throw new IngesterException(msg);
+            }
+
+            reader = parseTask.getReader();
+            */
             try {
                 reader = tika.parse(stream, meta);
             } catch (IOException ex) {
                 logger.log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
                 reader = null;
             }
 
 
             if (reader == null) {
                 //likely due to exception in parse()
                 logger.log(Level.WARNING, "No reader available from Tika parse");
@@ -230,8 +239,25 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
         return success;
     }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return true;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        String fileNameLower = file.getName().toLowerCase();
+        for (int i = 0; i < SUPPORTED_EXTENSIONS.length; ++i) {
+            if (fileNameLower.endsWith(SUPPORTED_EXTENSIONS[i])) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     /**
-     * Runnable and timeable task that calls tika to parse the content using streaming
+     * Runnable and timeable task that calls tika to parse the content using
+     * streaming
      */
     private static class ParseRequestTask implements Runnable {
 
@@ -65,12 +65,6 @@ public class Ingester {
     private final ExecutorService upRequestExecutor = Executors.newSingleThreadExecutor();
     private final Server solrServer = KeywordSearch.getServer();
     private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV();
-    // TODO: use a more robust method than checking file extension
-    // supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
-    static final String[] ingestibleExtensions = {"tar", "jar", "zip", "gzip", "bzip2",
-        "gz", "tgz", "odf", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt", "log", "manifest",
-        "bmp", "gif", "png", "jpeg", "jpg", "tiff", "mp3", "aiff", "au", "midi", "wav",
-        "pst", "xml", "class", "dwg", "eml", "emlx", "mbox", "mht"};
 
 
     private static Ingester instance;
@@ -452,29 +446,4 @@ public class Ingester {
         }
     }
 
-    /**
-     * Determine if the file content is ingestible/indexable by keyword search
-     * Ingestible abstract file is either a directory, or an allocated file with supported extensions.
-     * Note: currently only checks by extension and abstract type, it does not check actual file content.
-     * @param aFile
-     * @return true if it is ingestible, false otherwise
-     */
-    static boolean isIngestible(AbstractFile aFile) {
-        TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
-        if (! aType.equals(TSK_DB_FILES_TYPE_ENUM.FS) ) {
-            return false;
-        }
-
-        FsContent fsContent = (FsContent) aFile;
-
-        boolean isIngestible = false;
-        final String fileName = fsContent.getName();
-        for (final String ext : ingestibleExtensions) {
-            if (fileName.toLowerCase().endsWith(ext)) {
-                isIngestible = true;
-                break;
-            }
-        }
-        return isIngestible;
-    }
 }
@@ -95,6 +95,8 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
     private SleuthkitCase caseHandle = null;
     private boolean skipKnown = true;
     private boolean initialized = false;
+    private List<AbstractFileExtract> textExtractors;
+    private AbstractFileStringExtract stringExtractor;
 
     private enum IngestStatus {
 
@@ -256,9 +258,15 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
 
         this.managerProxy = managerProxy;
 
-        Server solrServer = KeywordSearch.getServer();
+        ingester = Server.getIngester();
+
+        //initialize extractors
+        stringExtractor = new AbstractFileStringExtract();
+        textExtractors = new ArrayList<AbstractFileExtract>();
+        //order matters, more specific extractors first
+        textExtractors.add(new AbstractFileHtmlExtract());
+        textExtractors.add(new AbstractFileTikaTextExtract());
 
-        ingester = solrServer.getIngester();
-
         ingestStatus = new HashMap<Long, IngestStatus>();
 
@@ -505,21 +513,44 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
      *
      * @param aFile file to extract strings from, divide into chunks and
      * index
-     * @param stringsOnly true if use stinrg extraction, false if use Tika
-     * text extractor
+     * @param stringsOnly true if use string extraction, false if to use a
+     * content-type specific text extractor
      * @return true if the file was indexed, false otherwise
      * @throws IngesterException exception thrown if indexing failed
      */
     private boolean extractIndex(AbstractFile aFile, boolean stringsOnly) throws IngesterException {
-        AbstractFileExtract fileExtract;
+        AbstractFileExtract fileExtract = null;
+
         if (stringsOnly) {
-            fileExtract = new AbstractFileStringExtract(aFile);
+            fileExtract = stringExtractor;
         } else {
-            fileExtract = new AbstractFileTikaTextExtract(aFile);
+            //go over available text extractors and pick the first one (most specific one)
+            for (AbstractFileExtract fe : textExtractors) {
+                if (fe.isSupported(aFile)) {
+                    fileExtract = fe;
+                    break;
+                }
+            }
         }
 
+        if (fileExtract == null) {
+            throw new IngesterException("No supported file extractor found for file: " + aFile.getId() + " " + aFile.getName());
+        }
+
+        //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
+
         //divide into chunks and index
-        return fileExtract.index();
+        return fileExtract.index(aFile);
     }
 
+    private boolean isTextExtractSupported(AbstractFile aFile) {
+        for (AbstractFileExtract extractor : textExtractors) {
+            if (extractor.isContentTypeSpecific() == true
+                    && extractor.isSupported(aFile)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     private void indexFile(AbstractFile aFile, boolean indexContent) {
@@ -547,11 +578,10 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
             return;
         }
 
-        boolean ingestibleFile = Ingester.isIngestible(aFile);
-
-        if (fsContent != null && ingestibleFile == true) {
-            //we know it's an allocated fs file (FsContent) with supported content
-            //extract text with Tika, divide into chunks and index with Solr
+        boolean extractTextSupported = isTextExtractSupported(aFile);
+        if (fsContent != null && extractTextSupported) {
+            //we know it's an allocated FS file (since it's FsContent)
+            //extract text with one of the extractors, divide into chunks and index with Solr
             try {
                 //logger.log(Level.INFO, "indexing: " + fsContent.getName());
                 if (!extractIndex(aFile, false)) {
@@ -564,7 +594,6 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
 
                 } else {
                     ingestStatus.put(aFile.getId(), IngestStatus.INGESTED);
-
                 }
 
             } catch (IngesterException e) {
@@ -715,13 +744,13 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
 
         for (String termResult : queryResult.keySet()) {
             List<ContentHit> queryTermResults = queryResult.get(termResult);
 
 
             //translate to list of IDs that we keep track of
             List<Long> queryTermResultsIDs = new ArrayList<Long>();
             for (ContentHit ch : queryTermResults) {
                 queryTermResultsIDs.add(ch.getId());
             }
 
 
             Keyword termResultK = new Keyword(termResult, !isRegex);
             List<Long> curTermResults = currentResults.get(termResultK);
             if (curTermResults == null) {
@@ -938,7 +967,6 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
         }
     }
 
-
     /**
      * Set the skip known files setting on the service
     *