Always index meta data of known files (skip content), and 0 byte files

2025-07-11 23:46:15 +00:00 · 2012-06-28 13:34:08 -04:00 · 2012-06-28 13:34:08 -04:00 · ed9dceb502
commit ed9dceb502
parent 8ba8775931
4 changed files with 117 additions and 76 deletions
--- a/Ingest/src/org/sleuthkit/autopsy/ingest/GetAllFilesContentVisitor.java
+++ b/Ingest/src/org/sleuthkit/autopsy/ingest/GetAllFilesContentVisitor.java
@ -71,7 +71,7 @@ class GetAllFilesContentVisitor extends GetFilesContentVisitor {

        StringBuilder queryB = new StringBuilder();
        queryB.append("SELECT * FROM tsk_files WHERE ( (fs_obj_id = ").append(fs.getId());
-        queryB.append(") OR (fs_obj_id = NULL) ) AND (size > 0)");
+        queryB.append(") OR (fs_obj_id = NULL) )");
        queryB.append(" AND ( (meta_type = ").append(TskData.TSK_FS_META_TYPE_ENUM.TSK_FS_META_TYPE_REG.getMetaType());
        queryB.append(") OR (meta_type = ").append(TskData.TSK_FS_META_TYPE_ENUM.TSK_FS_META_TYPE_DIR.getMetaType());
        queryB.append( " AND (name != '.') AND (name != '..')");
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentViewer.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentViewer.java
@ -365,6 +365,8 @@ public class ExtractedContentViewer implements DataContentViewer {
            return false;
        }

+        if (content.getSize() == 0)
+            return false;

        final Server solrServer = KeywordSearch.getServer();

@ -375,8 +377,6 @@ public class ExtractedContentViewer implements DataContentViewer {

        final long contentID = content.getId();

-
-
        try {
            return solrServer.queryIsIndexed(contentID);
        } catch (NoOpenCoreException ex) {
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@ -139,13 +139,15 @@ public class Ingester {
    /**
     * Sends a file to Solr to have its content extracted and added to the
     * index. commit() should be called once you're done ingesting files.
+     * If the file is a directory or ingestContent is set to false, the file name is indexed only.
     * 
-     * @param f File to ingest
+     * @param fsContent File to ingest
+     * @param ingestContent if true, index the file and the content, otherwise index metadata only
     * @throws IngesterException if there was an error processing a specific
     * file, but the Solr server is probably fine.
     */
-    void ingest(FsContent fsContent) throws IngesterException {
-        if (fsContent.isDir() ) {
+    void ingest(FsContent fsContent, boolean ingestContent) throws IngesterException {
+        if (fsContent.isDir() || ingestContent == false ) {
            ingest(new NullContentStream(fsContent), getContentFields(fsContent), 0);
        }
        else {
@ -438,25 +440,20 @@ public class Ingester {
    }

    /**
-     * Determine if the file is ingestible/indexable by keyword search
+     * Determine if the file content is ingestible/indexable by keyword search
     * Ingestible abstract file is either a directory, or an allocated file with supported extensions.
     * Note: currently only checks by extension and abstract type, it does not check actual file content.
     * @param aFile
     * @return true if it is ingestible, false otherwise
     */
    static boolean isIngestible(AbstractFile aFile) {
-        boolean isIngestible = false;
-        
        TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
-        if (aType.equals(TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
-                || aType.equals(TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
-                return isIngestible;
+        if (! aType.equals(TSK_DB_FILES_TYPE_ENUM.FS) )
+                return false;
        
        FsContent fsContent = (FsContent) aFile;
-        if (fsContent.isDir())
-            //we index dir name, not content
-            return true;
        
+        boolean isIngestible = false;
        final String fileName = fsContent.getName();
        for (final String ext : ingestibleExtensions) {
            if (fileName.toLowerCase().endsWith(ext)) {
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java
@ -38,6 +38,7 @@ import org.apache.solr.client.solrj.SolrServerException;
 import org.netbeans.api.progress.ProgressHandle;
 import org.netbeans.api.progress.ProgressHandleFactory;
 import org.openide.util.Cancellable;
+import org.openide.util.Exceptions;
 import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.ingest.IngestManager;
 import org.sleuthkit.autopsy.ingest.IngestManagerProxy;
@ -55,14 +56,13 @@ import org.sleuthkit.datamodel.SleuthkitCase;
 import org.sleuthkit.datamodel.TskData;

 /**
- * An ingest service on a file level
- * Performs indexing of allocated and Solr supported files,
- * string extraction and indexing of unallocated and not Solr supported files
- * Index commit is done periodically (determined by user set ingest update interval)
- * Runs a periodic keyword / regular expression search on currently configured lists for ingest
- * and writes results to blackboard
+ * An ingest service on a file level Performs indexing of allocated and Solr
+ * supported files, string extraction and indexing of unallocated and not Solr
+ * supported files Index commit is done periodically (determined by user set
+ * ingest update interval) Runs a periodic keyword / regular expression search
+ * on currently configured lists for ingest and writes results to blackboard
 * Reports interesting events to Inbox and to viewers
- * 
+ *
 * Registered as a service in layer.xml
 */
 public final class KeywordSearchIngestService implements IngestServiceAbstractFile {
@ -92,19 +92,20 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
    private volatile int messageID = 0;
    private boolean processedFiles;
    private volatile boolean finalSearcherDone = true;
-    private final String hashDBServiceName = "Hash Lookup";
+    private final String hashDBServiceName = "Hash Lookup"; //NOTE this needs to match the HashDB service getName()
    private SleuthkitCase caseHandle = null;
    private boolean skipKnown = true;
    boolean initialized = false;

    private enum IngestStatus {

-        INGESTED, EXTRACTED_INGESTED, SKIPPED,
+        INGESTED, EXTRACTED_INGESTED, SKIPPED, INGESTED_META
    };
    private Map<Long, IngestStatus> ingestStatus;

    /**
     * Returns singleton instance of the service, creates one if needed
+     *
     * @return instance of the service
     */
    public static synchronized KeywordSearchIngestService getDefault() {
@ -115,10 +116,12 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
    }

    /**
-     * Starts processing of every file provided by IngestManager.  
-     * Checks if it is time to commit and run search
+     * Starts processing of every file provided by IngestManager. Checks if it
+     * is time to commit and run search
+     *
     * @param abstractFile file/unallocated file/directory to process
-     * @return ProcessResult.OK in most cases and ERROR only if error in the pipeline, otherwise does not advice to stop the pipeline
+     * @return ProcessResult.OK in most cases and ERROR only if error in the
+     * pipeline, otherwise does not advise to stop the pipeline
     */
    @Override
    public ProcessResult process(AbstractFile abstractFile) {
@ -133,8 +136,12 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
        IngestServiceAbstractFile.ProcessResult hashDBResult = managerProxy.getAbstractFileServiceResult(hashDBServiceName);
        //logger.log(Level.INFO, "hashdb result: " + hashDBResult + "file: " + AbstractFile.getName());
        if (hashDBResult == IngestServiceAbstractFile.ProcessResult.COND_STOP && skipKnown) {
+            //index meta-data only
+            indexer.indexFile(abstractFile, false);
            return ProcessResult.OK;
        } else if (hashDBResult == IngestServiceAbstractFile.ProcessResult.ERROR) {
+            //index meta-data only
+            indexer.indexFile(abstractFile, false);
            //notify depending service that keyword search (would) encountered error for this file
            return ProcessResult.ERROR;
        }
@ -145,7 +152,8 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi

        checkRunCommitSearch();

-        indexer.indexFile(abstractFile);
+        //index the file and content (if the content is supported)
+        indexer.indexFile(abstractFile, true);
        return ProcessResult.OK;

    }
@ -196,8 +204,7 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
    }

    /**
-     * Handle stop event (ingest interrupted)
-     * Cleanup resources, threads, timers
+     * Handle stop event (ingest interrupted) Cleanup resources, threads, timers
     */
    @Override
    public void stop() {
@ -234,9 +241,10 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
    }

    /**
-     * Initializes the service for new ingest run
-     * Sets up threads, timers, retrieves settings, keyword lists to run on
-     * @param managerProxy 
+     * Initializes the service for new ingest run Sets up threads, timers,
+     * retrieves settings, keyword lists to run on
+     *
+     * @param managerProxy
     */
    @Override
    public void init(IngestManagerProxy managerProxy) {
@ -320,9 +328,11 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
    }

    /**
-     * The services maintains background threads, return true if background threads are running
-     * or there are pending tasks to be run in the future, such as the final search post-ingest completion
-     * @return 
+     * The service maintains background threads, return true if background
+     * threads are running or there are pending tasks to be run in the future,
+     * such as the final search post-ingest completion
+     *
+     * @return
     */
    @Override
    public boolean hasBackgroundJobsRunning() {
@ -353,6 +363,7 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
     */
    private void postIndexSummary() {
        int indexed = 0;
+        int indexed_meta = 0;
        int indexed_extr = 0;
        int skipped = 0;
        for (IngestStatus s : ingestStatus.values()) {
@ -360,6 +371,9 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
                case INGESTED:
                    ++indexed;
                    break;
+                case INGESTED_META:
+                    ++indexed_meta;
+                    break;
                case EXTRACTED_INGESTED:
                    ++indexed_extr;
                    break;
@ -373,6 +387,7 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi

        StringBuilder msg = new StringBuilder();
        msg.append("Indexed files: ").append(indexed).append("<br />Indexed strings: ").append(indexed_extr);
+        msg.append("<br />Indexed meta-data only: ").append(indexed_meta).append("<br />");
        msg.append("<br />Skipped files: ").append(skipped).append("<br />");
        String indexStats = msg.toString();
        logger.log(Level.INFO, "Keyword Indexing Completed: " + indexStats);
@ -423,8 +438,8 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
    }

    /**
-     * Check if time to commit, if so, run commit.
-     * Then run search if search timer is also set.
+     * Check if time to commit, if so, run commit. Then run search if search
+     * timer is also set.
     */
    void checkRunCommitSearch() {
        if (commitIndex) {
@ -446,8 +461,8 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
    }

    /**
-     * CommitTimerAction to run by commitTimer
-     * Sets a flag to indicate we are ready for commit
+     * CommitTimerAction to run by commitTimer Sets a flag to indicate we are
+     * ready for commit
     */
    private class CommitTimerAction implements ActionListener {

@ -461,8 +476,8 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
    }

    /**
-     * SearchTimerAction to run by searchTimer
-     * Sets a flag to indicate we are ready to search
+     * SearchTimerAction to run by searchTimer Sets a flag to indicate we are
+     * ready to search
     */
    private class SearchTimerAction implements ActionListener {

@ -477,7 +492,7 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi

    /**
     * File indexer, processes and indexes known/allocated files,
-     * unknown/unallocated files and directories accordingly 
+     * unknown/unallocated files and directories accordingly
     */
    private class Indexer {

@ -495,42 +510,70 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
            return indexed;
        }

-        private void indexFile(AbstractFile aFile) {
+        private void indexFile(AbstractFile aFile, boolean indexContent) {
            //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());
-            boolean ingestibleFile = Ingester.isIngestible(aFile);

-            final long size = aFile.getSize();
-            //limit size of entire file, do not limit strings
-            if (size == 0 || (ingestibleFile && size > MAX_INDEX_SIZE)) {
-                ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
+            FsContent fsContent = null;
+            //check if alloc fs file or dir
+            TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
+            if (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.FS)) {
+                fsContent = (FsContent) aFile;
+            }
+
+            //if alloc fs file and not index content, or a dir, index meta data only
+            if (fsContent != null
+                    && (indexContent == false || fsContent.isDir())) {
+                try {
+                    ingester.ingest(fsContent, false); //meta-data only
+                    ingestStatus.put(aFile.getId(), IngestStatus.INGESTED_META);
+                } catch (IngesterException ex) {
+                    ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
+                    logger.log(Level.WARNING, "Unable to index meta-data for fsContent: " + fsContent.getId(), ex);
+                }
+
                return;
            }

-            if (ingestibleFile == true) {
-                //we know it's an allocated file or dir (FsContent)
-                FsContent fileDir = (FsContent) aFile;
+            boolean ingestibleFile = Ingester.isIngestible(aFile);
+
+            final long size = aFile.getSize();
+            //if fs file, limit size of entire file, do not limit strings
+            if (fsContent != null && (size == 0 || (ingestibleFile && size > MAX_INDEX_SIZE))) {
+                //if fs file, index meta only, otherwise if unalloc, skip
+                try {
+                    ingester.ingest(fsContent, false); //meta-data only
+                    ingestStatus.put(aFile.getId(), IngestStatus.INGESTED_META);
+                } catch (IngesterException ex) {
+                    ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
+                    logger.log(Level.WARNING, "Unable to index meta-data for fsContent: " + fsContent.getId(), ex);
+                }
+
+                return;
+            }
+
+            if (fsContent != null && ingestibleFile == true) {
+                //we know it's an allocated fs file (FsContent) with supported content 
                try {
                    //logger.log(Level.INFO, "indexing: " + fsContent.getName());
-                    ingester.ingest(fileDir);
-                    ingestStatus.put(fileDir.getId(), IngestStatus.INGESTED);
+                    ingester.ingest(fsContent, true);
+                    ingestStatus.put(fsContent.getId(), IngestStatus.INGESTED);
                } catch (IngesterException e) {
-                    ingestStatus.put(fileDir.getId(), IngestStatus.SKIPPED);
-                    //try to extract strings if not a dir
-                    if (fileDir.isFile() == true) {
-                        processNonIngestible(fileDir);
+                    ingestStatus.put(fsContent.getId(), IngestStatus.SKIPPED);
+                    //try to extract strings, if a file
+                    if (fsContent.isFile() == true) {
+                        processNonIngestible(fsContent);
                    }

                } catch (Exception e) {
-                    ingestStatus.put(fileDir.getId(), IngestStatus.SKIPPED);
-                    //try to extract strings if not a dir
-                    if (fileDir.isFile() == true) {
-                        processNonIngestible(fileDir);
+                    ingestStatus.put(fsContent.getId(), IngestStatus.SKIPPED);
+                    //try to extract strings if a file
+                    if (fsContent.isFile() == true) {
+                        processNonIngestible(fsContent);
                    }
                }
            } else {
-                //unallocated or unsupported type by Solr
+                //unallocated file or unsupported content type by Solr
                processNonIngestible(aFile);
-
            }
        }

@ -547,10 +590,10 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
    }

    /**
-     * Searcher responsible for searching the current index and writing results to blackboard
-     * and the inbox.  Also, posts results to listeners as Ingest data events.
-     * Searches entire index, and keeps track of only new results to report and save.
-     * Runs as a background thread.
+     * Searcher responsible for searching the current index and writing results
+     * to blackboard and the inbox. Also, posts results to listeners as Ingest
+     * data events. Searches entire index, and keeps track of only new results
+     * to report and save. Runs as a background thread.
     */
    private class Searcher extends SwingWorker<Object, Void> {

@ -574,7 +617,6 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi

            final String displayName = "Keyword Search" + (finalRun ? " - Finalizing" : "");
            progress = ProgressHandleFactory.createHandle(displayName + (" (Pending)"), new Cancellable() {
-
                @Override
                public boolean cancel() {
                    logger.log(Level.INFO, "Cancelling the searcher by user.");
@ -833,14 +875,14 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
        //without relying on done() method that is not guaranteed to run after background thread completes
        //NEED to call this method always right before doInBackground() returns
        /**
-         * Performs the cleanup that needs to be done right AFTER doInBackground() returns
-         * without relying on done() method that is not guaranteed to run after background thread completes
-         * REQUIRED to call this method always right before doInBackground() returns
+         * Performs the cleanup that needs to be done right AFTER
+         * doInBackground() returns without relying on done() method that is not
+         * guaranteed to run after background thread completes REQUIRED to call
+         * this method always right before doInBackground() returns
         */
        private void finalizeSearcher() {
            logger.log(Level.INFO, "Searcher finalizing");
            SwingUtilities.invokeLater(new Runnable() {
-
                @Override
                public void run() {
                    progress.finish();
@ -871,9 +913,9 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi

    /**
     * Checks if the content has already been hit previously
-     * 
+     *
     * @param previousHits the previous hits to check against
-     * @param hit a hit to check for,  that potentially had already been hit
+     * @param hit a hit to check for, that potentially had already been hit
     * @return true if the potential hit has already been hit, false otherwise
     */
    private static boolean previouslyHit(List<ContentHit> previousHits, ContentHit hit) {
@ -890,7 +932,9 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi

    /**
     * Set the skip known files setting on the service
-     * @param skip true if skip, otherwise, will process known files as well, as reported by HashDB service
+     *
+     * @param skip true if skip, otherwise, will process known files as well, as
+     * reported by HashDB service
     */
    void setSkipKnown(boolean skip) {
        this.skipKnown = skip;