keyword search: fix duplicate hits during ingest for Local Files - when a local file set is added, search only the currently ingested data sources

adam-m 2013-06-18 12:48:39 -04:00
parent 2167f41bbe
commit 3415c49b04
3 changed files with 42 additions and 39 deletions
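In short: LocalFile content has no parent Image, so the old Ingester indexed local files with an image id of -1 and the searcher's per-image filter could not scope hits to the current ingest, which produced duplicate results. The fix keys everything on the owning data source id instead. Below is a minimal sketch of the lookup the new code relies on, grounded in the calls visible in this diff; the wrapper class and method name are hypothetical, added only for illustration.

import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;

class DataSourceIdSketch {
    // Unlike AbstractFile.getImage(), which is null for LocalFile content,
    // getFileDataSource() resolves an id for disk images and logical file sets alike.
    static long resolveDataSourceId(AbstractFile file) {
        SleuthkitCase tskCase = Case.getCurrentCase().getSleuthkitCase();
        try {
            return tskCase.getFileDataSource(file);
        } catch (TskCoreException ex) {
            return -1; // same sentinel the old code fell back to when no image was found
        }
    }
}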

Ingester.java

@ -41,6 +41,7 @@ import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.SolrInputDocument;
import org.openide.util.Exceptions;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.autopsy.keywordsearch.Server.SolrServerNoPortException;
@ -52,10 +53,10 @@ import org.sleuthkit.datamodel.DerivedFile;
import org.sleuthkit.datamodel.Directory;
import org.sleuthkit.datamodel.File;
import org.sleuthkit.datamodel.FsContent;
import org.sleuthkit.datamodel.Image;
import org.sleuthkit.datamodel.LayoutFile;
import org.sleuthkit.datamodel.LocalFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;
/**
@ -184,6 +185,12 @@ public class Ingester {
private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
private SleuthkitCase curCase = null;
GetContentFieldsV() {
curCase = Case.getCurrentCase().getSleuthkitCase();
}
@Override
protected Map<String, String> defaultVisit(Content cntnt) {
return new HashMap<String, String>();
@ -217,11 +224,7 @@ public class Ingester {
@Override
public Map<String, String> visit(LocalFile lf) {
final Map<String, String> params = new HashMap<String, String>();
params.put(Server.Schema.ID.toString(), Long.toString(lf.getId()));
params.put(Server.Schema.FILE_NAME.toString(), lf.getName());
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
return params;
return getCommonFields(lf);
}
private Map<String, String> getCommonFsContentFields(Map<String, String> params, FsContent fsContent) {
@ -235,15 +238,13 @@ public class Ingester {
private Map<String, String> getCommonFields(AbstractFile af) {
Map<String, String> params = new HashMap<String, String>();
params.put(Server.Schema.ID.toString(), Long.toString(af.getId()));
long imageId = -1;
long dataSourceId = -1;
try {
Image image = af.getImage();
if (image != null) {
imageId = image.getId();
}
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(imageId));
dataSourceId = curCase.getFileDataSource(af);
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSourceId));
} catch (TskCoreException ex) {
logger.log(Level.SEVERE, "Could not get image id to properly index the file " + af.getId());
logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + af.getId());
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
}
params.put(Server.Schema.FILE_NAME.toString(), af.getName());
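With getCommonFields() now backing the LocalFile visit as well, every indexed document carries a real data source id rather than a hard-coded -1. A rough illustration of the resulting field map, written from inside Ingester where the visitor is defined, for a hypothetical local file with object id 123 whose logical file set has data source id 7:

// Sketch of what GetContentFieldsV now produces for a LocalFile
// (field keys are the Server.Schema values used above):
Map<String, String> fields = localFile.accept(new GetContentFieldsV());
// fields.get(Server.Schema.ID.toString())        -> "123"   (object id)
// fields.get(Server.Schema.IMAGE_ID.toString())  -> "7"     (data source id; "-1" before this commit)
// fields.get(Server.Schema.FILE_NAME.toString()) -> localFile.getName()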

KeywordQueryFilter.java

@ -18,6 +18,10 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
/**
*
* Filter to restrict query only specific files, chunks, images
@ -27,23 +31,23 @@ public class KeywordQueryFilter {
public static enum FilterType {
FILE, CHUNK, IMAGE
FILE, CHUNK, DATA_SOURCE
};
private long[] idFilters;
private Set<Long>idFilters;
private FilterType filterType;
public KeywordQueryFilter(FilterType filterType, long id) {
this.filterType = filterType;
this.idFilters = new long[1];
this.idFilters[0] = id;
this.idFilters = new HashSet<Long>();
this.idFilters.add(id);
}
public KeywordQueryFilter(FilterType filterType, long[] ids) {
public KeywordQueryFilter(FilterType filterType, Set<Long>ids) {
this.filterType = filterType;
this.idFilters = ids;
}
public long[] getIdFilters() {
public Set<Long> getIdFilters() {
return idFilters;
}
@ -55,12 +59,14 @@ public class KeywordQueryFilter {
public String toString() {
StringBuilder sb = new StringBuilder();
String id = null;
for (int i = 0; i < idFilters.length; ++i) {
Iterator<Long>it = idFilters.iterator();
for (int i = 0; it.hasNext(); ++i) {
if (i > 0) {
sb.append(" "); //OR
}
long idVal = idFilters[i];
if (filterType == FilterType.IMAGE) {
long idVal = it.next();
if (filterType == FilterType.DATA_SOURCE) {
id = Server.Schema.IMAGE_ID.toString();
} else {
id = Server.Schema.ID.toString();
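The filter's toString() is what ultimately restricts the Solr query. A usage sketch, assuming Server.Schema.IMAGE_ID renders as a field name along the lines of image_id (illustrative; the exact field string comes from the schema enum):

Set<Long> sources = new HashSet<Long>();
sources.add(3L);
sources.add(9L);
KeywordQueryFilter filter =
        new KeywordQueryFilter(KeywordQueryFilter.FilterType.DATA_SOURCE, sources);
// filter.toString() emits the ids space-separated on the data source field,
// e.g. "image_id:3 image_id:9", which the query treats as an OR of the
// currently ingested sources; hits from any other source are excluded.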

KeywordSearchIngestModule.java

@ -44,6 +44,7 @@ import org.netbeans.api.progress.aggregate.AggregateProgressFactory;
import org.netbeans.api.progress.aggregate.AggregateProgressHandle;
import org.netbeans.api.progress.aggregate.ProgressContributor;
import org.openide.util.Cancellable;
import org.openide.util.Exceptions;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
import org.sleuthkit.autopsy.coreutils.StopWatch;
@ -60,6 +61,7 @@ import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.Image;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.SleuthkitCase;
@ -115,7 +117,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
private Map<Keyword, List<Long>> currentResults;
//only search images from current ingest, not images previously ingested/indexed
//accessed read-only by searcher thread
private Set<Long> curImageIds;
private Set<Long> curDataSourceIds;
private static final ReentrantReadWriteLock rwLock = new ReentrantReadWriteLock(true); //use fairness policy
private static final Lock searcherLock = rwLock.writeLock();
private volatile int messageID = 0;
@ -129,6 +131,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
private KeywordSearchConfigurationPanel panel;
private Tika tikaFormatDetector;
private enum IngestStatus {
INGESTED, EXTRACTED_INGESTED, SKIPPED, INGESTED_META
@ -160,12 +163,10 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
return ProcessResult.OK;
}
try {
//add image id of the file to the set, keeping track of images being ingested
final Image fileImage = abstractFile.getImage();
if (fileImage != null) {
//not all Content objects have an image associated (e.g. LocalFiles)
curImageIds.add(fileImage.getId());
}
//add data source id of the file to the set, keeping track of images being ingested
final long fileSourceId = caseHandle.getFileDataSource(abstractFile);
curDataSourceIds.add(fileSourceId);
} catch (TskCoreException ex) {
logger.log(Level.SEVERE, "Error getting image id of file processed by keyword search: " + abstractFile.getName(), ex);
}
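curDataSourceIds is a Set on purpose: process() runs once per file, so the many files coming from the same source collapse to a single id per ingest run. A compact sketch of the bookkeeping added here, using a hypothetical helper name and the fields shown in this diff (caseHandle, curDataSourceIds, logger):

// Sketch: record the data source of every file handed to the module.
private void trackDataSource(AbstractFile abstractFile) {
    try {
        long sourceId = caseHandle.getFileDataSource(abstractFile);
        curDataSourceIds.add(sourceId); // Set: each source recorded once per run
    } catch (TskCoreException ex) {
        logger.log(Level.SEVERE,
                "Error getting data source id of file: " + abstractFile.getName(), ex);
    }
}
// cleanup() clears the set so the next ingest run starts from scratch.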
@ -288,7 +289,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
private void cleanup() {
ingestStatus.clear();
currentResults.clear();
curImageIds.clear();
curDataSourceIds.clear();
currentSearcher = null;
//finalSearcher = null; //do not collect, might be finalizing
@ -399,7 +400,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
//keeps track of all results per run not to repeat reporting the same hits
currentResults = new HashMap<Keyword, List<Long>>();
curImageIds = new HashSet<Long>();
curDataSourceIds = new HashSet<Long>();
indexer = new Indexer();
@ -930,15 +931,10 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
del = new TermComponentQuery(keywordQuery);
}
//limit search to currently ingested images
final long imageIds[] = new long[curImageIds.size()];
final Iterator<Long> it = curImageIds.iterator();
for (int imageI = 0; it.hasNext(); ++imageI) {
imageIds[imageI] = it.next();
}
//limit search to currently ingested data sources
//set up a filter with 1 or more image ids OR'ed
final KeywordQueryFilter imageFilter = new KeywordQueryFilter(KeywordQueryFilter.FilterType.IMAGE, imageIds);
del.addFilter(imageFilter);
final KeywordQueryFilter dataSourceFilter = new KeywordQueryFilter(KeywordQueryFilter.FilterType.DATA_SOURCE, curDataSourceIds);
del.addFilter(dataSourceFilter);
Map<String, List<ContentHit>> queryResult = null;