keyword search: fix duplicate hits during ingest for local files - when a local file set is added, search only the currently ingested sources

adam-m 2013-06-18 12:48:39 -04:00
parent 2167f41bbe
commit 3415c49b04
3 changed files with 42 additions and 39 deletions

Ingester.java

@@ -41,6 +41,7 @@ import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.util.ContentStream;
 import org.apache.solr.common.SolrInputDocument;
 import org.openide.util.Exceptions;
+import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.datamodel.ContentUtils;
 import org.sleuthkit.autopsy.keywordsearch.Server.SolrServerNoPortException;
@@ -52,10 +53,10 @@ import org.sleuthkit.datamodel.DerivedFile;
 import org.sleuthkit.datamodel.Directory;
 import org.sleuthkit.datamodel.File;
 import org.sleuthkit.datamodel.FsContent;
-import org.sleuthkit.datamodel.Image;
 import org.sleuthkit.datamodel.LayoutFile;
 import org.sleuthkit.datamodel.LocalFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
+import org.sleuthkit.datamodel.SleuthkitCase;
 import org.sleuthkit.datamodel.TskCoreException;
 
 /**
@@ -184,6 +185,12 @@ public class Ingester {
     private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
 
+        private SleuthkitCase curCase = null;
+
+        GetContentFieldsV() {
+            curCase = Case.getCurrentCase().getSleuthkitCase();
+        }
+
         @Override
         protected Map<String, String> defaultVisit(Content cntnt) {
             return new HashMap<String, String>();
@@ -217,11 +224,7 @@ public class Ingester {
 
         @Override
         public Map<String, String> visit(LocalFile lf) {
-            final Map<String, String> params = new HashMap<String, String>();
-            params.put(Server.Schema.ID.toString(), Long.toString(lf.getId()));
-            params.put(Server.Schema.FILE_NAME.toString(), lf.getName());
-            params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
-            return params;
+            return getCommonFields(lf);
         }
 
         private Map<String, String> getCommonFsContentFields(Map<String, String> params, FsContent fsContent) {
@@ -235,15 +238,13 @@ public class Ingester {
         private Map<String, String> getCommonFields(AbstractFile af) {
             Map<String, String> params = new HashMap<String, String>();
             params.put(Server.Schema.ID.toString(), Long.toString(af.getId()));
-            long imageId = -1;
+            long dataSourceId = -1;
             try {
-                Image image = af.getImage();
-                if (image != null) {
-                    imageId = image.getId();
-                }
-                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(imageId));
+                dataSourceId = curCase.getFileDataSource(af);
+                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSourceId));
             } catch (TskCoreException ex) {
-                logger.log(Level.SEVERE, "Could not get image id to properly index the file " + af.getId());
+                logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + af.getId());
+                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
             }
             params.put(Server.Schema.FILE_NAME.toString(), af.getName());
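
Net effect of the Ingester change: every indexed file, local files included, now carries its data source object id in the IMAGE_ID field, with -1 kept only as a fallback when the lookup fails. A runnable sketch of that fallback logic, using plain string keys ("id", "image_id", "file_name") as stand-ins for the Server.Schema constants above; the helper below is illustrative, not Autopsy's API:

import java.util.HashMap;
import java.util.Map;

public class CommonFieldsSketch {

    // mirrors getCommonFields(): index the data source id when the
    // lookup succeeds, otherwise fall back to -1 so the file is
    // still indexed, just not attributable to a source
    static Map<String, String> commonFields(long objId, String name, Long dataSourceId) {
        Map<String, String> params = new HashMap<String, String>();
        params.put("id", Long.toString(objId));
        params.put("image_id", Long.toString(dataSourceId != null ? dataSourceId : -1L));
        params.put("file_name", name);
        return params;
    }

    public static void main(String[] args) {
        System.out.println(commonFields(42L, "report.doc", 7L));  // file in data source 7
        System.out.println(commonFields(43L, "notes.txt", null)); // lookup failed -> -1
    }
}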

KeywordQueryFilter.java

@@ -18,6 +18,10 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
 /**
  *
  * Filter to restrict query only specific files, chunks, images
@@ -27,23 +31,23 @@ public class KeywordQueryFilter {
 
     public static enum FilterType {
 
-        FILE, CHUNK, IMAGE
+        FILE, CHUNK, DATA_SOURCE
     };
-    private long[] idFilters;
+    private Set<Long> idFilters;
     private FilterType filterType;
 
     public KeywordQueryFilter(FilterType filterType, long id) {
         this.filterType = filterType;
-        this.idFilters = new long[1];
-        this.idFilters[0] = id;
+        this.idFilters = new HashSet<Long>();
+        this.idFilters.add(id);
     }
 
-    public KeywordQueryFilter(FilterType filterType, long[] ids) {
+    public KeywordQueryFilter(FilterType filterType, Set<Long> ids) {
         this.filterType = filterType;
         this.idFilters = ids;
     }
 
-    public long[] getIdFilters() {
+    public Set<Long> getIdFilters() {
         return idFilters;
     }
@@ -55,12 +59,14 @@ public class KeywordQueryFilter {
     public String toString() {
         StringBuilder sb = new StringBuilder();
         String id = null;
-        for (int i = 0; i < idFilters.length; ++i) {
+        Iterator<Long> it = idFilters.iterator();
+        for (int i = 0; it.hasNext(); ++i) {
             if (i > 0) {
                 sb.append(" "); //OR
             }
-            long idVal = idFilters[i];
-            if (filterType == FilterType.IMAGE) {
+            long idVal = it.next();
+            if (filterType == FilterType.DATA_SOURCE) {
                 id = Server.Schema.IMAGE_ID.toString();
             } else {
                 id = Server.Schema.ID.toString();
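
With the Set-based filter, toString() emits one field:value clause per id, space-separated; the //OR comment indicates the spaces act as OR in the Solr query. A runnable sketch of the same loop, with "image_id" standing in for Server.Schema.IMAGE_ID and the final append step inferred from the id/idVal assignments above:

import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Set;

public class FilterStringSketch {
    public static void main(String[] args) {
        // LinkedHashSet keeps insertion order so the demo output is stable
        Set<Long> ids = new LinkedHashSet<Long>();
        ids.add(1L);
        ids.add(5L);

        StringBuilder sb = new StringBuilder();
        Iterator<Long> it = ids.iterator();
        for (int i = 0; it.hasNext(); ++i) {
            if (i > 0) {
                sb.append(" "); // space-separated clauses are OR'ed
            }
            sb.append("image_id").append(":").append(it.next());
        }
        System.out.println(sb); // prints: image_id:1 image_id:5
    }
}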

KeywordSearchIngestModule.java

@@ -44,6 +44,7 @@ import org.netbeans.api.progress.aggregate.AggregateProgressFactory;
 import org.netbeans.api.progress.aggregate.AggregateProgressHandle;
 import org.netbeans.api.progress.aggregate.ProgressContributor;
 import org.openide.util.Cancellable;
+import org.openide.util.Exceptions;
 import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.coreutils.EscapeUtil;
 import org.sleuthkit.autopsy.coreutils.StopWatch;
@@ -60,6 +61,7 @@ import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
 import org.sleuthkit.datamodel.BlackboardAttribute;
 import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.Image;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 import org.sleuthkit.datamodel.SleuthkitCase;
@@ -115,7 +117,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
     private Map<Keyword, List<Long>> currentResults;
     //only search images from current ingest, not images previously ingested/indexed
     //accessed read-only by searcher thread
-    private Set<Long> curImageIds;
+    private Set<Long> curDataSourceIds;
     private static final ReentrantReadWriteLock rwLock = new ReentrantReadWriteLock(true); //use fairness policy
     private static final Lock searcherLock = rwLock.writeLock();
     private volatile int messageID = 0;
@@ -128,6 +130,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
     private boolean initialized = false;
     private KeywordSearchConfigurationPanel panel;
     private Tika tikaFormatDetector;
+
     private enum IngestStatus {
@@ -160,12 +163,10 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
             return ProcessResult.OK;
         }
         try {
-            //add image id of the file to the set, keeping track of images being ingested
-            final Image fileImage = abstractFile.getImage();
-            if (fileImage != null) {
-                //not all Content objects have an image associated (e.g. LocalFiles)
-                curImageIds.add(fileImage.getId());
-            }
+            //add data source id of the file to the set, keeping track of images being ingested
+            final long fileSourceId = caseHandle.getFileDataSource(abstractFile);
+            curDataSourceIds.add(fileSourceId);
         } catch (TskCoreException ex) {
             logger.log(Level.SEVERE, "Error getting image id of file processed by keyword search: " + abstractFile.getName(), ex);
         }
@@ -288,7 +289,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
     private void cleanup() {
         ingestStatus.clear();
         currentResults.clear();
-        curImageIds.clear();
+        curDataSourceIds.clear();
         currentSearcher = null;
 
         //finalSearcher = null; //do not collect, might be finalizing
@@ -399,7 +400,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
 
         //keeps track of all results per run not to repeat reporting the same hits
         currentResults = new HashMap<Keyword, List<Long>>();
-        curImageIds = new HashSet<Long>();
+        curDataSourceIds = new HashSet<Long>();
 
         indexer = new Indexer();
@@ -930,15 +931,10 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
                     del = new TermComponentQuery(keywordQuery);
                 }
 
-                //limit search to currently ingested images
-                final long imageIds[] = new long[curImageIds.size()];
-                final Iterator<Long> it = curImageIds.iterator();
-                for (int imageI = 0; it.hasNext(); ++imageI) {
-                    imageIds[imageI] = it.next();
-                }
+                //limit search to currently ingested data sources
 
                 //set up a filter with 1 or more image ids OR'ed
-                final KeywordQueryFilter imageFilter = new KeywordQueryFilter(KeywordQueryFilter.FilterType.IMAGE, imageIds);
-                del.addFilter(imageFilter);
+                final KeywordQueryFilter dataSourceFilter = new KeywordQueryFilter(KeywordQueryFilter.FilterType.DATA_SOURCE, curDataSourceIds);
+                del.addFilter(dataSourceFilter);
 
                 Map<String, List<ContentHit>> queryResult = null;
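
The module side completes the fix: process() records each file's data source id in curDataSourceIds, and since KeywordQueryFilter now accepts a Set<Long> directly, the old array-copy loop before the search is gone. A runnable sketch of why a Set is the right container for this tracking (the ids below are hypothetical):

import java.util.HashSet;
import java.util.Set;

public class TrackSourcesSketch {
    public static void main(String[] args) {
        Set<Long> curDataSourceIds = new HashSet<Long>();
        // every processed file adds its source id; the Set collapses
        // repeats, leaving one entry per data source in this ingest
        long[] perFileSourceIds = {7L, 7L, 7L, 9L, 9L};
        for (long id : perFileSourceIds) {
            curDataSourceIds.add(id);
        }
        System.out.println(curDataSourceIds); // e.g. [7, 9]
    }
}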