keyword search: fix duplicate hits during ingest for Local Files - when a local file set is added, search only the currently ingested data sources

adam-m 2013-06-18 12:48:39 -04:00
parent 2167f41bbe
commit 3415c49b04
3 changed files with 42 additions and 39 deletions
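In short: LocalFile content has no parent Image, so the old Ingester indexed local files with an image id of -1 and the searcher's per-image filter could not scope hits to the current ingest, which produced duplicate results. The fix keys everything on the owning data source id instead. Below is a minimal sketch of the lookup the new code relies on, grounded in the calls visible in this diff; the wrapper class and method name are hypothetical, added only for illustration.

import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;

class DataSourceIdSketch {
    // Unlike AbstractFile.getImage(), which is null for LocalFile content,
    // getFileDataSource() resolves an id for disk images and logical file sets alike.
    static long resolveDataSourceId(AbstractFile file) {
        SleuthkitCase tskCase = Case.getCurrentCase().getSleuthkitCase();
        try {
            return tskCase.getFileDataSource(file);
        } catch (TskCoreException ex) {
            return -1; // same sentinel the old code fell back to when no image was found
        }
    }
}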

Ingester.java

@ -41,6 +41,7 @@ import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.SolrInputDocument;
import org.openide.util.Exceptions;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.autopsy.keywordsearch.Server.SolrServerNoPortException;
@ -52,10 +53,10 @@ import org.sleuthkit.datamodel.DerivedFile;
import org.sleuthkit.datamodel.Directory;
import org.sleuthkit.datamodel.File;
import org.sleuthkit.datamodel.FsContent;
import org.sleuthkit.datamodel.Image;
import org.sleuthkit.datamodel.LayoutFile;
import org.sleuthkit.datamodel.LocalFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;
/**
@ -184,6 +185,12 @@ public class Ingester {
private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
private SleuthkitCase curCase = null;
GetContentFieldsV() {
curCase = Case.getCurrentCase().getSleuthkitCase();
}
@Override
protected Map<String, String> defaultVisit(Content cntnt) {
return new HashMap<String, String>();
@ -217,11 +224,7 @@ public class Ingester {
@Override
public Map<String, String> visit(LocalFile lf) {
final Map<String, String> params = new HashMap<String, String>();
params.put(Server.Schema.ID.toString(), Long.toString(lf.getId()));
params.put(Server.Schema.FILE_NAME.toString(), lf.getName());
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
return params;
return getCommonFields(lf);
}
private Map<String, String> getCommonFsContentFields(Map<String, String> params, FsContent fsContent) {
@ -235,15 +238,13 @@ public class Ingester {
private Map<String, String> getCommonFields(AbstractFile af) {
Map<String, String> params = new HashMap<String, String>();
params.put(Server.Schema.ID.toString(), Long.toString(af.getId()));
long imageId = -1;
long dataSourceId = -1;
try {
Image image = af.getImage();
if (image != null) {
imageId = image.getId();
}
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(imageId));
dataSourceId = curCase.getFileDataSource(af);
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSourceId));
} catch (TskCoreException ex) {
logger.log(Level.SEVERE, "Could not get image id to properly index the file " + af.getId());
logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + af.getId());
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
}
params.put(Server.Schema.FILE_NAME.toString(), af.getName());
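With getCommonFields() now backing the LocalFile visit as well, every indexed document carries a real data source id rather than a hard-coded -1. A rough illustration of the resulting field map, written from inside Ingester where the visitor is defined, for a hypothetical local file with object id 123 whose logical file set has data source id 7:

// Sketch of what GetContentFieldsV now produces for a LocalFile
// (field keys are the Server.Schema values used above):
Map<String, String> fields = localFile.accept(new GetContentFieldsV());
// fields.get(Server.Schema.ID.toString())        -> "123"   (object id)
// fields.get(Server.Schema.IMAGE_ID.toString())  -> "7"     (data source id; "-1" before this commit)
// fields.get(Server.Schema.FILE_NAME.toString()) -> localFile.getName()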

KeywordQueryFilter.java

@ -18,6 +18,10 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
/**
*
* Filter to restrict query only specific files, chunks, images
@ -27,23 +31,23 @@ public class KeywordQueryFilter {
public static enum FilterType {
FILE, CHUNK, IMAGE
FILE, CHUNK, DATA_SOURCE
};
private long[] idFilters;
private Set<Long>idFilters;
private FilterType filterType;
public KeywordQueryFilter(FilterType filterType, long id) {
this.filterType = filterType;
this.idFilters = new long[1];
this.idFilters[0] = id;
this.idFilters = new HashSet<Long>();
this.idFilters.add(id);
}
public KeywordQueryFilter(FilterType filterType, long[] ids) {
public KeywordQueryFilter(FilterType filterType, Set<Long>ids) {
this.filterType = filterType;
this.idFilters = ids;
}
public long[] getIdFilters() {
public Set<Long> getIdFilters() {
return idFilters;
}
@ -55,12 +59,14 @@ public class KeywordQueryFilter {
public String toString() {
StringBuilder sb = new StringBuilder();
String id = null;
for (int i = 0; i < idFilters.length; ++i) {
Iterator<Long>it = idFilters.iterator();
for (int i = 0; it.hasNext(); ++i) {
if (i > 0) {
sb.append(" "); //OR
}
long idVal = idFilters[i];
if (filterType == FilterType.IMAGE) {
long idVal = it.next();
if (filterType == FilterType.DATA_SOURCE) {
id = Server.Schema.IMAGE_ID.toString();
} else {
id = Server.Schema.ID.toString();
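The filter's toString() is what ultimately restricts the Solr query. A usage sketch, assuming Server.Schema.IMAGE_ID renders as a field name along the lines of image_id (illustrative; the exact field string comes from the schema enum):

Set<Long> sources = new HashSet<Long>();
sources.add(3L);
sources.add(9L);
KeywordQueryFilter filter =
        new KeywordQueryFilter(KeywordQueryFilter.FilterType.DATA_SOURCE, sources);
// filter.toString() emits the ids space-separated on the data source field,
// e.g. "image_id:3 image_id:9", which the query treats as an OR of the
// currently ingested sources; hits from any other source are excluded.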

KeywordSearchIngestModule.java

@ -44,6 +44,7 @@ import org.netbeans.api.progress.aggregate.AggregateProgressFactory;
import org.netbeans.api.progress.aggregate.AggregateProgressHandle;
import org.netbeans.api.progress.aggregate.ProgressContributor;
import org.openide.util.Cancellable;
import org.openide.util.Exceptions;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
import org.sleuthkit.autopsy.coreutils.StopWatch;
@ -60,6 +61,7 @@ import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.Image;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.SleuthkitCase;
@ -115,7 +117,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
private Map<Keyword, List<Long>> currentResults;
//only search images from current ingest, not images previously ingested/indexed
//accessed read-only by searcher thread
private Set<Long> curImageIds;
private Set<Long> curDataSourceIds;
private static final ReentrantReadWriteLock rwLock = new ReentrantReadWriteLock(true); //use fairness policy
private static final Lock searcherLock = rwLock.writeLock();
private volatile int messageID = 0;
@ -129,6 +131,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
private KeywordSearchConfigurationPanel panel;
private Tika tikaFormatDetector;
private enum IngestStatus {
INGESTED, EXTRACTED_INGESTED, SKIPPED, INGESTED_META
@ -160,12 +163,10 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
return ProcessResult.OK;
}
try {
//add image id of the file to the set, keeping track of images being ingested
final Image fileImage = abstractFile.getImage();
if (fileImage != null) {
//not all Content objects have an image associated (e.g. LocalFiles)
curImageIds.add(fileImage.getId());
}
//add data source id of the file to the set, keeping track of images being ingested
final long fileSourceId = caseHandle.getFileDataSource(abstractFile);
curDataSourceIds.add(fileSourceId);
} catch (TskCoreException ex) {
logger.log(Level.SEVERE, "Error getting image id of file processed by keyword search: " + abstractFile.getName(), ex);
}
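curDataSourceIds is a Set on purpose: process() runs once per file, so the many files coming from the same source collapse to a single id per ingest run. A compact sketch of the bookkeeping added here, using a hypothetical helper name and the fields shown in this diff (caseHandle, curDataSourceIds, logger):

// Sketch: record the data source of every file handed to the module.
private void trackDataSource(AbstractFile abstractFile) {
    try {
        long sourceId = caseHandle.getFileDataSource(abstractFile);
        curDataSourceIds.add(sourceId); // Set: each source recorded once per run
    } catch (TskCoreException ex) {
        logger.log(Level.SEVERE,
                "Error getting data source id of file: " + abstractFile.getName(), ex);
    }
}
// cleanup() clears the set so the next ingest run starts from scratch.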
@ -288,7 +289,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
private void cleanup() {
ingestStatus.clear();
currentResults.clear();
curImageIds.clear();
curDataSourceIds.clear();
currentSearcher = null;
//finalSearcher = null; //do not collect, might be finalizing
@ -399,7 +400,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
//keeps track of all results per run not to repeat reporting the same hits
currentResults = new HashMap<Keyword, List<Long>>();
curImageIds = new HashSet<Long>();
curDataSourceIds = new HashSet<Long>();
indexer = new Indexer();
@ -930,15 +931,10 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
del = new TermComponentQuery(keywordQuery);
}
//limit search to currently ingested images
final long imageIds[] = new long[curImageIds.size()];
final Iterator<Long> it = curImageIds.iterator();
for (int imageI = 0; it.hasNext(); ++imageI) {
imageIds[imageI] = it.next();
}
//limit search to currently ingested data sources
//set up a filter with 1 or more image ids OR'ed
final KeywordQueryFilter imageFilter = new KeywordQueryFilter(KeywordQueryFilter.FilterType.IMAGE, imageIds);
del.addFilter(imageFilter);
final KeywordQueryFilter dataSourceFilter = new KeywordQueryFilter(KeywordQueryFilter.FilterType.DATA_SOURCE, curDataSourceIds);
del.addFilter(dataSourceFilter);
Map<String, List<ContentHit>> queryResult = null;