Keyword search ingest improvements: alternative performQueryPerTerm() and equivalent blackboard write methods, better suited for grouping results per hit (in ingest) rather than per query.

Improves overall keyword search ingest performance.
Improves regex result highlighting by using the content_ws field.
adam-m 2012-03-27 00:16:28 -04:00
parent 30a2e3f9f4
commit 69d55f7cc1
6 changed files with 229 additions and 124 deletions
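The core of the change is the new per-term result shape: instead of one flat List<FsContent> per query, results come back as a Map from each matching term to the files that hit it, so ingest can deduplicate and report blackboard artifacts per (term, file) pair. A minimal standalone sketch of that grouping logic, mirroring the Searcher code in KeywordSearchIngestService below (String stands in for FsContent; all names here are illustrative, not from the commit):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

class PerTermGroupingSketch {
    // Given this run's per-term results and the hits already reported,
    // keep only the new (term, file) pairs and remember them as seen.
    static Map<String, List<String>> newHits(Map<String, List<String>> perTerm,
                                             Map<String, List<String>> alreadySeen) {
        Map<String, List<String>> fresh = new HashMap<String, List<String>>();
        for (Map.Entry<String, List<String>> e : perTerm.entrySet()) {
            List<String> seen = alreadySeen.get(e.getKey());
            if (seen == null) {
                // first time this term produced hits: everything is new
                alreadySeen.put(e.getKey(), new ArrayList<String>(e.getValue()));
                fresh.put(e.getKey(), e.getValue());
                continue;
            }
            for (String file : e.getValue()) {
                if (!seen.contains(file)) {
                    List<String> bucket = fresh.get(e.getKey());
                    if (bucket == null) {
                        bucket = new ArrayList<String>();
                        fresh.put(e.getKey(), bucket);
                    }
                    bucket.add(file);
                    seen.add(file);
                }
            }
        }
        return fresh;
    }
}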

View File: schema.xml (Solr)

@@ -534,7 +534,7 @@
 <!-- field with white-space tokenized words for TermsComponent regex search (useful for fast search of IP addresses, URLs, certain phone numbers)
 also be useful for Lucene based queries containing special characters-->
-<field name="content_ws" type="text_ws" indexed="true" stored="false" multiValued="true"/>
+<field name="content_ws" type="text_ws" indexed="true" stored="true" multiValued="true"/>
 <!-- Uncommenting the following will create a "timestamp" field using
 a default value of "NOW" to indicate when each document was indexed.
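The stored="true" flip is what makes the highlight change possible: Solr's highlighter can only build snippets from stored fields, and the whitespace-tokenized content_ws keeps tokens like IPs and URLs intact where the analyzed content field splits them apart. A rough SolrJ sketch of the kind of snippet query this enables (the query shape and the id field name are illustrative assumptions, not taken from the commit):

import org.apache.solr.client.solrj.SolrQuery;

class RegexSnippetSketch {
    static SolrQuery snippetQuery(String escapedTerm, long contentId) {
        SolrQuery q = new SolrQuery();
        // match the literal term against the whitespace-tokenized field
        q.setQuery("content_ws:" + escapedTerm + " AND id:" + contentId);
        // highlighting requires the field to be stored -- hence the schema change
        q.addHighlightField("content_ws");
        q.setHighlightSimplePre("<em>");
        q.setHighlightSimplePost("</em>");
        return q;
    }
}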

View File: KeywordSearchIngestService.java

@@ -57,7 +57,7 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
 public static final String MODULE_NAME = "Keyword Search";
 private static KeywordSearchIngestService instance = null;
 private IngestManagerProxy managerProxy;
-private static final long MAX_STRING_EXTRACT_SIZE = 10 * (1 << 10); // * (1 << 10);
+private static final long MAX_STRING_EXTRACT_SIZE = 1 * (1 << 10); // * (1 << 10); TODO increase
 private static final long MAX_INDEX_SIZE = 100 * (1 << 10) * (1 << 10);
 private Ingester ingester;
 private volatile boolean commitIndex = false; //whether to commit index next time
@@ -78,7 +78,6 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
 private volatile boolean finalRunComplete = false;
 private final String hashDBServiceName = "Hash Lookup";
 private SleuthkitCase caseHandle = null;
 // TODO: use a more robust method than checking file extension to determine
 // whether to try a file
 // supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
@@ -89,8 +88,7 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
 public enum IngestStatus {
-INGESTED, EXTRACTED_INGESTED, SKIPPED,
-};
+INGESTED, EXTRACTED_INGESTED, SKIPPED,};
 private Map<Long, IngestStatus> ingestStatus;
 private Map<String, List<FsContent>> reportedHits; //already reported hits
@@ -109,14 +107,14 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
 //logger.log(Level.INFO, "hashdb result: " + hashDBResult + "file: " + fsContent.getName());
 if (hashDBResult == IngestServiceFsContent.ProcessResult.COND_STOP) {
 return ProcessResult.OK;
-}
-else if (hashDBResult == IngestServiceFsContent.ProcessResult.ERROR) {
+} else if (hashDBResult == IngestServiceFsContent.ProcessResult.ERROR) {
 //notify depending service that keyword search (would) encountered error for this file
 return ProcessResult.ERROR;
 }
-if (processedFiles == false)
+if (processedFiles == false) {
 processedFiles = true;
+}
 //check if time to commit and previous search is not running
 //commiting while searching causes performance issues
@@ -289,9 +287,9 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
 @Override
 public synchronized boolean backgroundJobsCompleteListener(PropertyChangeListener l) {
-if (finalRunComplete == true)
+if (finalRunComplete == true) {
 return false;
-else {
+} else {
 pcs.addPropertyChangeListener(l);
 return true;
 }
@@ -504,7 +502,7 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
 }
 }
-private class Searcher extends SwingWorker<Object,Void> {
+private class Searcher extends SwingWorker<Object, Void> {
 private List<Keyword> keywords;
 private ProgressHandle progress;
@@ -547,52 +545,69 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
 KeywordSearchQuery del = null;
-if (keywordQuery.isLiteral()) {
+boolean isRegex = !keywordQuery.isLiteral();
+if (!isRegex) {
 del = new LuceneQuery(keywordQuery);
 del.escape();
 } else {
 del = new TermComponentQuery(keywordQuery);
 }
-List<FsContent> queryResult = null;
+Map<String, List<FsContent>> queryResult = null;
 try {
-queryResult = del.performQuery();
+queryResult = del.performQueryPerTerm();
 } catch (Exception e) {
 logger.log(Level.INFO, "Error performing query: " + keywordQuery.getQuery(), e);
 continue;
 }
 //calculate new results but substracting results already obtained in this run
-List<FsContent> newResults = new ArrayList<FsContent>();
+Map<Keyword, List<FsContent>> newResults = new HashMap<Keyword, List<FsContent>>();
-List<FsContent> curResults = currentResults.get(keywordQuery);
-if (curResults == null) {
-currentResults.put(keywordQuery, queryResult);
-newResults = queryResult;
+for (String termResult : queryResult.keySet()) {
+List<FsContent> queryTermResults = queryResult.get(termResult);
+Keyword termResultK = new Keyword(termResult, !isRegex);
+List<FsContent> curTermResults = currentResults.get(termResultK);
+if (curTermResults == null) {
+currentResults.put(termResultK, queryTermResults);
+newResults.put(termResultK, queryTermResults);
 } else {
-for (FsContent res : queryResult) {
-if (!curResults.contains(res)) {
+//some fscontent hits already exist for this keyword
+for (FsContent res : queryTermResults) {
+if (!curTermResults.contains(res)) {
 //add to new results
-newResults.add(res);
+List<FsContent> newResultsFs = newResults.get(termResultK);
+if (newResultsFs == null) {
+newResultsFs = new ArrayList<FsContent>();
+newResults.put(termResultK, newResultsFs);
+}
+newResultsFs.add(res);
+curTermResults.add(res);
 }
 }
 }
-//update current result with new ones
-curResults.addAll(newResults);
 }
 if (!newResults.isEmpty()) {
 //write results to BB
 Collection<BlackboardArtifact> newArtifacts = new ArrayList<BlackboardArtifact>(); //new artifacts to report
-for (FsContent hitFile : newResults) {
+for (final Keyword hitTerm : newResults.keySet()) {
+List<FsContent> fsContentHits = newResults.get(hitTerm);
+for (final FsContent hitFile : fsContentHits) {
 if (this.isCancelled()) {
 return null;
 }
-Collection<KeywordWriteResult> written = del.writeToBlackBoard(hitFile, listName);
-for (KeywordWriteResult res : written) {
-newArtifacts.add(res.getArtifact());
+KeywordWriteResult written = del.writeToBlackBoard(hitTerm.getQuery(), hitFile, listName);
+if (written == null) {
+logger.log(Level.INFO, "BB artifact for keyword not written: " + hitTerm.toString());
+continue;
+}
+newArtifacts.add(written.getArtifact());
 //generate a data message for each artifact
 StringBuilder subjectSb = new StringBuilder();
@@ -606,7 +621,7 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
 }
 subjectSb.append("<");
 String uniqueKey = null;
-BlackboardAttribute attr = res.getAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD.getTypeID());
+BlackboardAttribute attr = written.getAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD.getTypeID());
 if (attr != null) {
 final String keyword = attr.getValueString();
 subjectSb.append(keyword);
@@ -625,7 +640,7 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
 detailsSb.append("</tr>");
 //preview
-attr = res.getAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_PREVIEW.getTypeID());
+attr = written.getAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_PREVIEW.getTypeID());
 if (attr != null) {
 detailsSb.append("<tr>");
 detailsSb.append("<th>Preview</th>");
@@ -642,7 +657,7 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
 //list
-attr = res.getAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SET.getTypeID());
+attr = written.getAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SET.getTypeID());
 detailsSb.append("<tr>");
 detailsSb.append("<th>List</th>");
 detailsSb.append("<td>").append(attr.getValueString()).append("</td>");
@@ -650,7 +665,7 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
 //regex
 if (!keywordQuery.isLiteral()) {
-attr = res.getAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP.getTypeID());
+attr = written.getAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP.getTypeID());
 if (attr != null) {
 detailsSb.append("<tr>");
 detailsSb.append("<th>RegEx</th>");
@@ -661,9 +676,10 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
 }
 detailsSb.append("</table>");
-managerProxy.postMessage(IngestMessage.createDataMessage(++messageID, instance, subjectSb.toString(), detailsSb.toString(), uniqueKey, res.getArtifact()));
-}
-} //for each file hit
+managerProxy.postMessage(IngestMessage.createDataMessage(++messageID, instance, subjectSb.toString(), detailsSb.toString(), uniqueKey, written.getArtifact()));
+} //for each term hit
+}//for each file hit
 //update artifact browser
 IngestManager.fireServiceDataEvent(new ServiceDataEvent(MODULE_NAME, ARTIFACT_TYPE.TSK_KEYWORD_HIT, newArtifacts));

View File: KeywordSearchQuery.java

@@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.keywordsearch;
 import java.util.Collection;
 import java.util.List;
+import java.util.Map;
 import org.apache.solr.client.solrj.response.TermsResponse.Term;
 import org.sleuthkit.datamodel.FsContent;
@@ -34,10 +35,20 @@ public interface KeywordSearchQuery {
 /**
  * execute query and return results without publishing them
+ * return results for all matching terms
  *
  * @return
  */
 public List<FsContent> performQuery();
+/**
+ * execute query and return results without publishing them
+ * return results per term
+ *
+ * @return
+ */
+public Map<String,List<FsContent>> performQueryPerTerm();
 /**
  * execute the query and publish results
@@ -81,5 +92,15 @@
  */
 public Collection<KeywordWriteResult> writeToBlackBoard(FsContent newFsHit, String listName);
+/**
+ * write results to blackboard per single term and file hit
+ * this method is useful if something else should keep track of partial results to write
+ * @param termHit term for only which to write results
+ * @param newFsHit fscontent for which to write results for this hit
+ * @param listName listname
+ * @return collection of results (with cached bb artifacts/attributes) created and written
+ */
+public KeywordWriteResult writeToBlackBoard(String termHit, FsContent newFsHit, String listName);
 }
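Taken together, a caller drives the extended interface roughly the way the Searcher in KeywordSearchIngestService does: query per term, then write one artifact per (term, file) pair. A minimal caller sketch, assuming the Autopsy/Sleuthkit datamodel types above resolve (the class and method names of the sketch itself are illustrative):

import java.util.List;
import java.util.Map;
import org.sleuthkit.datamodel.FsContent;

class PerTermWriteSketch {
    // One blackboard artifact per (term, file) pair; a null result means the
    // write was skipped (e.g. no snippet, or an error), per the new contract.
    static void writeAll(KeywordSearchQuery query, String listName) {
        Map<String, List<FsContent>> perTerm = query.performQueryPerTerm();
        for (Map.Entry<String, List<FsContent>> entry : perTerm.entrySet()) {
            for (FsContent file : entry.getValue()) {
                KeywordWriteResult written =
                        query.writeToBlackBoard(entry.getKey(), file, listName);
                if (written != null) {
                    // written.getArtifact() is what gets reported to the UI
                }
            }
        }
    }
}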

View File: KeywordSearchQueryManager.java

@@ -35,6 +35,7 @@ import org.sleuthkit.autopsy.datamodel.KeyValue;
 import org.sleuthkit.autopsy.keywordsearch.KeywordSearch.QueryType;
 import org.sleuthkit.datamodel.FsContent;
 /**
  * Query manager responsible for running appropriate queries and displaying results
  * for single, multi keyword queries, with detailed or collapsed results
@@ -143,10 +144,16 @@ public class KeywordSearchQueryManager implements KeywordSearchQuery {
 @Override
 public List<FsContent> performQuery() {
 //not done here
-return null;
+throw new UnsupportedOperationException("performQuery() unsupported");
 }
+@Override
+public Map<String, List<FsContent>> performQueryPerTerm() {
+throw new UnsupportedOperationException("performQueryPerTerm() unsupported");
+}
 @Override
 public boolean validate() {
 boolean allValid = true;
@@ -192,14 +199,16 @@ public class KeywordSearchQueryManager implements KeywordSearchQuery {
 @Override
 public Collection<KeywordWriteResult> writeToBlackBoard(FsContent newFsHit, String listName) {
-Collection<KeywordWriteResult> ret = new ArrayList<KeywordWriteResult>();
-for (KeywordSearchQuery q : queryDelegates) {
-ret.addAll(q.writeToBlackBoard(newFsHit, listName));
-}
-return ret;
+throw new UnsupportedOperationException("writeToBlackBoard() unsupported by manager");
 }
+@Override
+public KeywordWriteResult writeToBlackBoard(String termHit, FsContent newFsHit, String listName) {
+throw new UnsupportedOperationException("writeToBlackBoard() unsupported by manager");
+}
 }
 /**
  * custom KeyValue that also stores query object to execute
  */

View File: LuceneQuery.java

@@ -23,6 +23,7 @@ import java.sql.SQLException;
 import java.sql.Statement;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.logging.Level;
@@ -59,9 +60,8 @@ public class LuceneQuery implements KeywordSearchQuery {
 private Keyword keywordQuery = null;
 //use different highlight Solr fields for regex and literal search
 static final String HIGHLIGHT_FIELD_LITERAL = "content";
-//TODO change to content_ws and in Solr schema to stored="true" to improve regex highlight matching
-static final String HIGHLIGHT_FIELD_REGEX = "content";
-//static final String HIGHLIGHT_FIELD_REGEX = "content_ws";
+//static final String HIGHLIGHT_FIELD_REGEX = "content";
+static final String HIGHLIGHT_FIELD_REGEX = "content_ws";
 public LuceneQuery(Keyword keywordQuery) {
 this(keywordQuery.getQuery());
@@ -160,6 +160,18 @@ public class LuceneQuery implements KeywordSearchQuery {
 return matches;
 }
+@Override
+public Map<String, List<FsContent>> performQueryPerTerm() {
+Map<String, List<FsContent>> results = new HashMap<String, List<FsContent>>();
+//in case of single term literal query there is only 1 term, so delegate to performQuery()
+results.put(query, performQuery());
+return results;
+}
 @Override
 public void execute() {
 escape();
@@ -211,19 +223,28 @@ public class LuceneQuery implements KeywordSearchQuery {
 @Override
 public Collection<KeywordWriteResult> writeToBlackBoard(FsContent newFsHit, String listName) {
+List<KeywordWriteResult> ret = new ArrayList<KeywordWriteResult>();
+KeywordWriteResult written = writeToBlackBoard(query, newFsHit, listName);
+if (written != null)
+ret.add(written);
+return ret;
+}
+@Override
+public KeywordWriteResult writeToBlackBoard(String termHit, FsContent newFsHit, String listName) {
 final String MODULE_NAME = KeywordSearchIngestService.MODULE_NAME;
-Collection<KeywordWriteResult> writeResults = new ArrayList<KeywordWriteResult>();
 KeywordWriteResult writeResult = null;
 Collection<BlackboardAttribute> attributes = new ArrayList<BlackboardAttribute>();
 BlackboardArtifact bba = null;
 try {
 bba = newFsHit.newArtifact(ARTIFACT_TYPE.TSK_KEYWORD_HIT);
 writeResult = new KeywordWriteResult(bba);
-writeResults.add(writeResult);
 } catch (Exception e) {
 logger.log(Level.INFO, "Error adding bb artifact for keyword hit", e);
-return writeResults;
+return null;
 }
 String snippet = null;
@@ -236,7 +257,7 @@ public class LuceneQuery implements KeywordSearchQuery {
 attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_PREVIEW.getTypeID(), MODULE_NAME, "", KeywordSearchUtil.escapeForBlackBoard(snippet)));
 }
 //keyword
-attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD.getTypeID(), MODULE_NAME, "", query));
+attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD.getTypeID(), MODULE_NAME, "", termHit));
 //list
 if (listName == null) {
 listName = "";
@@ -249,17 +270,18 @@ public class LuceneQuery implements KeywordSearchQuery {
 if (keywordQuery != null) {
 BlackboardAttribute.ATTRIBUTE_TYPE selType = keywordQuery.getType();
 if (selType != null) {
-attributes.add(new BlackboardAttribute(selType.getTypeID(), MODULE_NAME, "", query));
+attributes.add(new BlackboardAttribute(selType.getTypeID(), MODULE_NAME, "", termHit));
 }
 }
 try {
 bba.addAttributes(attributes); //write out to bb
 writeResult.add(attributes);
+return writeResult;
 } catch (TskException e) {
 logger.log(Level.INFO, "Error adding bb attributes to artifact", e);
 }
-return writeResults;
+return null;
 }
 /**

View File: TermComponentQuery.java

@@ -18,8 +18,6 @@
 */
 package org.sleuthkit.autopsy.keywordsearch;
-import java.io.UnsupportedEncodingException;
-import java.net.URLEncoder;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
@@ -30,7 +28,6 @@ import java.util.Map;
 import java.util.concurrent.ExecutionException;
 import java.util.logging.Level;
 import java.util.logging.Logger;
-import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
 import javax.swing.SwingWorker;
@@ -156,30 +153,20 @@ public class TermComponentQuery implements KeywordSearchQuery {
 }
 @Override
-public Collection<KeywordWriteResult> writeToBlackBoard(FsContent newFsHit, String listName) {
+public KeywordWriteResult writeToBlackBoard(String termHit, FsContent newFsHit, String listName) {
 final String MODULE_NAME = KeywordSearchIngestService.MODULE_NAME;
-Collection<KeywordWriteResult> writeResults = new ArrayList<KeywordWriteResult>();
-//get unique term matches, all cases
-Map<String, Void> matches = new HashMap<String, Void>();
-for (Term term : terms) {
-//caseInsMatches.put(term.getTerm().toLowerCase(), null);
-matches.put(term.getTerm(), null);
-}
-for (String regexMatch : matches.keySet()) {
 //snippet
 String snippet = null;
 try {
-snippet = LuceneQuery.querySnippet(KeywordSearchUtil.escapeLuceneQuery(regexMatch, true, false), newFsHit.getId(), true);
+snippet = LuceneQuery.querySnippet(KeywordSearchUtil.escapeLuceneQuery(termHit, true, false), newFsHit.getId(), true);
 } catch (Exception e) {
-logger.log(Level.INFO, "Error querying snippet: " + regexMatch, e);
-continue;
+logger.log(Level.INFO, "Error querying snippet: " + termHit, e);
+return null;
 }
 if (snippet == null || snippet.equals("")) {
-continue;
+return null;
 }
 //there is match actually in this file, create artifact only then
@@ -189,15 +176,14 @@ public class TermComponentQuery implements KeywordSearchQuery {
 try {
 bba = newFsHit.newArtifact(ARTIFACT_TYPE.TSK_KEYWORD_HIT);
 writeResult = new KeywordWriteResult(bba);
-writeResults.add(writeResult);
 } catch (Exception e) {
 logger.log(Level.INFO, "Error adding bb artifact for keyword hit", e);
-continue;
+return null;
 }
 //regex match
-attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD.getTypeID(), MODULE_NAME, "", regexMatch));
+attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD.getTypeID(), MODULE_NAME, "", termHit));
 //list
 if (listName == null) {
 listName = "";
@@ -223,16 +209,67 @@
 try {
 bba.addAttributes(attributes);
 writeResult.add(attributes);
+return writeResult;
 } catch (TskException e) {
 logger.log(Level.INFO, "Error adding bb attributes for terms search artifact", e);
 }
+return null;
 }
+@Override
+public Collection<KeywordWriteResult> writeToBlackBoard(FsContent newFsHit, String listName) {
+Collection<KeywordWriteResult> writeResults = new ArrayList<KeywordWriteResult>();
+//get unique term matches, all cases
+Map<String, Void> matches = new HashMap<String, Void>();
+for (Term term : terms) {
+//caseInsMatches.put(term.getTerm().toLowerCase(), null);
+matches.put(term.getTerm(), null);
+}
+for (String regexMatch : matches.keySet()) {
+KeywordWriteResult written = writeToBlackBoard(regexMatch, newFsHit, listName);
+if (written != null)
+writeResults.add(written);
+} //for each term
+return writeResults;
+}
+@Override
+public Map<String, List<FsContent>> performQueryPerTerm() {
+Map<String, List<FsContent>> results = new HashMap<String, List<FsContent>>();
+final SolrQuery q = createQuery();
+terms = executeQuery(q);
+for (Term term : terms) {
+final String termS = KeywordSearchUtil.escapeLuceneQuery(term.getTerm(), true, false);
+if (termS.contains("*")) {
+continue;
+}
+StringBuilder filesQueryB = new StringBuilder();
+filesQueryB.append(TERMS_SEARCH_FIELD).append(":").append(termS);
+final String queryStr = filesQueryB.toString();
+LuceneQuery filesQuery = new LuceneQuery(queryStr);
+try {
+List<FsContent> subResults = filesQuery.performQuery();
+results.put(term.getTerm(), subResults);
+} catch (RuntimeException e) {
+logger.log(Level.SEVERE, "Error executing Solr query,", e);
+}
+}
+return results;
+}
 /**
  * return collapsed matches with all files for the query
  * without per match breakdown