Merge pull request #2513 from esaunders/2290-store-doc-ids

Add document ids and search type to keyword hit artifacts
Richard Cordovano 2017-02-08 16:59:24 -05:00 committed by GitHub
commit 764c01e820
6 changed files with 85 additions and 50 deletions
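For orientation, here is a minimal sketch (not part of this change set) of how a consumer could read back the two attributes this pull request starts writing onto TSK_KEYWORD_HIT artifacts. The datamodel calls are the standard Sleuth Kit accessors; the helper name and surrounding context are illustrative.

import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException;

// Illustrative helper (not in this PR): inspect the attributes that the keyword
// search module now adds to a TSK_KEYWORD_HIT artifact.
static void describeKeywordHit(BlackboardArtifact hitArtifact) throws TskCoreException {
    BlackboardAttribute searchType = hitArtifact.getAttribute(
            new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE));
    if (searchType != null) {
        // Stored as the ordinal of KeywordSearch.QueryType (LITERAL, SUBSTRING, or REGEX).
        System.out.println("Search type ordinal: " + searchType.getValueInt());
    }
    BlackboardAttribute docIds = hitArtifact.getAttribute(
            new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_HIT_DOCUMENT_IDS));
    if (docIds != null) {
        // Comma-separated Solr document ids, written by RegexQuery below.
        System.out.println("Matching Solr documents: " + docIds.getValueString());
    }
}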

File: KeywordSearch.java

@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2016 Basis Technology Corp.
* Copyright 2011-2017 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -44,7 +44,7 @@ public class KeywordSearch {
public enum QueryType {
LITERAL, REGEX
LITERAL, SUBSTRING, REGEX
};
public static final String NUM_FILES_CHANGE_EVT = "NUM_FILES_CHANGE_EVT"; //NON-NLS
private static PropertyChangeSupport changeSupport = new PropertyChangeSupport(KeywordSearch.class);

File: KeywordSearchQuery.java

@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2016 Basis Technology Corp.
* Copyright 2011-2017 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -97,6 +97,6 @@ interface KeywordSearchQuery {
*/
String getEscapedQueryString();
KeywordCachedArtifact writeSingleFileHitsToBlackBoard(String termHit, KeywordHit hit, String snippet, String listName);
KeywordCachedArtifact writeSingleFileHitsToBlackBoard(Keyword keyword, KeywordHit hit, String snippet, String listName);
}
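With this signature change, implementations receive the full Keyword object instead of a raw search-term string. A rough sketch of the shape an implementation now takes (the accessor calls are the ones the implementations in this PR actually use; the body is a placeholder, not module code):

@Override
public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(Keyword foundKeyword, KeywordHit hit, String snippet, String listName) {
    String term = foundKeyword.getSearchTerm();               // exact term, used for TSK_KEYWORD
    boolean literal = foundKeyword.searchTermIsLiteral();      // literal vs. regex search term
    boolean wholeWord = foundKeyword.searchTermIsWholeWord();  // drives LITERAL vs. SUBSTRING search type
    // ... build BlackboardAttributes and create the TSK_KEYWORD_HIT artifact ...
    return null; // placeholder in this sketch
}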

File: LuceneQuery.java

@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2016 Basis Technology Corp.
* Copyright 2011-2017 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -50,7 +50,7 @@ class LuceneQuery implements KeywordSearchQuery {
private final String keywordString; //original unescaped query
private String keywordStringEscaped;
private boolean isEscaped;
private Keyword keyword = null;
private Keyword originalKeyword = null;
private KeywordList keywordList = null;
private final List<KeywordQueryFilter> filters = new ArrayList<>();
private String field = null;
@ -67,7 +67,7 @@ class LuceneQuery implements KeywordSearchQuery {
*/
public LuceneQuery(KeywordList keywordList, Keyword keyword) {
this.keywordList = keywordList;
this.keyword = keyword;
this.originalKeyword = keyword;
// @@@ BC: Long-term, we should try to get rid of this string and use only the
// keyword object. Refactoring did not make its way through this yet.
@ -134,7 +134,7 @@ class LuceneQuery implements KeywordSearchQuery {
}
@Override
public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(String termHit, KeywordHit hit, String snippet, String listName) {
public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(Keyword foundKeyword, KeywordHit hit, String snippet, String listName) {
final String MODULE_NAME = KeywordSearchModuleFactory.getModuleName();
Collection<BlackboardAttribute> attributes = new ArrayList<>();
@ -151,7 +151,7 @@ class LuceneQuery implements KeywordSearchQuery {
if (snippet != null) {
attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_PREVIEW, MODULE_NAME, snippet));
}
attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, termHit));
attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, foundKeyword.getSearchTerm()));
if ((listName != null) && (listName.equals("") == false)) {
attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_SET_NAME, MODULE_NAME, listName));
}
@ -159,10 +159,16 @@ class LuceneQuery implements KeywordSearchQuery {
//bogus - workaround the dir tree table issue
//attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP.getTypeID(), MODULE_NAME, "", ""));
//selector
if (keyword != null) {
BlackboardAttribute.ATTRIBUTE_TYPE selType = keyword.getArtifactAttributeType();
if (originalKeyword != null) {
BlackboardAttribute.ATTRIBUTE_TYPE selType = originalKeyword.getArtifactAttributeType();
if (selType != null) {
attributes.add(new BlackboardAttribute(selType, MODULE_NAME, termHit));
attributes.add(new BlackboardAttribute(selType, MODULE_NAME, foundKeyword.getSearchTerm()));
}
if (originalKeyword.searchTermIsWholeWord()) {
attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.LITERAL.ordinal()));
} else {
attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.SUBSTRING.ordinal()));
}
}
@ -259,7 +265,7 @@ class LuceneQuery implements KeywordSearchQuery {
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
// Wrap the query string in quotes if this is a literal search term.
String theQueryStr = keyword.searchTermIsLiteral()
String theQueryStr = originalKeyword.searchTermIsLiteral()
? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped;
// Run the query against an optional alternative field.

File: QueryResults.java

@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2014-2015 Basis Technology Corp.
* Copyright 2014-2017 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -149,7 +149,7 @@ class QueryResults {
}
}
if (snippet != null) {
KeywordCachedArtifact writeResult = keywordSearchQuery.writeSingleFileHitsToBlackBoard(termString, hit, snippet, keywordList.getName());
KeywordCachedArtifact writeResult = keywordSearchQuery.writeSingleFileHitsToBlackBoard(keyword, hit, snippet, keywordList.getName());
if (writeResult != null) {
newArtifacts.add(writeResult.getArtifact());
if (notifyInbox) {

File: RegexQuery.java

@ -24,8 +24,10 @@ import com.google.common.collect.ListMultimap;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -73,13 +75,15 @@ final class RegexQuery implements KeywordSearchQuery {
private final List<KeywordQueryFilter> filters = new ArrayList<>();
private final KeywordList keywordList;
private final Keyword keyword;
private final Keyword originalKeyword; // The original regular expression keyword used to perform the search.
private String field = Server.Schema.CONTENT_STR.toString();
private final String keywordString;
static final private int MAX_RESULTS = 512;
private boolean escaped;
private String escapedQuery;
private final ListMultimap<Keyword, KeywordHit> hitsMultiMap = ArrayListMultimap.create();
// Lucene regular expressions do not support the following Java predefined
// and POSIX character classes. There are other valid Java character classes
// that are not supported by Lucene but we do not check for all of them.
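The unsupported-class list itself falls outside this hunk. Purely as an illustration of the limitation described above (this is not the module's actual handling), such Java-only classes would have to be rewritten into plain character classes before being handed to Lucene:

// Hypothetical helper, for illustration only.
static String toLuceneFriendly(String javaRegex) {
    return javaRegex
            .replace("\\d", "[0-9]")
            .replace("\\D", "[^0-9]")
            .replace("\\w", "[a-zA-Z_0-9]");
}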
@ -102,7 +106,7 @@ final class RegexQuery implements KeywordSearchQuery {
*/
RegexQuery(KeywordList keywordList, Keyword keyword) {
this.keywordList = keywordList;
this.keyword = keyword;
this.originalKeyword = keyword;
this.keywordString = keyword.getSearchTerm();
if (this.keywordString.startsWith(".*")) {
@ -147,8 +151,6 @@ final class RegexQuery implements KeywordSearchQuery {
public QueryResults performQuery() throws NoOpenCoreException {
QueryResults results = new QueryResults(this, keywordList);
ListMultimap<Keyword, KeywordHit> hitsMultMap = ArrayListMultimap.create();
final Server solrServer = KeywordSearch.getServer();
SolrQuery solrQuery = new SolrQuery();
@ -201,7 +203,7 @@ final class RegexQuery implements KeywordSearchQuery {
try {
List<KeywordHit> keywordHits = createKeywordHits(resultDoc);
for (KeywordHit hit : keywordHits) {
hitsMultMap.put(new Keyword(hit.getHit(), true), hit);
hitsMultiMap.put(new Keyword(hit.getHit(), true), hit);
}
} catch (TskException ex) {
//
@ -219,8 +221,8 @@ final class RegexQuery implements KeywordSearchQuery {
}
}
for (Keyword k : hitsMultMap.keySet()) {
results.addResult(k, hitsMultMap.get(k));
for (Keyword k : hitsMultiMap.keySet()) {
results.addResult(k, hitsMultiMap.get(k));
}
return results;
@ -234,9 +236,10 @@ final class RegexQuery implements KeywordSearchQuery {
final Collection<Object> content_str = solrDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
final Pattern pattern = Pattern.compile(keywordString);
for (Object content_obj : content_str) {
String content = (String) content_obj;
Matcher hitMatcher = Pattern.compile(keywordString).matcher(content);
Matcher hitMatcher = pattern.matcher(content);
int offset = 0;
while (hitMatcher.find(offset)) {
@ -251,17 +254,13 @@ final class RegexQuery implements KeywordSearchQuery {
String hit = hitMatcher.group();
// Back the matcher offset up by 1 character as it will have eaten
// a single space/newline/other boundary character at the end of the hit.
// This was causing us to miss hits that appeared consecutively in the
// input where they were separated by a single boundary character.
offset = hitMatcher.end() - 1;
offset = hitMatcher.end();
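For reference, this is how Matcher.find(int) behaves when the scan simply resumes at the previous match's end(), which is what the updated offset handling relies on (illustrative snippet, not module code):

Matcher m = Pattern.compile("cat").matcher("cat cat");
int off = 0;
while (m.find(off)) {
    System.out.println(m.start() + "-" + m.end()); // prints 0-3, then 4-7
    off = m.end();
}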
/*
* If searching for credit card account numbers, do a Luhn check
* on the term and discard it if it does not pass.
*/
if (keyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
if (originalKeyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit);
if (ccnMatcher.find()) {
final String ccn = CharMatcher.anyOf(" -").removeFrom(ccnMatcher.group("ccn"));
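The comment above refers to a Luhn check whose implementation is elided by this hunk. For reference, a minimal standalone version of the Luhn algorithm (illustrative; the module's own validation code is not shown here):

// Returns true if a string containing only digits passes the Luhn check.
static boolean passesLuhnCheck(String digitsOnly) {
    int sum = 0;
    boolean doubleIt = false;
    for (int i = digitsOnly.length() - 1; i >= 0; i--) {
        int digit = digitsOnly.charAt(i) - '0';
        if (doubleIt) {
            digit *= 2;
            if (digit > 9) {
                digit -= 9;
            }
        }
        sum += digit;
        doubleIt = !doubleIt;
    }
    return sum % 10 == 0;
}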
@ -279,11 +278,11 @@ final class RegexQuery implements KeywordSearchQuery {
*/
if (KeywordSearchSettings.getShowSnippets()) {
int maxIndex = content.length() - 1;
snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 20), Integer.max(0, hitMatcher.start() + 1)));
snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 20), Integer.max(0, hitMatcher.start())));
snippet.appendCodePoint(171);
snippet.append(hit);
snippet.appendCodePoint(171);
snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end() - 1), Integer.min(maxIndex, hitMatcher.end() + 20)));
snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end()), Integer.min(maxIndex, hitMatcher.end() + 20)));
}
hits.add(new KeywordHit(docId, snippet.toString(), hit));
@ -326,7 +325,7 @@ final class RegexQuery implements KeywordSearchQuery {
@Override
public String getQueryString() {
return keyword.getSearchTerm();
return originalKeyword.getSearchTerm();
}
@Override
@ -337,10 +336,32 @@ final class RegexQuery implements KeywordSearchQuery {
return escapedQuery;
}
/**
* Get a comma-separated list of unique document ids for all chunks of the
* same object that contain a hit for the given keyword.
* @param keyword The keyword object that resulted in one or more hits.
* @param hit The specific hit for which we want to identify all other chunks
* of the same object that match the keyword.
* @return A comma-separated list of unique document ids.
*/
private String getDocumentIds(Keyword keyword, KeywordHit hit) {
Set<String> documentIds = new HashSet<>();
for (KeywordHit h : hitsMultiMap.get(keyword)) {
// Add the document id only if it is for the same object as the
// given hit and we haven't already seen it.
if (h.getSolrObjectId() == hit.getSolrObjectId() && !documentIds.contains(h.getSolrDocumentId())) {
documentIds.add(h.getSolrDocumentId());
}
}
return StringUtils.join(documentIds, ",");
}
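Downstream, the value produced here can be split back apart; a small sketch (the id values are made up) of consuming the TSK_KEYWORD_HIT_DOCUMENT_IDS attribute written further down:

String docIdsValue = "123456,123456_2,123456_3"; // hypothetical attribute value
for (String solrDocumentId : docIdsValue.split(",")) {
    // Each entry identifies one Solr document (a file/artifact or one of its text chunks).
    System.out.println("hit found in Solr document " + solrDocumentId);
}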
/**
* Converts the keyword hits for a given search term into artifacts.
*
* @param searchTerm The search term.
* @param foundKeyword The keyword that was found by the regex search.
* @param hit The keyword hit.
* @param snippet The document snippet that contains the hit
* @param listName The name of the keyword list that contained the keyword
@ -354,7 +375,7 @@ final class RegexQuery implements KeywordSearchQuery {
// TODO: Are we actually making meaningful use of the KeywordCachedArtifact
// class?
@Override
public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(String searchTerm, KeywordHit hit, String snippet, String listName) {
public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(Keyword foundKeyword, KeywordHit hit, String snippet, String listName) {
/*
* Create either a "plain vanilla" keyword hit artifact with keyword and
* regex attributes, or a credit card account artifact with attributes
@ -363,9 +384,9 @@ final class RegexQuery implements KeywordSearchQuery {
*/
BlackboardArtifact newArtifact;
Collection<BlackboardAttribute> attributes = new ArrayList<>();
if (keyword.getArtifactAttributeType() != BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, searchTerm));
attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP, MODULE_NAME, keyword.getSearchTerm()));
if (originalKeyword.getArtifactAttributeType() != BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, foundKeyword.getSearchTerm()));
attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP, MODULE_NAME, getQueryString()));
try {
newArtifact = hit.getContent().newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_KEYWORD_HIT);
@ -391,9 +412,9 @@ final class RegexQuery implements KeywordSearchQuery {
final BlackboardAttribute ccnAttribute = parsedTrackAttributeMap.get(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER));
if (ccnAttribute == null || StringUtils.isBlank(ccnAttribute.getValueString())) {
if (hit.isArtifactHit()) {
LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for artifact keyword hit: term = %s, snippet = '%s', artifact id = %d", searchTerm, hit.getSnippet(), hit.getArtifact().getArtifactID())); //NON-NLS
LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for artifact keyword hit: term = %s, snippet = '%s', artifact id = %d", foundKeyword.getSearchTerm(), hit.getSnippet(), hit.getArtifact().getArtifactID())); //NON-NLS
} else {
LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for content keyword hit: term = %s, snippet = '%s', object id = %d", searchTerm, hit.getSnippet(), hit.getContent().getId())); //NON-NLS
LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for content keyword hit: term = %s, snippet = '%s', object id = %d", foundKeyword.getSearchTerm(), hit.getSnippet(), hit.getContent().getId())); //NON-NLS
}
return null;
}
@ -458,6 +479,11 @@ final class RegexQuery implements KeywordSearchQuery {
attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT, MODULE_NAME, hit.getArtifact().getArtifactID()));
}
// Add a comma-separated list of the document ids that contain a hit for this file/artifact.
attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_HIT_DOCUMENT_IDS, MODULE_NAME, getDocumentIds(foundKeyword, hit)));
attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.REGEX.ordinal()));
try {
newArtifact.addAttributes(attributes);
KeywordCachedArtifact writeResult = new KeywordCachedArtifact(newArtifact);

File: TermsComponentQuery.java

@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2016 Basis Technology Corp.
* Copyright 2011-2017 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -62,7 +62,7 @@ final class TermsComponentQuery implements KeywordSearchQuery {
private static final boolean DEBUG_FLAG = Version.Type.DEVELOPMENT.equals(Version.getBuildType());
private static final int MAX_TERMS_QUERY_RESULTS = 20000;
private final KeywordList keywordList;
private final Keyword keyword;
private final Keyword originalKeyword;
private String searchTerm;
private boolean searchTermIsEscaped;
private final List<KeywordQueryFilter> filters = new ArrayList<>(); // THIS APPEARS TO BE UNUSED
@ -135,7 +135,7 @@ final class TermsComponentQuery implements KeywordSearchQuery {
// if needed, here in the constructor?
TermsComponentQuery(KeywordList keywordList, Keyword keyword) {
this.keywordList = keywordList;
this.keyword = keyword;
this.originalKeyword = keyword;
this.searchTerm = keyword.getSearchTerm();
}
@ -158,7 +158,7 @@ final class TermsComponentQuery implements KeywordSearchQuery {
*/
@Override
public String getQueryString() {
return keyword.getSearchTerm();
return originalKeyword.getSearchTerm();
}
/**
@ -187,7 +187,7 @@ final class TermsComponentQuery implements KeywordSearchQuery {
*/
@Override
public void escape() {
searchTerm = Pattern.quote(keyword.getSearchTerm());
searchTerm = Pattern.quote(originalKeyword.getSearchTerm());
searchTermIsEscaped = true;
}
@ -286,7 +286,7 @@ final class TermsComponentQuery implements KeywordSearchQuery {
* If searching for credit card account numbers, do a Luhn check on
* the term and discard it if it does not pass.
*/
if (keyword.getArtifactAttributeType() == ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
if (originalKeyword.getArtifactAttributeType() == ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
Matcher matcher = CREDIT_CARD_NUM_PATTERN.matcher(term.getTerm());
matcher.find();
final String ccn = CharMatcher.anyOf(" -").removeFrom(matcher.group("ccn"));
@ -321,7 +321,7 @@ final class TermsComponentQuery implements KeywordSearchQuery {
/**
* Converts the keyword hits for a given search term into artifacts.
*
* @param searchTerm The search term.
* @param foundKeyword The keyword that was found by the search.
* @param hit The keyword hit.
* @param snippet The document snippet that contains the hit
* @param listName The name of the keyword list that contained the keyword
@ -335,7 +335,7 @@ final class TermsComponentQuery implements KeywordSearchQuery {
// TODO: Are we actually making meaningful use of the KeywordCachedArtifact
// class?
@Override
public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(String searchTerm, KeywordHit hit, String snippet, String listName) {
public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(Keyword foundKeyword, KeywordHit hit, String snippet, String listName) {
/*
* Create either a "plain vanilla" keyword hit artifact with keyword and
* regex attributes, or a credit card account artifact with attributes
@ -344,9 +344,9 @@ final class TermsComponentQuery implements KeywordSearchQuery {
*/
BlackboardArtifact newArtifact;
Collection<BlackboardAttribute> attributes = new ArrayList<>();
if (keyword.getArtifactAttributeType() != ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, searchTerm));
attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP, MODULE_NAME, keyword.getSearchTerm()));
if (originalKeyword.getArtifactAttributeType() != ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, foundKeyword.getSearchTerm()));
attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP, MODULE_NAME, originalKeyword.getSearchTerm()));
try {
newArtifact = hit.getContent().newArtifact(ARTIFACT_TYPE.TSK_KEYWORD_HIT);
@ -439,6 +439,9 @@ final class TermsComponentQuery implements KeywordSearchQuery {
attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT, MODULE_NAME, hit.getArtifact().getArtifactID()));
}
// TermsComponentQuery is now being used exclusively for substring searches.
attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.SUBSTRING.ordinal()));
try {
newArtifact.addAttributes(attributes);
KeywordCachedArtifact writeResult = new KeywordCachedArtifact(newArtifact);