diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearch.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearch.java index 5d5c0847bb..65fc900569 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearch.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearch.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2016 Basis Technology Corp. + * Copyright 2011-2017 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -44,7 +44,7 @@ public class KeywordSearch { public enum QueryType { - LITERAL, REGEX + LITERAL, SUBSTRING, REGEX }; public static final String NUM_FILES_CHANGE_EVT = "NUM_FILES_CHANGE_EVT"; //NON-NLS private static PropertyChangeSupport changeSupport = new PropertyChangeSupport(KeywordSearch.class); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchQuery.java index 34e530956a..cafd45b3e8 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchQuery.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchQuery.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2016 Basis Technology Corp. + * Copyright 2011-2017 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -97,6 +97,6 @@ interface KeywordSearchQuery { */ String getEscapedQueryString(); - KeywordCachedArtifact writeSingleFileHitsToBlackBoard(String termHit, KeywordHit hit, String snippet, String listName); + KeywordCachedArtifact writeSingleFileHitsToBlackBoard(Keyword keyword, KeywordHit hit, String snippet, String listName); } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java index b51f269f2e..4eb6773ad7 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2016 Basis Technology Corp. + * Copyright 2011-2017 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -50,7 +50,7 @@ class LuceneQuery implements KeywordSearchQuery { private final String keywordString; //original unescaped query private String keywordStringEscaped; private boolean isEscaped; - private Keyword keyword = null; + private Keyword originalKeyword = null; private KeywordList keywordList = null; private final List filters = new ArrayList<>(); private String field = null; @@ -67,7 +67,7 @@ class LuceneQuery implements KeywordSearchQuery { */ public LuceneQuery(KeywordList keywordList, Keyword keyword) { this.keywordList = keywordList; - this.keyword = keyword; + this.originalKeyword = keyword; // @@@ BC: Long-term, we should try to get rid of this string and use only the // keyword object. Refactoring did not make its way through this yet. @@ -134,7 +134,7 @@ class LuceneQuery implements KeywordSearchQuery { } @Override - public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(String termHit, KeywordHit hit, String snippet, String listName) { + public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(Keyword foundKeyword, KeywordHit hit, String snippet, String listName) { final String MODULE_NAME = KeywordSearchModuleFactory.getModuleName(); Collection attributes = new ArrayList<>(); @@ -151,7 +151,7 @@ class LuceneQuery implements KeywordSearchQuery { if (snippet != null) { attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_PREVIEW, MODULE_NAME, snippet)); } - attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, termHit)); + attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, foundKeyword.getSearchTerm())); if ((listName != null) && (listName.equals("") == false)) { attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_SET_NAME, MODULE_NAME, listName)); } @@ -159,10 +159,16 @@ class LuceneQuery implements KeywordSearchQuery { //bogus - workaround the dir tree table issue //attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP.getTypeID(), MODULE_NAME, "", "")); //selector - if (keyword != null) { - BlackboardAttribute.ATTRIBUTE_TYPE selType = keyword.getArtifactAttributeType(); + if (originalKeyword != null) { + BlackboardAttribute.ATTRIBUTE_TYPE selType = originalKeyword.getArtifactAttributeType(); if (selType != null) { - attributes.add(new BlackboardAttribute(selType, MODULE_NAME, termHit)); + attributes.add(new BlackboardAttribute(selType, MODULE_NAME, foundKeyword.getSearchTerm())); + } + + if (originalKeyword.searchTermIsWholeWord()) { + attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.LITERAL.ordinal())); + } else { + attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.SUBSTRING.ordinal())); } } @@ -259,7 +265,7 @@ class LuceneQuery implements KeywordSearchQuery { SolrQuery q = new SolrQuery(); q.setShowDebugInfo(DEBUG); //debug // Wrap the query string in quotes if this is a literal search term. - String theQueryStr = keyword.searchTermIsLiteral() + String theQueryStr = originalKeyword.searchTermIsLiteral() ? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped; // Run the query against an optional alternative field. diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/QueryResults.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/QueryResults.java index 5e7c369384..5b4654b15d 100755 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/QueryResults.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/QueryResults.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2014-2015 Basis Technology Corp. + * Copyright 2014-2017 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -149,7 +149,7 @@ class QueryResults { } } if (snippet != null) { - KeywordCachedArtifact writeResult = keywordSearchQuery.writeSingleFileHitsToBlackBoard(termString, hit, snippet, keywordList.getName()); + KeywordCachedArtifact writeResult = keywordSearchQuery.writeSingleFileHitsToBlackBoard(keyword, hit, snippet, keywordList.getName()); if (writeResult != null) { newArtifacts.add(writeResult.getArtifact()); if (notifyInbox) { diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java index c434f808be..fa4841f395 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java @@ -24,8 +24,10 @@ import com.google.common.collect.ListMultimap; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.logging.Level; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -73,13 +75,15 @@ final class RegexQuery implements KeywordSearchQuery { private final List filters = new ArrayList<>(); private final KeywordList keywordList; - private final Keyword keyword; + private final Keyword originalKeyword; // The regular expression originalKeyword used to perform the search. private String field = Server.Schema.CONTENT_STR.toString(); private final String keywordString; static final private int MAX_RESULTS = 512; private boolean escaped; private String escapedQuery; + private final ListMultimap hitsMultiMap = ArrayListMultimap.create(); + // Lucene regular expressions do not support the following Java predefined // and POSIX character classes. There are other valid Java character classes // that are not supported by Lucene but we do not check for all of them. @@ -102,7 +106,7 @@ final class RegexQuery implements KeywordSearchQuery { */ RegexQuery(KeywordList keywordList, Keyword keyword) { this.keywordList = keywordList; - this.keyword = keyword; + this.originalKeyword = keyword; this.keywordString = keyword.getSearchTerm(); if (this.keywordString.startsWith(".*")) { @@ -147,8 +151,6 @@ final class RegexQuery implements KeywordSearchQuery { public QueryResults performQuery() throws NoOpenCoreException { QueryResults results = new QueryResults(this, keywordList); - ListMultimap hitsMultMap = ArrayListMultimap.create(); - final Server solrServer = KeywordSearch.getServer(); SolrQuery solrQuery = new SolrQuery(); @@ -201,7 +203,7 @@ final class RegexQuery implements KeywordSearchQuery { try { List keywordHits = createKeywordHits(resultDoc); for (KeywordHit hit : keywordHits) { - hitsMultMap.put(new Keyword(hit.getHit(), true), hit); + hitsMultiMap.put(new Keyword(hit.getHit(), true), hit); } } catch (TskException ex) { // @@ -219,8 +221,8 @@ final class RegexQuery implements KeywordSearchQuery { } } - for (Keyword k : hitsMultMap.keySet()) { - results.addResult(k, hitsMultMap.get(k)); + for (Keyword k : hitsMultiMap.keySet()) { + results.addResult(k, hitsMultiMap.get(k)); } return results; @@ -234,9 +236,10 @@ final class RegexQuery implements KeywordSearchQuery { final Collection content_str = solrDoc.getFieldValues(Server.Schema.CONTENT_STR.toString()); + final Pattern pattern = Pattern.compile(keywordString); for (Object content_obj : content_str) { String content = (String) content_obj; - Matcher hitMatcher = Pattern.compile(keywordString).matcher(content); + Matcher hitMatcher = pattern.matcher(content); int offset = 0; while (hitMatcher.find(offset)) { @@ -251,17 +254,13 @@ final class RegexQuery implements KeywordSearchQuery { String hit = hitMatcher.group(); - // Back the matcher offset up by 1 character as it will have eaten - // a single space/newline/other boundary character at the end of the hit. - // This was causing us to miss hits that appeared consecutively in the - // input where they were separated by a single boundary character. - offset = hitMatcher.end() - 1; + offset = hitMatcher.end(); /* * If searching for credit card account numbers, do a Luhn check * on the term and discard it if it does not pass. */ - if (keyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) { + if (originalKeyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) { Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit); if (ccnMatcher.find()) { final String ccn = CharMatcher.anyOf(" -").removeFrom(ccnMatcher.group("ccn")); @@ -279,11 +278,11 @@ final class RegexQuery implements KeywordSearchQuery { */ if (KeywordSearchSettings.getShowSnippets()) { int maxIndex = content.length() - 1; - snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 20), Integer.max(0, hitMatcher.start() + 1))); + snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 20), Integer.max(0, hitMatcher.start()))); snippet.appendCodePoint(171); snippet.append(hit); snippet.appendCodePoint(171); - snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end() - 1), Integer.min(maxIndex, hitMatcher.end() + 20))); + snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end()), Integer.min(maxIndex, hitMatcher.end() + 20))); } hits.add(new KeywordHit(docId, snippet.toString(), hit)); @@ -326,7 +325,7 @@ final class RegexQuery implements KeywordSearchQuery { @Override public String getQueryString() { - return keyword.getSearchTerm(); + return originalKeyword.getSearchTerm(); } @Override @@ -337,10 +336,32 @@ final class RegexQuery implements KeywordSearchQuery { return escapedQuery; } + /** + * Get a unique, comma separated list of document ids that match the given hit + * for the same object. + * @param keyword The keyword object that resulted in one or more hits. + * @param hit The specific hit for which we want to identify all other chunks + * that match the keyword + * @return A comma separated list of unique document ids. + */ + private String getDocumentIds(Keyword keyword, KeywordHit hit) { + Set documentIds = new HashSet<>(); + + for (KeywordHit h : hitsMultiMap.get(keyword)) { + // Add the document id only if it is for the same object as the + // given hit and we haven't already seen it. + if (h.getSolrObjectId() == hit.getSolrObjectId() && !documentIds.contains(h.getSolrDocumentId())) { + documentIds.add(h.getSolrDocumentId()); + } + } + + return StringUtils.join(documentIds, ","); + } + /** * Converts the keyword hits for a given search term into artifacts. * - * @param searchTerm The search term. + * @param foundKeyword The keyword that was found by the regex search. * @param hit The keyword hit. * @param snippet The document snippet that contains the hit * @param listName The name of the keyword list that contained the keyword @@ -354,7 +375,7 @@ final class RegexQuery implements KeywordSearchQuery { // TODO: Are we actually making meaningful use of the KeywordCachedArtifact // class? @Override - public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(String searchTerm, KeywordHit hit, String snippet, String listName) { + public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(Keyword foundKeyword, KeywordHit hit, String snippet, String listName) { /* * Create either a "plain vanilla" keyword hit artifact with keyword and * regex attributes, or a credit card account artifact with attributes @@ -363,9 +384,9 @@ final class RegexQuery implements KeywordSearchQuery { */ BlackboardArtifact newArtifact; Collection attributes = new ArrayList<>(); - if (keyword.getArtifactAttributeType() != BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) { - attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, searchTerm)); - attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP, MODULE_NAME, keyword.getSearchTerm())); + if (originalKeyword.getArtifactAttributeType() != BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) { + attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, foundKeyword.getSearchTerm())); + attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP, MODULE_NAME, getQueryString())); try { newArtifact = hit.getContent().newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_KEYWORD_HIT); @@ -391,9 +412,9 @@ final class RegexQuery implements KeywordSearchQuery { final BlackboardAttribute ccnAttribute = parsedTrackAttributeMap.get(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER)); if (ccnAttribute == null || StringUtils.isBlank(ccnAttribute.getValueString())) { if (hit.isArtifactHit()) { - LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for artifact keyword hit: term = %s, snippet = '%s', artifact id = %d", searchTerm, hit.getSnippet(), hit.getArtifact().getArtifactID())); //NON-NLS + LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for artifact keyword hit: term = %s, snippet = '%s', artifact id = %d", foundKeyword.getSearchTerm(), hit.getSnippet(), hit.getArtifact().getArtifactID())); //NON-NLS } else { - LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for content keyword hit: term = %s, snippet = '%s', object id = %d", searchTerm, hit.getSnippet(), hit.getContent().getId())); //NON-NLS + LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for content keyword hit: term = %s, snippet = '%s', object id = %d", foundKeyword.getSearchTerm(), hit.getSnippet(), hit.getContent().getId())); //NON-NLS } return null; } @@ -458,6 +479,11 @@ final class RegexQuery implements KeywordSearchQuery { attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT, MODULE_NAME, hit.getArtifact().getArtifactID())); } + // Add comma separated list of document ids that had a hit for this file/artifact. + attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_HIT_DOCUMENT_IDS, MODULE_NAME, getDocumentIds(foundKeyword, hit))); + + attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.REGEX.ordinal())); + try { newArtifact.addAttributes(attributes); KeywordCachedArtifact writeResult = new KeywordCachedArtifact(newArtifact); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TermsComponentQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TermsComponentQuery.java index ca9b750db0..44cc8ab5e9 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TermsComponentQuery.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TermsComponentQuery.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2016 Basis Technology Corp. + * Copyright 2011-2017 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -62,7 +62,7 @@ final class TermsComponentQuery implements KeywordSearchQuery { private static final boolean DEBUG_FLAG = Version.Type.DEVELOPMENT.equals(Version.getBuildType()); private static final int MAX_TERMS_QUERY_RESULTS = 20000; private final KeywordList keywordList; - private final Keyword keyword; + private final Keyword originalKeyword; private String searchTerm; private boolean searchTermIsEscaped; private final List filters = new ArrayList<>(); // THIS APPEARS TO BE UNUSED @@ -135,7 +135,7 @@ final class TermsComponentQuery implements KeywordSearchQuery { // if needed, here in the constructor? TermsComponentQuery(KeywordList keywordList, Keyword keyword) { this.keywordList = keywordList; - this.keyword = keyword; + this.originalKeyword = keyword; this.searchTerm = keyword.getSearchTerm(); } @@ -158,7 +158,7 @@ final class TermsComponentQuery implements KeywordSearchQuery { */ @Override public String getQueryString() { - return keyword.getSearchTerm(); + return originalKeyword.getSearchTerm(); } /** @@ -187,7 +187,7 @@ final class TermsComponentQuery implements KeywordSearchQuery { */ @Override public void escape() { - searchTerm = Pattern.quote(keyword.getSearchTerm()); + searchTerm = Pattern.quote(originalKeyword.getSearchTerm()); searchTermIsEscaped = true; } @@ -286,7 +286,7 @@ final class TermsComponentQuery implements KeywordSearchQuery { * If searching for credit card account numbers, do a Luhn check on * the term and discard it if it does not pass. */ - if (keyword.getArtifactAttributeType() == ATTRIBUTE_TYPE.TSK_CARD_NUMBER) { + if (originalKeyword.getArtifactAttributeType() == ATTRIBUTE_TYPE.TSK_CARD_NUMBER) { Matcher matcher = CREDIT_CARD_NUM_PATTERN.matcher(term.getTerm()); matcher.find(); final String ccn = CharMatcher.anyOf(" -").removeFrom(matcher.group("ccn")); @@ -321,7 +321,7 @@ final class TermsComponentQuery implements KeywordSearchQuery { /** * Converts the keyword hits for a given search term into artifacts. * - * @param searchTerm The search term. + * @param foundKeyword The keyword that was found by the search. * @param hit The keyword hit. * @param snippet The document snippet that contains the hit * @param listName The name of the keyword list that contained the keyword @@ -335,7 +335,7 @@ final class TermsComponentQuery implements KeywordSearchQuery { // TODO: Are we actually making meaningful use of the KeywordCachedArtifact // class? @Override - public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(String searchTerm, KeywordHit hit, String snippet, String listName) { + public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(Keyword foundKeyword, KeywordHit hit, String snippet, String listName) { /* * Create either a "plain vanilla" keyword hit artifact with keyword and * regex attributes, or a credit card account artifact with attributes @@ -344,9 +344,9 @@ final class TermsComponentQuery implements KeywordSearchQuery { */ BlackboardArtifact newArtifact; Collection attributes = new ArrayList<>(); - if (keyword.getArtifactAttributeType() != ATTRIBUTE_TYPE.TSK_CARD_NUMBER) { - attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, searchTerm)); - attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP, MODULE_NAME, keyword.getSearchTerm())); + if (originalKeyword.getArtifactAttributeType() != ATTRIBUTE_TYPE.TSK_CARD_NUMBER) { + attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, foundKeyword.getSearchTerm())); + attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP, MODULE_NAME, originalKeyword.getSearchTerm())); try { newArtifact = hit.getContent().newArtifact(ARTIFACT_TYPE.TSK_KEYWORD_HIT); @@ -439,6 +439,9 @@ final class TermsComponentQuery implements KeywordSearchQuery { attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT, MODULE_NAME, hit.getArtifact().getArtifactID())); } + // TermsComponentQuery is now being used exclusively for substring searches. + attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.SUBSTRING.ordinal())); + try { newArtifact.addAttributes(attributes); KeywordCachedArtifact writeResult = new KeywordCachedArtifact(newArtifact);