From 3f9b161a71fbb7c7d9504ba2b03d9503953da03c Mon Sep 17 00:00:00 2001 From: millmanorama Date: Wed, 1 Feb 2017 16:48:52 +0100 Subject: [PATCH 1/3] look at the values of content_str field separately: title/content --- .../autopsy/keywordsearch/LuceneQuery.java | 16 +- .../autopsy/keywordsearch/RegexQuery.java | 139 +++++++++--------- 2 files changed, 79 insertions(+), 76 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java index 9e99394a34..6a90a688dc 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java @@ -23,7 +23,6 @@ import java.util.Collection; import java.util.Comparator; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.logging.Level; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.math.NumberUtils; @@ -223,18 +222,21 @@ class LuceneQuery implements KeywordSearchQuery { * will get picked up in the next one. */ final String docId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString(); final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString()); - String content_str = Objects.toString(resultDoc.get(Server.Schema.CONTENT_STR.toString()), null); + final ArrayList get = (ArrayList) resultDoc.get(Server.Schema.CONTENT_STR.toString()); double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion()); if (indexSchemaVersion < 2.0) { //old schema versions don't support chunk_size or the content_str fields, so just accept hits matches.add(createKeywordtHit(highlightResponse, docId)); } else { - //for new schemas, check that the hit is before the chunk/window boundary. 
- int firstOccurence = StringUtils.indexOf(content_str, strippedQueryString); - //there is no chunksize field for "parent" entries in the index - if (chunkSize != null && firstOccurence < chunkSize) { - matches.add(createKeywordtHit(highlightResponse, docId)); + //check against file name and actual content seperately. + for (String content_str : get) { + //for new schemas, check that the hit is before the chunk/window boundary. + int firstOccurence = StringUtils.indexOf(content_str, strippedQueryString); + //there is no chunksize field for "parent" entries in the index + if (chunkSize != null && firstOccurence > -1 && firstOccurence < chunkSize) { + matches.add(createKeywordtHit(highlightResponse, docId)); + } } } } catch (TskException ex) { diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java index 2a8f828a40..f188890591 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java @@ -54,16 +54,16 @@ import org.sleuthkit.datamodel.TskData; import org.sleuthkit.datamodel.TskException; /** - * The RegexQuery class supports issuing regular expression queries - * against a Lucene index. It relies on the fact that content is - * stored in it's original form in a "string" field (Server.Schema.CONTENT_STR). - * To indicate to Lucene that these are regular expression queries, the query - * string must be surrounded by '/' characters. Additionally, the characters - * ".*" need to be added both before and after the search term to get hits - * in the middle of text. + * The RegexQuery class supports issuing regular expression queries against a + * Lucene index. It relies on the fact that content is stored in it's original + * form in a "string" field (Server.Schema.CONTENT_STR). 
To indicate to Lucene + * that these are regular expression queries, the query string must be + * surrounded by '/' characters. Additionally, the characters ".*" need to be + * added both before and after the search term to get hits in the middle of + * text. * - * Regular expression syntax supported by Lucene is not the same as Java - * regular expression syntax. The Lucene syntax is documented here: + * Regular expression syntax supported by Lucene is not the same as Java regular + * expression syntax. The Lucene syntax is documented here: * * https://lucene.apache.org/core/5_0_0/core/org/apache/lucene/util/automaton/RegExp.html */ @@ -94,7 +94,7 @@ final class RegexQuery implements KeywordSearchQuery { // See https://lucene.apache.org/core/6_4_0/core/org/apache/lucene/util/automaton/RegExp.html // for Lucene syntax. // We use \p as a shortcut for all of the character classes of the form \p{XXX}. - private static final CharSequence[] UNSUPPORTED_CHARS = {"\\d", "\\D", "\\w", "\\W", "\\s", "\\S", "\\n", + private static final CharSequence[] UNSUPPORTED_CHARS = {"\\d", "\\D", "\\w", "\\W", "\\s", "\\S", "\\n", "\\t", "\\r", "\\f", "\\a", "\\e", "\\v", "\\V", "\\h", "\\H", "\\p"}; //NON-NLS private boolean queryStringContainsWildcardPrefix = false; @@ -160,19 +160,18 @@ final class RegexQuery implements KeywordSearchQuery { /** * The provided regular expression may include wildcards at the - * beginning and/or end. These wildcards are used to indicate that - * the user wants to find hits for the regex that are embedded - * within other characters. For example, if we are given .*127.0.0.1.* - * as a regular expression, this will produce hits for: - * (a) " 127.0.0.1 " as a standalone token (surrounded by whitespace). - * (b) "abc127.0.0.1def" where the IP address is surrounded by other characters. + * beginning and/or end. These wildcards are used to indicate that the + * user wants to find hits for the regex that are embedded within other + * characters. 
For example, if we are given .*127.0.0.1.* as a regular + * expression, this will produce hits for: (a) " 127.0.0.1 " as a + * standalone token (surrounded by whitespace). (b) "abc127.0.0.1def" + * where the IP address is surrounded by other characters. * * If we are given this type of regex, we do not need to add our own * wildcards to anchor the query. Otherwise, we need to add wildcard * anchors because Lucene string regex searches default to using ^ and $ * to match the entire string. */ - // We construct the query by surrounding it with slashes (to indicate it is // a regular expression search) and .* as anchors (if the query doesn't // already have them). @@ -193,7 +192,7 @@ final class RegexQuery implements KeywordSearchQuery { solrQuery.setRows(MAX_RESULTS); // Setting the sort order is necessary for cursor based paging to work. solrQuery.setSort(SortClause.asc(Server.Schema.ID.toString())); - + String cursorMark = CursorMarkParams.CURSOR_MARK_START; SolrDocumentList resultList = null; boolean allResultsProcessed = false; @@ -214,7 +213,7 @@ final class RegexQuery implements KeywordSearchQuery { // } } - + String nextCursorMark = response.getNextCursorMark(); if (cursorMark.equals(nextCursorMark)) { allResultsProcessed = true; @@ -225,7 +224,7 @@ final class RegexQuery implements KeywordSearchQuery { MessageNotifyUtil.Notify.error(NbBundle.getMessage(Server.class, "Server.query.exception.msg", keywordString), ex.getCause().getMessage()); } } - + for (Keyword k : hitsMultMap.keySet()) { results.addResult(k, hitsMultMap.get(k)); } @@ -239,7 +238,7 @@ final class RegexQuery implements KeywordSearchQuery { final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString(); final Integer chunkSize = (Integer) solrDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString()); - String content = solrDoc.getOrDefault(Server.Schema.CONTENT_STR.toString(), "").toString(); //NON-NLS + ArrayList content_str = (ArrayList) 
solrDoc.get(Server.Schema.CONTENT_STR.toString()); // By default, we create keyword hits on whitespace or punctuation character boundaries. // Having a set of well defined boundary characters produces hits that can @@ -248,8 +247,8 @@ final class RegexQuery implements KeywordSearchQuery { // This behavior can be overridden by the user if they give us a search string // with .* at either the start and/or end of the string. This basically tells us find // all hits instead of the ones surrounded by one of our boundary characters. - String keywordTokenRegex = - // If the given search string starts with .*, we ignore our default + String keywordTokenRegex + = // If the given search string starts with .*, we ignore our default // boundary prefix characters (queryStringContainsWildcardPrefix ? "" : BOUNDARY_CHARS) //NON-NLS + keywordString @@ -257,63 +256,65 @@ final class RegexQuery implements KeywordSearchQuery { // boundary suffix characters + (queryStringContainsWildcardSuffix ? "" : BOUNDARY_CHARS); //NON-NLS - Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content); - int offset = 0; + for (String content : content_str) { + Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content); + int offset = 0; - while (hitMatcher.find(offset)) { - StringBuilder snippet = new StringBuilder(); + while (hitMatcher.find(offset)) { + StringBuilder snippet = new StringBuilder(); - //"parent" entries in the index don't have chunk size, so just accept those hits - if (chunkSize != null && hitMatcher.start() >= chunkSize) { - break; - } + //"parent" entries in the index don't have chunk size, so just accept those hits + if (chunkSize != null && hitMatcher.start() >= chunkSize) { + break; + } - String hit = hitMatcher.group(); + String hit = hitMatcher.group(); - // Back the matcher offset up by 1 character as it will have eaten - // a single space/newline/other boundary character at the end of the hit. 
- // This was causing us to miss hits that appeared consecutively in the - // input where they were separated by a single boundary character. - offset = hitMatcher.end() - 1; + // Back the matcher offset up by 1 character as it will have eaten + // a single space/newline/other boundary character at the end of the hit. + // This was causing us to miss hits that appeared consecutively in the + // input where they were separated by a single boundary character. + offset = hitMatcher.end() - 1; - // Remove any remaining leading and trailing boundary characters. - if (!queryStringContainsWildcardPrefix) { - hit = hit.replaceAll("^" + BOUNDARY_CHARS, ""); //NON-NLS - } - if (!queryStringContainsWildcardSuffix) { - hit = hit.replaceAll(BOUNDARY_CHARS + "$", ""); //NON-NLS - } + // Remove any remaining leading and trailing boundary characters. + if (!queryStringContainsWildcardPrefix) { + hit = hit.replaceAll("^" + BOUNDARY_CHARS, ""); //NON-NLS + } + if (!queryStringContainsWildcardSuffix) { + hit = hit.replaceAll(BOUNDARY_CHARS + "$", ""); //NON-NLS + } - /* - * If searching for credit card account numbers, do a Luhn check - * on the term and discard it if it does not pass. - */ - if (keyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) { - Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit); - if (ccnMatcher.find()) { - final String ccn = CharMatcher.anyOf(" -").removeFrom(ccnMatcher.group("ccn")); - if (false == TermsComponentQuery.CREDIT_CARD_NUM_LUHN_CHECK.isValid(ccn)) { + /* + * If searching for credit card account numbers, do a Luhn check + * on the term and discard it if it does not pass. 
+ */ + if (keyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) { + Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit); + if (ccnMatcher.find()) { + final String ccn = CharMatcher.anyOf(" -").removeFrom(ccnMatcher.group("ccn")); + if (false == TermsComponentQuery.CREDIT_CARD_NUM_LUHN_CHECK.isValid(ccn)) { + continue; + } + } else { continue; } - } else { - continue; } - } - /** - * Get the snippet from the document if keyword search is configured - * to use snippets. - */ - if (KeywordSearchSettings.getShowSnippets()) { - int maxIndex = content.length() - 1; - snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 20), Integer.max(0, hitMatcher.start() + 1))); - snippet.appendCodePoint(171); - snippet.append(hit); - snippet.appendCodePoint(171); - snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end() - 1), Integer.min(maxIndex, hitMatcher.end() + 20))); - } + /** + * Get the snippet from the document if keyword search is + * configured to use snippets. + */ + if (KeywordSearchSettings.getShowSnippets()) { + int maxIndex = content.length() - 1; + snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 20), Integer.max(0, hitMatcher.start() + 1))); + snippet.appendCodePoint(171); + snippet.append(hit); + snippet.appendCodePoint(171); + snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end() - 1), Integer.min(maxIndex, hitMatcher.end() + 20))); + } - hits.add(new KeywordHit(docId, snippet.toString(), hit)); + hits.add(new KeywordHit(docId, snippet.toString(), hit)); + } } return hits; } From 7dea03bf599b18f2e45efe8adc77aa7f89e16a83 Mon Sep 17 00:00:00 2001 From: millmanorama Date: Thu, 2 Feb 2017 15:12:40 +0100 Subject: [PATCH 2/3] allow regex queries to match at start and end of content. 
fix separation of file name and content --- .../org/sleuthkit/autopsy/keywordsearch/RegexQuery.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java index f188890591..aa56741ec5 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java @@ -238,7 +238,7 @@ final class RegexQuery implements KeywordSearchQuery { final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString(); final Integer chunkSize = (Integer) solrDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString()); - ArrayList content_str = (ArrayList) solrDoc.get(Server.Schema.CONTENT_STR.toString()); + final Collection content_str = solrDoc.getFieldValues(Server.Schema.CONTENT_STR.toString()); // By default, we create keyword hits on whitespace or punctuation character boundaries. // Having a set of well defined boundary characters produces hits that can @@ -250,13 +250,14 @@ final class RegexQuery implements KeywordSearchQuery { String keywordTokenRegex = // If the given search string starts with .*, we ignore our default // boundary prefix characters - (queryStringContainsWildcardPrefix ? "" : BOUNDARY_CHARS) //NON-NLS + (queryStringContainsWildcardPrefix ? "" : "(^|" + BOUNDARY_CHARS + ")") //NON-NLS + keywordString // If the given search string ends with .*, we ignore our default // boundary suffix characters - + (queryStringContainsWildcardSuffix ? "" : BOUNDARY_CHARS); //NON-NLS + + (queryStringContainsWildcardSuffix ? 
"" : "($|" + BOUNDARY_CHARS + ")"); //NON-NLS - for (String content : content_str) { + for (Object content_obj : content_str) { + String content = (String) content_obj; Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content); int offset = 0; From 7b60e10235f9c47f758517b8cc9e5ccc9386dc0d Mon Sep 17 00:00:00 2001 From: millmanorama Date: Thu, 2 Feb 2017 15:13:31 +0100 Subject: [PATCH 3/3] look at file name even if the indexed text is 0 bytes. fix separation of file name and content. --- .../org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java index 6a90a688dc..b51f269f2e 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java @@ -222,7 +222,7 @@ class LuceneQuery implements KeywordSearchQuery { * will get picked up in the next one. */ final String docId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString(); final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString()); - final ArrayList get = (ArrayList) resultDoc.get(Server.Schema.CONTENT_STR.toString()); + final Collection content = resultDoc.getFieldValues(Server.Schema.CONTENT_STR.toString()); double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion()); if (indexSchemaVersion < 2.0) { @@ -230,11 +230,12 @@ class LuceneQuery implements KeywordSearchQuery { matches.add(createKeywordtHit(highlightResponse, docId)); } else { //check against file name and actual content seperately. - for (Object content_obj : content) { + String content_str = (String) content_obj; //for new schemas, check that the hit is before the chunk/window boundary. 
- int firstOccurence = StringUtils.indexOf(content_str, strippedQueryString); + int firstOccurence = StringUtils.indexOf(content_str.toLowerCase(), strippedQueryString.toLowerCase()); //there is no chunksize field for "parent" entries in the index - if (chunkSize != null && firstOccurence > -1 && firstOccurence < chunkSize) { + if (chunkSize == null || chunkSize == 0 || (firstOccurence > -1 && firstOccurence < chunkSize)) { matches.add(createKeywordtHit(highlightResponse, docId)); } }