From 3d67b0aeb0c250d83c1758d608a0c2703fd1aeb4 Mon Sep 17 00:00:00 2001 From: esaunders Date: Tue, 7 Feb 2017 16:03:06 -0500 Subject: [PATCH 1/2] Modified message displayed when we are unable to retrieve text for highlighting. --- .../src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties index 39f8de000c..0e54dd5745 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties @@ -66,7 +66,7 @@ ExtractedContentViewer.getTitle=Indexed Text ExtractedContentViewer.getSolrContent.knownFileMsg=

{0} is a known file (based on MD5 hash) and does not have text in the index.

ExtractedContentViewer.getSolrContent.noTxtYetMsg=

{0} does not have text in the index.
It may have no text, not been analyzed yet, or keyword search was not enabled during ingest.

ExtractedContentViewer.getSolrContent.txtBodyItal={0} -HighlightedMatchesSource.getMarkup.noMatchMsg=
Failed to retrieve content for keyword hit. 
The keyword could have been in the file name.
Advance to another page if present, or to view the original text, choose File Text
in the drop down menu to the right...
+HighlightedMatchesSource.getMarkup.noMatchMsg=Failed to retrieve indexed text for keyword hit. Advance to another page if present, or to view the original text, choose File Text in the drop down menu to the right. Alternatively, you may choose to extract file content and search for the hit using an external application (e.g. a text editor). HighlightedMatchesSource.getMarkup.queryFailedMsg=
Failed to retrieve keyword hit results. 
Confirm that Autopsy can connect to the Solr server.
HighlightedMatchesSource.toString=Search Results Installer.reportPortError=Indexing server port {0} is not available. Check if your security software does not block {1} and consider changing {2} in {3} property file in the application user folder. Then try rebooting your system if another process was causing the conflict. From 2147162db0e19994189d43afacd4abfb2e672863 Mon Sep 17 00:00:00 2001 From: esaunders Date: Tue, 7 Feb 2017 16:03:44 -0500 Subject: [PATCH 2/2] Removed trimming of keyword hits. --- .../autopsy/keywordsearch/RegexQuery.java | 38 +++---------------- 1 file changed, 5 insertions(+), 33 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java index aa56741ec5..c434f808be 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RegexQuery.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2016 Basis Technology Corp. + * Copyright 2011-2017 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -80,12 +80,6 @@ final class RegexQuery implements KeywordSearchQuery { private boolean escaped; private String escapedQuery; - // These are the valid characters that can appear either before or after a - // keyword hit. We use these characters to try to turn the hit into a - // token that can be more readily matched when it comes to highlighting - // against the Schema.TEXT field later. - private static final String BOUNDARY_CHARS = "[\\s\\[\\]\\(\\)\\,\\\"\\\'\\!\\?\\.\\/\\:\\;\\=\\<\\>\\^\\{\\}]"; //NON-NLS - // Lucene regular expressions do not support the following Java predefined // and POSIX character classes. There are other valid Java character classes // that are not supported by Lucene but we do not check for all of them. @@ -240,31 +234,17 @@ final class RegexQuery implements KeywordSearchQuery { final Collection content_str = solrDoc.getFieldValues(Server.Schema.CONTENT_STR.toString()); - // By default, we create keyword hits on whitespace or punctuation character boundaries. - // Having a set of well defined boundary characters produces hits that can - // subsequently be matched for highlighting against the tokens produced by - // the standard tokenizer. - // This behavior can be overridden by the user if they give us a search string - // with .* at either the start and/or end of the string. This basically tells us find - // all hits instead of the ones surrounded by one of our boundary characters. - String keywordTokenRegex - = // If the given search string starts with .*, we ignore our default - // boundary prefix characters - (queryStringContainsWildcardPrefix ? "" : "(^|" + BOUNDARY_CHARS + ")") //NON-NLS - + keywordString - // If the given search string ends with .*, we ignore our default - // boundary suffix characters - + (queryStringContainsWildcardSuffix ? "" : "($|" + BOUNDARY_CHARS + ")"); //NON-NLS - for (Object content_obj : content_str) { String content = (String) content_obj; - Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content); + Matcher hitMatcher = Pattern.compile(keywordString).matcher(content); int offset = 0; while (hitMatcher.find(offset)) { StringBuilder snippet = new StringBuilder(); - //"parent" entries in the index don't have chunk size, so just accept those hits + // If the location of the hit is beyond this chunk (i.e. it + // exists in the overlap region), we skip the hit. It will + // show up again as a hit in the chunk following this one. if (chunkSize != null && hitMatcher.start() >= chunkSize) { break; } @@ -277,14 +257,6 @@ final class RegexQuery implements KeywordSearchQuery { // input where they were separated by a single boundary character. offset = hitMatcher.end() - 1; - // Remove any remaining leading and trailing boundary characters. - if (!queryStringContainsWildcardPrefix) { - hit = hit.replaceAll("^" + BOUNDARY_CHARS, ""); //NON-NLS - } - if (!queryStringContainsWildcardSuffix) { - hit = hit.replaceAll(BOUNDARY_CHARS + "$", ""); //NON-NLS - } - /* * If searching for credit card account numbers, do a Luhn check * on the term and discard it if it does not pass.