Modified creation of regex keyword hits to break on a whitespace or punctuation boundary to support consistent highlighting. Also added HighlightedText.attemptManualHighlighting() for those situations where the Lucene highlighter doesn't give us useful results.

This commit is contained in:
esaunders 2016-12-27 17:00:00 -05:00
parent 4b80395b9d
commit 0e925e6823
5 changed files with 208 additions and 76 deletions

View File

@ -61,7 +61,7 @@ class AccountsText implements IndexedText {
private static final String INSERT_POSTFIX = "'></a>$0"; //$0 will insert current regex match //NON-NLS
private static final Pattern ANCHOR_DETECTION_PATTERN = Pattern.compile(HIGHLIGHT_PRE);
private static final String HIGHLIGHT_FIELD = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
private static final String HIGHLIGHT_FIELD = LuceneQuery.HIGHLIGHT_FIELD;
private final Server solrServer;
private final String solrDocumentId;

View File

@ -25,12 +25,15 @@ import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.logging.Level;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest.METHOD;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocumentList;
import org.openide.util.NbBundle.Messages;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.coreutils.Version;
@ -82,9 +85,23 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
}
//when the results are not known and need to requery to get hits
HighlightedText(long objectId, String solrQuery, boolean isRegex, String originalQuery) {
this(objectId, KeywordSearchUtil.quoteQuery(solrQuery), isRegex);
/**
* This constructor is used when keyword hits are accessed from the
* "Keyword Hits" node in the directory tree in Autopsy.
* In that case we only have the keyword for which a hit had
previously been found, so we will need to re-query to find hits
* for the keyword.
*
* @param objectId
* @param keyword The keyword that was found previously (e.g. during ingest)
* @param isRegex true if the keyword was found via a regular expression search
* @param originalQuery The original query string that produced the hit. If
* isRegex is true, this will be the regular expression that produced the hit.
*/
HighlightedText(long objectId, String keyword, boolean isRegex, String originalQuery) {
// The keyword can be treated as a literal hit at this point so we
// surround it in quotes.
this(objectId, KeywordSearchUtil.quoteQuery(keyword), isRegex);
this.originalQuery = originalQuery;
}
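For context, a hedged usage sketch (hypothetical values, not part of this commit): the literal hit recorded during ingest is passed as the keyword, and the regex that produced it travels along as the original query.

long objectId = 42L;                          // hypothetical object id
String hit = "10.0.0.1";                      // literal hit recorded during ingest
String regex = "\\d{1,3}(\\.\\d{1,3}){3}";    // regex that produced the hit during ingest
// The constructor quotes the literal hit itself; the regex is kept only for reference.
HighlightedText highlights = new HighlightedText(objectId, hit, true, regex);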
@ -123,25 +140,15 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
hasChunks = true;
}
//if has chunks, get pages with hits
// if the file has chunks, get pages with hits, sorted
if (hasChunks) {
//extract pages of interest, sorted
/*
* If this is being called from the artifacts / dir tree, then we
* need to perform the search to get the highlights.
*/
if (hits == null) {
// I don't understand how we could get into this code with a regex query.
// Won't all regex queries have been resolved to actual literal keyword hits
// by the time we attempt to load page content? EGS.
// String queryStr = KeywordSearchUtil.escapeLuceneQuery(this.keywordHitQuery);
// if (isRegex) {
// //use white-space sep. field to get exact matches only of regex query result
// queryStr = Server.Schema.CONTENT_WS + ":" + "\"" + queryStr + "\"";
// }
Keyword keywordQuery = new Keyword(keywordHitQuery, !isRegex);
Keyword keywordQuery = new Keyword(this.keywordHitQuery, !isRegex);
List<Keyword> keywords = new ArrayList<>();
keywords.add(keywordQuery);
KeywordSearchQuery chunksQuery = new LuceneQuery(new KeywordList(keywords), keywordQuery);
@ -303,14 +310,6 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
public String getText() {
loadPageInfo(); //inits once
String highLightField = null;
if (isRegex) {
highLightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
} else {
highLightField = LuceneQuery.HIGHLIGHT_FIELD_LITERAL;
}
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
@ -324,10 +323,8 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
q.addFilterQuery(filterQuery);
q.addHighlightField(highLightField); //for exact highlighting, try content_ws field (with stored="true" in Solr schema)
q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD);
//q.setHighlightSimplePre(HIGHLIGHT_PRE); //original highlighter only
//q.setHighlightSimplePost(HIGHLIGHT_POST); //original highlighter only
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
//tune the highlighter
@ -341,22 +338,33 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
try {
QueryResponse response = solrServer.query(q, METHOD.POST);
// There should never be more than one document since there will
// either be a single chunk containing hits or we narrow our
// query down to the current page/chunk.
if (response.getResults().size() > 1) {
logger.log(Level.WARNING, "Unexpected number of results for Solr highlighting query: {0}", keywordHitQuery); //NON-NLS
}
Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);
if (responseHighlightID == null) {
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
}
List<String> contentHighlights = responseHighlightID.get(highLightField);
if (contentHighlights == null) {
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
} else {
// extracted content (minus highlight tags) is HTML-escaped
String highlightedContent = contentHighlights.get(0).trim();
highlightedContent = insertAnchors(highlightedContent);
String highlightedContent;
return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
if (responseHighlightID == null) {
highlightedContent = attemptManualHighlighting(response.getResults());
} else {
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
if (contentHighlights == null) {
highlightedContent = attemptManualHighlighting(response.getResults());
} else {
// extracted content (minus highlight tags) is HTML-escaped
highlightedContent = contentHighlights.get(0).trim();
}
}
highlightedContent = insertAnchors(highlightedContent);
return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
} catch (Exception ex) {
logger.log(Level.WARNING, "Error executing Solr highlighting query: " + keywordHitQuery, ex); //NON-NLS
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.queryFailedMsg");
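For reference, the SolrJ highlighting response navigated above is a map keyed first by document id and then by field name; a hedged fragment, not part of this commit, assuming the surrounding variables from getText():

// Map<documentId, Map<fieldName, List<highlightedFragments>>>
Map<String, Map<String, List<String>>> highlighting = response.getHighlighting();
Map<String, List<String>> forThisDoc = highlighting.get(contentIdStr);
List<String> fragments = (forThisDoc == null) ? null : forThisDoc.get(LuceneQuery.HIGHLIGHT_FIELD);
// A null map or a null fragment list means Solr produced no usable highlight for
// this document/field, which is exactly when attemptManualHighlighting() is used.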
@ -386,6 +394,73 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
return this.hitsPages.get(this.currentPage);
}
/**
* If the Solr query does not produce valid highlighting, we attempt to
* add the highlighting ourselves. We do this by taking the text returned
* from the document that contains a hit and searching that text for the
* keyword that produced the hit.
*
* @param solrDocumentList The list of Solr documents returned in response
* to a Solr query. We expect there to only ever be a single document.
*
* @return Either a string with the keyword highlighted or a string
* indicating that we did not find a hit in the document.
*/
private String attemptManualHighlighting(SolrDocumentList solrDocumentList) {
if (solrDocumentList.isEmpty()) {
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
}
// It doesn't make sense for there to be more than a single document in
// the list since this class presents a single page (document) of highlighted
// content at a time.
String text = solrDocumentList.get(0).getOrDefault(Server.Schema.TEXT.toString(), "").toString();
// Escape any HTML content that may be in the text. This is needed in
// order to correctly display the text in the content viewer.
// Must be done before highlighting tags are added. If we were to
// perform HTML escaping after adding the highlighting tags we would
// not see highlighted text in the content viewer.
text = StringEscapeUtils.escapeHtml(text);
StringBuilder highlightedText = new StringBuilder("");
int textOffset = 0;
// Remove quotes from around the keyword.
String unquotedKeyword = StringUtils.strip(keywordHitQuery, "\"");
// Find the first (if any) hit.
int hitOffset = text.indexOf(unquotedKeyword, textOffset);
while (hitOffset != -1) {
// Append the portion of text up to (but not including) the hit.
highlightedText.append(text.substring(textOffset, hitOffset));
// Add in the highlighting around the keyword.
highlightedText.append(HIGHLIGHT_PRE);
highlightedText.append(unquotedKeyword);
highlightedText.append(HIGHLIGHT_POST);
// Advance the text offset past the keyword.
textOffset = hitOffset + unquotedKeyword.length() + 1;
// Search for the next keyword hit in the text.
hitOffset = text.indexOf(unquotedKeyword, textOffset);
}
if (highlightedText.length() > 0) {
// Append the remainder of text field and return.
highlightedText.append(text.substring(textOffset, text.length()));
return highlightedText.toString();
} else {
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
}
}
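The fallback above is a plain scan-and-wrap over the stored text. A minimal standalone sketch of the same idea, not part of this commit; it reuses the commons-lang escaping already imported in this file and assumes the keyword is the bare (unquoted) hit containing no HTML special characters.

import org.apache.commons.lang.StringEscapeUtils;

class ManualHighlightSketch {
    static String highlight(String text, String keyword, String pre, String post) {
        // Escape the stored text first so the tags added below survive as markup.
        String escaped = StringEscapeUtils.escapeHtml(text);
        StringBuilder out = new StringBuilder();
        int from = 0;
        int hit = escaped.indexOf(keyword, from);
        while (hit != -1) {
            out.append(escaped, from, hit);                 // text before the hit
            out.append(pre).append(keyword).append(post);   // the highlighted hit
            from = hit + keyword.length();                  // continue after the hit
            hit = escaped.indexOf(keyword, from);
        }
        out.append(escaped.substring(from));                // remainder of the text
        return out.toString();
    }
}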
/**
* Anchors are used to navigate back and forth between hits on the same
* page and to navigate to hits on the next/previous page.
*
* @param searchableContent
* @return
*/
private String insertAnchors(String searchableContent) {
int searchOffset = 0;
int index = -1;

View File

@ -298,13 +298,7 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
*/
private String constructEscapedSolrQuery(String query, boolean literal_query) {
StringBuilder highlightQuery = new StringBuilder();
String highLightField;
if (literal_query) {
highLightField = LuceneQuery.HIGHLIGHT_FIELD_LITERAL;
} else {
highLightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
}
highlightQuery.append(highLightField).append(":").append("\"").append(KeywordSearchUtil.escapeLuceneQuery(query)).append("\"");
highlightQuery.append(LuceneQuery.HIGHLIGHT_FIELD).append(":").append("\"").append(KeywordSearchUtil.escapeLuceneQuery(query)).append("\"");
return highlightQuery.toString();
}
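A hedged illustration of the string the simplified method now builds, not part of this commit; it assumes Server.Schema.TEXT maps to the Solr field name "text" and uses a hit with no Lucene special characters, so escaping is a no-op and literal and regex hits resolve to the same field-qualified phrase.

public class EscapedQuerySketch {
    public static void main(String[] args) {
        String highlightField = "text";        // assumed value of LuceneQuery.HIGHLIGHT_FIELD
        String escapedHit = "jazz festival";   // hit after Lucene escaping (a no-op here)
        String query = highlightField + ":" + "\"" + escapedHit + "\"";
        System.out.println(query);             // prints: text:"jazz festival"
    }
}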

View File

@ -59,11 +59,7 @@ class LuceneQuery implements KeywordSearchQuery {
private String field = null;
private static final int MAX_RESULTS = 20000;
static final int SNIPPET_LENGTH = 50;
//can use different highlight schema fields for regex and literal search
static final String HIGHLIGHT_FIELD_LITERAL = Server.Schema.TEXT.toString();
static final String HIGHLIGHT_FIELD_REGEX = Server.Schema.TEXT.toString();
//TODO use content_ws stored="true" in solr schema for perfect highlight hits
//static final String HIGHLIGHT_FIELD_REGEX = Server.Schema.CONTENT_WS.toString()
static final String HIGHLIGHT_FIELD = Server.Schema.TEXT.toString();
private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
@ -250,13 +246,15 @@ class LuceneQuery implements KeywordSearchQuery {
private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
//set query, force quotes/grouping around all literal queries
final String groupedQuery = KeywordSearchUtil.quoteQuery(keywordStringEscaped);
String theQueryStr = groupedQuery;
// Wrap the query string in quotes if this is a literal search term.
String theQueryStr = keyword.searchTermIsLiteral()
? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped;
// Run the query against an optional alternative field.
if (field != null) {
//use the optional field
StringBuilder sb = new StringBuilder();
sb.append(field).append(":").append(groupedQuery);
sb.append(field).append(":").append(theQueryStr);
theQueryStr = sb.toString();
}
q.setQuery(theQueryStr);
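A small sketch, not part of this commit, of the query strings the reworked logic produces; the terms and field name are hypothetical and assumed to be Lucene-escaped already.

public class QueryStringSketch {
    static String build(String escapedTerm, boolean isLiteral, String field) {
        // Quotes are added only for literal search terms.
        String q = isLiteral ? "\"" + escapedTerm + "\"" : escapedTerm;
        // An optional alternative field is simply prefixed with a colon.
        return (field == null) ? q : field + ":" + q;
    }

    public static void main(String[] args) {
        System.out.println(build("jazz festival", true, null));           // "jazz festival"
        System.out.println(build("jazz.*", false, null));                 // jazz.*
        System.out.println(build("jazz festival", true, "content_str"));  // content_str:"jazz festival"
    }
}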
@ -345,20 +343,13 @@ class LuceneQuery implements KeywordSearchQuery {
public static String querySnippet(String query, long solrObjectId, int chunkID, boolean isRegex, boolean group) throws NoOpenCoreException {
Server solrServer = KeywordSearch.getServer();
String highlightField;
if (isRegex) {
highlightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
} else {
highlightField = LuceneQuery.HIGHLIGHT_FIELD_LITERAL;
}
SolrQuery q = new SolrQuery();
String queryStr;
if (isRegex) {
StringBuilder sb = new StringBuilder();
sb.append(highlightField).append(":");
sb.append(LuceneQuery.HIGHLIGHT_FIELD).append(":");
if (group) {
sb.append("\"");
}
@ -387,7 +378,7 @@ class LuceneQuery implements KeywordSearchQuery {
String idQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIDStr);
q.setShowDebugInfo(DEBUG); //debug
q.addFilterQuery(idQuery);
q.addHighlightField(highlightField);
q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD);
//q.setHighlightSimplePre("&laquo;"); //original highlighter only
//q.setHighlightSimplePost("&raquo;"); //original highlighter only
q.setHighlightSnippets(1);
@ -413,7 +404,7 @@ class LuceneQuery implements KeywordSearchQuery {
if (responseHighlightID == null) {
return "";
}
List<String> contentHighlights = responseHighlightID.get(highlightField);
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
if (contentHighlights == null) {
return "";
} else {

View File

@ -78,6 +78,19 @@ final class RegexQuery implements KeywordSearchQuery {
private boolean escaped;
private String escapedQuery;
// These are the valid characters that can appear immediately before a
// keyword hit. e.g. for an IP address regex we support finding the string
// ",10.0.0.0" but not "?10.0.0.0".
private static final String BOUNDARY_PREFIX_CHARS = "(\\s|\\[|\\(|,|\\:)"; //NON-NLS
// These are the valid characters that can appear immediately after a
// keyword hit. e.g. for an IP address regex we support finding the string
// "10.0.0.0?]" but not "10.0.0.0&".
private static final String BOUNDARY_SUFFIX_CHARS = "(\\s|\\]|\\)|,|!|\\?|\\:)"; //NON-NLS
private boolean queryStringContainsWildcardPrefix = false;
private boolean queryStringContainsWildcardSuffix = false;
/**
* Constructor with query to process.
*
@ -88,6 +101,14 @@ final class RegexQuery implements KeywordSearchQuery {
this.keywordList = keywordList;
this.keyword = keyword;
this.keywordString = keyword.getSearchTerm();
if (this.keywordString.startsWith(".*")) {
this.queryStringContainsWildcardPrefix = true;
}
if (this.keywordString.endsWith(".*")) {
this.queryStringContainsWildcardSuffix = true;
}
}
@Override
@ -120,7 +141,28 @@ final class RegexQuery implements KeywordSearchQuery {
SolrQuery solrQuery = new SolrQuery();
solrQuery.setShowDebugInfo(true); //debug
solrQuery.setQuery((field == null ? Server.Schema.CONTENT_STR.toString() : field) + ":/.*" + getQueryString() + ".*/");
/**
* The provided regular expression may include wildcards at the
* beginning and/or end. These wildcards are used to indicate that
* the user wants to find hits for the regex that are embedded
* within other characters. For example, if we are given .*127.0.0.1.*
* as a regular expression, this will produce hits for:
* (a) " 127.0.0.1 " as a standalone token (surrounded by whitespace).
* (b) "abc127.0.0.1def" where the IP address is surrounded by other characters.
*
* If we are given this type of regex, we do not need to add our own
* wildcards to anchor the query. Otherwise, we need to add wildcard
* anchors because Lucene string regex searches default to using ^ and $
* to match the entire string.
*/
// We construct the query by surrounding it with slashes (to indicate it is
// a regular expression search) and .* as anchors (if the query doesn't
// already have them).
solrQuery.setQuery((field == null ? Server.Schema.CONTENT_STR.toString() : field) + ":/"
+ (queryStringContainsWildcardPrefix ? "" : ".*") + getQueryString()
+ (queryStringContainsWildcardSuffix ? "" : ".*") + "/");
solrQuery.setRows(MAX_RESULTS);
// Set the fields we want to have returned by the query.
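A minimal sketch of the anchoring described in the comment above, not part of this commit: the .* wildcards are added only when the supplied regex does not already carry them, since Lucene string regex searches match against the entire field value.

public class RegexAnchorSketch {
    static String solrRegexQuery(String field, String regex) {
        boolean hasWildcardPrefix = regex.startsWith(".*");
        boolean hasWildcardSuffix = regex.endsWith(".*");
        return field + ":/"
                + (hasWildcardPrefix ? "" : ".*") + regex
                + (hasWildcardSuffix ? "" : ".*") + "/";
    }

    public static void main(String[] args) {
        // content_str is the non-tokenized string copy of the content (see above).
        System.out.println(solrRegexQuery("content_str", "\\d{1,3}(\\.\\d{1,3}){3}"));
        // -> content_str:/.*\d{1,3}(\.\d{1,3}){3}.*/
        System.out.println(solrRegexQuery("content_str", ".*127\\.0\\.0\\.1.*"));
        // -> content_str:/.*127\.0\.0\.1.*/   (already anchored; left untouched)
    }
}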
@ -173,22 +215,50 @@ final class RegexQuery implements KeywordSearchQuery {
List<KeywordHit> hits = new ArrayList<>();
final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();
String content = solrDoc.getOrDefault(Server.Schema.CONTENT_STR.toString(), "").toString();
String content = solrDoc.getOrDefault(Server.Schema.CONTENT_STR.toString(), "").toString(); //NON-NLS
Matcher hitMatcher = Pattern.compile(keywordString).matcher(content);
// By default, we create keyword hits on whitespace or punctuation character boundaries.
// Having a set of well defined boundary characters produces hits that can
// subsequently be matched for highlighting against the tokens produced by
// the standard tokenizer.
// This behavior can be overridden by the user if they give us a search string
// with .* at the start and/or end of the string. This basically tells us to find
// all hits instead of only the ones surrounded by one of our boundary characters.
String keywordTokenRegex =
// If the given search string starts with .*, we ignore our default
// boundary prefix characters
(queryStringContainsWildcardPrefix ? "" : BOUNDARY_PREFIX_CHARS) //NON-NLS
+ keywordString
// If the given search string ends with .*, we ignore our default
// boundary suffix characters
+ (queryStringContainsWildcardSuffix ? "" : BOUNDARY_SUFFIX_CHARS); //NON-NLS
Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content);
while (hitMatcher.find()) {
String snippet = "";
final String hit = hitMatcher.group();
StringBuilder snippet = new StringBuilder();
String hit = hitMatcher.group();
// Remove leading and trailing boundary characters.
if (!queryStringContainsWildcardPrefix) {
hit = hit.replaceAll("^" + BOUNDARY_PREFIX_CHARS, ""); //NON-NLS
}
if (!queryStringContainsWildcardSuffix) {
hit = hit.replaceAll(BOUNDARY_SUFFIX_CHARS + "$", ""); //NON-NLS
}
/*
* If searching for credit card account numbers, do a Luhn check
* on the term and discard it if it does not pass.
*/
if (keyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit);
ccnMatcher.find();
final String ccn = CharMatcher.anyOf(" -").removeFrom(ccnMatcher.group("ccn"));
if (false == TermsComponentQuery.CREDIT_CARD_NUM_LUHN_CHECK.isValid(ccn)) {
if (ccnMatcher.find()) {
final String ccn = CharMatcher.anyOf(" -").removeFrom(ccnMatcher.group("ccn"));
if (false == TermsComponentQuery.CREDIT_CARD_NUM_LUHN_CHECK.isValid(ccn)) {
continue;
}
} else {
continue;
}
}
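The Luhn validation itself is delegated to TermsComponentQuery.CREDIT_CARD_NUM_LUHN_CHECK; purely for reference, a standalone sketch of the checksum being applied (not part of this commit, digits-only input assumed).

public class LuhnSketch {
    static boolean passesLuhn(String digits) {
        int sum = 0;
        boolean doubleIt = false;
        for (int i = digits.length() - 1; i >= 0; i--) {  // walk right to left
            int d = digits.charAt(i) - '0';
            if (doubleIt) {
                d *= 2;
                if (d > 9) {
                    d -= 9;
                }
            }
            sum += d;
            doubleIt = !doubleIt;
        }
        return sum % 10 == 0;
    }

    public static void main(String[] args) {
        System.out.println(passesLuhn("4111111111111111")); // true  (standard test number)
        System.out.println(passesLuhn("4111111111111112")); // false (checksum broken)
    }
}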
@ -199,12 +269,14 @@ final class RegexQuery implements KeywordSearchQuery {
*/
if (KeywordSearchSettings.getShowSnippets()) {
int maxIndex = content.length() - 1;
snippet = content.substring(Integer.max(0, hitMatcher.start() - 30), Integer.max(0, hitMatcher.start() - 1));
snippet += "<<" + hit + "<<";
snippet += content.substring(Integer.min(maxIndex, hitMatcher.end() + 1), Integer.min(maxIndex, hitMatcher.end() + 30));
snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 30), Integer.max(0, hitMatcher.start() - 1)));
snippet.appendCodePoint(171);
snippet.append(hit);
snippet.appendCodePoint(171);
snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end() + 1), Integer.min(maxIndex, hitMatcher.end() + 30)));
}
hits.add(new KeywordHit(docId, snippet, hit));
hits.add(new KeywordHit(docId, snippet.toString(), hit));
}
return hits;
}
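Pulling the pieces together, a condensed sketch of the boundary-character matching introduced here, not part of this commit; it reuses the two character classes defined above with a hypothetical IP-address regex.

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class BoundaryHitSketch {
    private static final String PREFIX = "(\\s|\\[|\\(|,|\\:)";
    private static final String SUFFIX = "(\\s|\\]|\\)|,|!|\\?|\\:)";

    static List<String> findHits(String content, String regex) {
        List<String> hits = new ArrayList<>();
        // Require a boundary character on each side of the hit...
        Matcher m = Pattern.compile(PREFIX + regex + SUFFIX).matcher(content);
        while (m.find()) {
            // ...then strip the boundary characters back off the matched text.
            String hit = m.group()
                    .replaceAll("^" + PREFIX, "")
                    .replaceAll(SUFFIX + "$", "");
            hits.add(hit);
        }
        return hits;
    }

    public static void main(String[] args) {
        // ",10.0.0.0 " is found; "?10.0.0.1" is not, because '?' is not an
        // accepted boundary prefix character.
        System.out.println(findHits("x ,10.0.0.0 y ?10.0.0.1 z",
                "\\d{1,3}(\\.\\d{1,3}){3}"));   // prints [10.0.0.0]
    }
}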