finish implementing all the cases for highlighting

This commit is contained in:
millmanorama 2017-02-15 13:48:43 +01:00
parent edd03a66c1
commit 6abe26d6de
5 changed files with 102 additions and 156 deletions

View File: ExtractedContentViewer.java

@@ -110,8 +110,7 @@ public class ExtractedContentViewer implements DataContentViewer {
BlackboardArtifact artifact = nodeLookup.lookup(BlackboardArtifact.class);
if (hits != null) {
highlightedHitText = new HighlightedText(content.getId(), hits);
} else {
if (artifact != null && artifact.getArtifactTypeID()
} else if (artifact != null && artifact.getArtifactTypeID()
== BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT.getTypeID()) {
// if the artifact is an account artifact, get the account text.
highlightedHitText = getAccountsText(content, nodeLookup);
@@ -119,7 +118,6 @@ public class ExtractedContentViewer implements DataContentViewer {
== BlackboardArtifact.ARTIFACT_TYPE.TSK_KEYWORD_HIT.getTypeID()) {
highlightedHitText = new HighlightedText(artifact);
}
}
if (highlightedHitText != null) {
indexedTextSources.add(highlightedHitText);
}
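Taken together, the restructured branch selects an IndexedText source in priority order: explicit ad-hoc search hits first, then an account artifact, then a keyword-hit artifact. A consolidated sketch of that control flow; the method name, parameter types, and the static imports of the artifact-type constants are assumptions, and getAccountsText is the viewer's own helper as shown in the hunk:

import org.openide.util.Lookup;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
import static org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT;
import static org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE.TSK_KEYWORD_HIT;

// Sketch: pick the highlighted-text source for a node, in priority order.
private IndexedText selectTextSource(Content content, Lookup nodeLookup,
        QueryResults hits, BlackboardArtifact artifact) throws TskCoreException {
    if (hits != null) {
        // ad-hoc search results were placed in the node's lookup directly
        return new HighlightedText(content.getId(), hits);
    }
    if (artifact != null
            && artifact.getArtifactTypeID() == TSK_ACCOUNT.getTypeID()) {
        // account artifacts (e.g. credit card hits) carry their own text
        return getAccountsText(content, nodeLookup);
    }
    if (artifact != null
            && artifact.getArtifactTypeID() == TSK_KEYWORD_HIT.getTypeID()) {
        // saved keyword hits rebuild their highlights from the artifact
        return new HighlightedText(artifact);
    }
    return null;
}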
@@ -298,16 +296,16 @@ public class ExtractedContentViewer implements DataContentViewer {
return false;
}
/**
* Is there any marked up indexed text in the look up of this node? This
* will be the case if the node is for a keyword hit artifact produced
* by either an ad hoc keyword search result (keyword search toolbar
* widgets) or a keyword search by the keyword search ingest module.
*/
Collection<? extends IndexedText> sources = node.getLookup().lookupAll(IndexedText.class);
if (sources.isEmpty() == false) {
return true;
}
// /**
// * Is there any marked up indexed text in the look up of this node? This
// * will be the case if the node is for a keyword hit artifact produced
// * by either an ad hoc keyword search result (keyword search toolbar
// * widgets) or a keyword search by the keyword search ingest module.
// */
// Collection<? extends IndexedText> sources = node.getLookup().lookupAll(IndexedText.class);
// if (sources.isEmpty() == false) {
// return true;
// }
/*
* Is there a credit card artifact in the lookup
@@ -315,7 +313,9 @@ public class ExtractedContentViewer implements DataContentViewer {
Collection<? extends BlackboardArtifact> artifacts = node.getLookup().lookupAll(BlackboardArtifact.class);
if (artifacts != null) {
for (BlackboardArtifact art : artifacts) {
if (art.getArtifactTypeID() == BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT.getTypeID()) {
final int artifactTypeID = art.getArtifactTypeID();
if (artifactTypeID == TSK_ACCOUNT.getTypeID()
|| artifactTypeID == TSK_KEYWORD_HIT.getTypeID()) {
return true;
}
}
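The support check mirrors that dispatch: a node is viewable when its lookup contains either artifact type. A minimal sketch, again assuming static imports of the artifact-type constants:

import java.util.Collection;
import org.openide.nodes.Node;
import org.sleuthkit.datamodel.BlackboardArtifact;
import static org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT;
import static org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE.TSK_KEYWORD_HIT;

// Sketch: a node is supported if its lookup holds an account or
// keyword-hit artifact, matching the new else-if chain in the viewer.
static boolean hasHighlightableArtifact(Node node) {
    Collection<? extends BlackboardArtifact> artifacts =
            node.getLookup().lookupAll(BlackboardArtifact.class);
    for (BlackboardArtifact art : artifacts) {
        final int artifactTypeID = art.getArtifactTypeID();
        if (artifactTypeID == TSK_ACCOUNT.getTypeID()
                || artifactTypeID == TSK_KEYWORD_HIT.getTypeID()) {
            return true;
        }
    }
    return false;
}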

View File: HighlightedText.java

@@ -18,7 +18,6 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import com.ibm.icu.text.UnicodeSet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@@ -28,7 +27,6 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.logging.Level;
import java.util.stream.Collectors;
import org.apache.commons.lang.StringEscapeUtils;
@@ -40,14 +38,13 @@ import org.apache.solr.common.SolrDocumentList;
import org.openide.util.Exceptions;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.coreutils.Version;
import org.sleuthkit.autopsy.keywordsearch.KeywordQueryFilter.FilterType;
import org.sleuthkit.autopsy.keywordsearch.KeywordSearch.QueryType;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
/**
@@ -89,6 +86,7 @@ class HighlightedText implements IndexedText {
private boolean isPageInfoLoaded = false;
private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
private BlackboardArtifact artifact;
private KeywordSearch.QueryType qt;
/**
* This constructor is used when keyword hits are accessed from the ad-hoc
@@ -123,16 +121,24 @@ class HighlightedText implements IndexedText {
}
private void loadPageInfoFromArtifact() throws TskCoreException, NumberFormatException {
final String keyword = artifact.getAttribute(TSK_KEYWORD).getValueString();
this.keywords.add(keyword);
KeywordSearch.QueryType qt = KeywordSearch.QueryType.values()[artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE).getValueInt()];
this.keywords.add(artifact.getAttribute(TSK_KEYWORD).getValueString());
String chunkIDsString = artifact.getAttribute(TSK_KEYWORD_HIT_DOCUMENT_IDS).getValueString();
final BlackboardAttribute qtAttribute = artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE);
qt = (qtAttribute != null)
? KeywordSearch.QueryType.values()[qtAttribute.getValueInt()] : null;
final BlackboardAttribute docIDsArtifact = artifact.getAttribute(TSK_KEYWORD_HIT_DOCUMENT_IDS);
if (qt == QueryType.REGEX && docIDsArtifact != null) {
//regex searches record the chunks in the artifact
String chunkIDsString = docIDsArtifact.getValueString();
Set<String> chunkIDs = Arrays.stream(chunkIDsString.split(",")).map(StringUtils::strip).collect(Collectors.toSet());
for (String solrDocumentId : chunkIDs) {
int chunkID;
final int separatorIndex = solrDocumentId.indexOf(Server.CHUNK_ID_SEPARATOR);
if (-1 != separatorIndex) {
chunkID = Integer.parseInt(solrDocumentId.substring(separatorIndex + 1));
} else {
@@ -142,6 +148,25 @@ class HighlightedText implements IndexedText {
numberOfHitsPerPage.put(chunkID, 0);
currentHitPerPage.put(chunkID, 0);
}
this.currentPage = pages.stream().sorted().findFirst().orElse(1);
isPageInfoLoaded = true;
} else {
/*
* non-regex searches don't record the chunks in the artifacts, so
* we need to look them up
*/
Keyword keywordQuery = new Keyword(keyword, true);
KeywordSearchQuery chunksQuery
= new LuceneQuery(new KeywordList(Arrays.asList(keywordQuery)), keywordQuery);
chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId));
try {
hits = chunksQuery.performQuery();
loadPageInfoFromHits();
} catch (KeywordSearchModuleException | NoOpenCoreException ex) {
logger.log(Level.SEVERE, "Could not perform the query to get chunk info and get highlights:" + keywordQuery.getSearchTerm(), ex); //NON-NLS
MessageNotifyUtil.Notify.error(Bundle.HighlightedText_query_exception_msg() + keywordQuery.getSearchTerm(), ex.getCause().getMessage());
}
}
}
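The document IDs stored in TSK_KEYWORD_HIT_DOCUMENT_IDS encode the chunk number after Server.CHUNK_ID_SEPARATOR. A standalone sketch of the parsing step; the hunk cuts off the branch for IDs without a separator, so mapping a bare ID to the single-page parent document is an assumption:

// Sketch: split a Solr document ID of the form "<objectId><sep><chunkId>".
static int parseChunkId(String solrDocumentId, String chunkIdSeparator) {
    final int separatorIndex = solrDocumentId.indexOf(chunkIdSeparator);
    if (separatorIndex == -1) {
        // Assumption: a bare object ID refers to the un-chunked parent
        // document, which this class treats as a single page 1 elsewhere.
        return 1;
    }
    return Integer.parseInt(solrDocumentId.substring(separatorIndex + 1));
}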
/**
@@ -154,45 +179,6 @@ class HighlightedText implements IndexedText {
*
* @return
*/
static private String getHighlightQuery(KeywordSearchQuery query, boolean literal_query, QueryResults queryResults, Content content) {
if (literal_query) {
//literal, treat as non-regex, non-term component query
return constructEscapedSolrQuery(query.getQueryString());
} else //construct a Solr query using aggregated terms to get highlighting
//the query is executed later on demand
{
if (queryResults.getKeywords().size() == 1) {
//simple case, no need to process subqueries and do special escaping
Keyword keyword = queryResults.getKeywords().iterator().next();
return constructEscapedSolrQuery(keyword.getSearchTerm());
} else {
//find terms for this content hit
List<Keyword> hitTerms = new ArrayList<>();
for (Keyword keyword : queryResults.getKeywords()) {
for (KeywordHit hit : queryResults.getResults(keyword)) {
if (hit.getContent().equals(content)) {
hitTerms.add(keyword);
break; //go to next term
}
}
}
StringBuilder highlightQuery = new StringBuilder();
final int lastTerm = hitTerms.size() - 1;
int curTerm = 0;
for (Keyword term : hitTerms) {
//escape subqueries, MAKE SURE they are not escaped again later
highlightQuery.append(constructEscapedSolrQuery(term.getSearchTerm()));
if (lastTerm != curTerm) {
highlightQuery.append(" "); //acts as OR ||
}
++curTerm;
}
return highlightQuery.toString();
}
}
}
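The removed method's job is now done inline (see the keywords.stream() call in the query-construction hunk below): each hit term is escaped individually and the subqueries are joined with spaces, which Solr's parser treats as OR. A standalone sketch of that replacement; constructEscapedSolrQuery is the helper this class defines:

import java.util.Set;
import java.util.stream.Collectors;

// Sketch: one escaped subquery per hit term, joined by spaces (Solr OR).
static String buildHighlightQuery(Set<String> keywords) {
    return keywords.stream()
            .map(HighlightedText::constructEscapedSolrQuery)
            .collect(Collectors.joining(" "));
}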
/**
* Constructs a complete, escaped Solr query that is ready to be used.
@@ -236,9 +222,7 @@ class HighlightedText implements IndexedText {
*/
loadPageInfoFromArtifact();
} else if (hasChunks) {
// if the file has chunks, get pages with hits, sorted
if (loadPageInfoFromHits()) {
//JMTODO: look at error handling and return values...
}
loadPageInfoFromHits();
} else {
//non-regex, no chunks
this.numberPages = 1;
@@ -246,29 +230,12 @@ class HighlightedText implements IndexedText {
numberOfHitsPerPage.put(1, 0);
pages.add(1);
currentHitPerPage.put(1, 0);
}
isPageInfoLoaded = true;
}
private boolean loadPageInfoFromHits() {
// /*
// * If this is being called from the artifacts / dir tree, then we need
// * to perform the search to get the highlights.
// */
// if (hits == null) {
//
// Keyword keywordQuery = new Keyword(keywordHitQuery, true);
// KeywordSearchQuery chunksQuery
// = new LuceneQuery(new KeywordList(Arrays.asList(keywordQuery)), keywordQuery);
// chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId));
// try {
// hits = chunksQuery.performQuery();
// } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
// logger.log(Level.SEVERE, "Could not perform the query to get chunk info and get highlights:" + keywordQuery.getSearchTerm(), ex); //NON-NLS
// MessageNotifyUtil.Notify.error(Bundle.HighlightedText_query_exception_msg() + keywordQuery.getSearchTerm(), ex.getCause().getMessage());
// return true;
// }
//// }
}
private void loadPageInfoFromHits() {
//organize the hits by page, filter as needed
TreeSet<Integer> pagesSorted = new TreeSet<>();
@@ -277,11 +244,12 @@ class HighlightedText implements IndexedText {
int chunkID = hit.getChunkId();
if (chunkID != 0 && this.objectId == hit.getSolrObjectId()) {
pagesSorted.add(chunkID);
if (StringUtils.isNotBlank(hit.getHit())) {
this.keywords.add(hit.getHit());
}
}
}
}
//set page to first page having highlights
if (pagesSorted.isEmpty()) {
this.currentPage = 0;
@@ -293,7 +261,7 @@ class HighlightedText implements IndexedText {
pages.add(page);
currentHitPerPage.put(page, 0); //set current hit to 0th
}
return false;
isPageInfoLoaded = true;
}
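The hunk breaks fragment loadPageInfoFromHits, so here is a consolidated sketch. The loop headers over keywords and hits are reconstructed from the QueryResults accessors used elsewhere in this commit (getKeywords, getResults) and are an assumption:

import java.util.TreeSet;
import org.apache.commons.lang.StringUtils;

// Sketch: one "page" per chunk containing a hit, sorted ascending.
private void loadPageInfoFromHits() {
    TreeSet<Integer> pagesSorted = new TreeSet<>();
    for (Keyword k : hits.getKeywords()) {           // assumed accessor
        for (KeywordHit hit : hits.getResults(k)) {  // assumed accessor
            int chunkID = hit.getChunkId();
            if (chunkID != 0 && this.objectId == hit.getSolrObjectId()) {
                pagesSorted.add(chunkID);
                if (StringUtils.isNotBlank(hit.getHit())) {
                    this.keywords.add(hit.getHit()); // remember the hit text
                }
            }
        }
    }
    // first page having highlights, or 0 when there are none
    this.currentPage = pagesSorted.isEmpty() ? 0 : pagesSorted.first();
    for (Integer page : pagesSorted) {
        numberOfHitsPerPage.put(page, 0); // unknown until the page is rendered
        pages.add(page);
        currentHitPerPage.put(page, 0);   // start at the 0th hit on each page
    }
    isPageInfoLoaded = true;
}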
@Override
@@ -410,26 +378,29 @@ class HighlightedText implements IndexedText {
}
final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
// if (isRegex) {
if (artifact != null && qt == QueryType.REGEX) {
q.setQuery(filterQuery);
q.addField(Server.Schema.CONTENT_STR.toString());
// } else {
// // input query has already been properly constructed and escaped
// q.setQuery(keywordHitQuery);
// q.addField(Server.Schema.TEXT.toString());
// q.addFilterQuery(filterQuery);
// q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD);
// q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
//
// //tune the highlighter
// q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
// q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
// q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
// q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
//
// //docs says makes sense for the original Highlighter only, but not really
// q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
// }
} else {
final String highlightQuery = keywords.stream()
.map(HighlightedText::constructEscapedSolrQuery)
.collect(Collectors.joining(" "));
q.setQuery(highlightQuery);
q.addField(Server.Schema.TEXT.toString());
q.addFilterQuery(filterQuery);
q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
//tune the highlighter
q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
//docs say this makes sense for the original Highlighter only, but not really
q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
}
try {
QueryResponse response = solrServer.query(q, METHOD.POST);
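After the POST, highlights come back keyed by document ID and field. A minimal SolrJ sketch of reading them out of the response; the method name is hypothetical and error handling is elided:

import java.util.List;
import java.util.Map;
import org.apache.solr.client.solrj.response.QueryResponse;

// Sketch: getHighlighting() maps document ID -> field -> highlighted fragments.
static String firstHighlight(QueryResponse response, String contentIdStr, String field) {
    Map<String, List<String>> byField = response.getHighlighting().get(contentIdStr);
    if (byField == null) {
        return null; // no highlights for this document
    }
    List<String> fragments = byField.get(field);
    // With hl.fragsize=0 and the "single" fragListBuilder, the one fragment
    // is the whole highlighted text.
    return (fragments == null || fragments.isEmpty()) ? null : fragments.get(0);
}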
@@ -532,8 +503,7 @@ class HighlightedText implements IndexedText {
for (String unquotedKeyword : keywords) {
int textOffset = 0;
int hitOffset;
while ((hitOffset = text.indexOf(unquotedKeyword, textOffset)) != -1) {
while ((hitOffset = StringUtils.indexOfIgnoreCase(text, unquotedKeyword, textOffset)) != -1) {
// Append the portion of text up to (but not including) the hit.
highlightedText.append(text.substring(textOffset, hitOffset));
// Add in the highlighting around the keyword.
@@ -542,12 +512,11 @@ class HighlightedText implements IndexedText {
highlightedText.append(HIGHLIGHT_POST);
// Advance the text offset past the keyword.
textOffset = hitOffset + unquotedKeyword.length() + 1;
textOffset = hitOffset + unquotedKeyword.length();
}
if (highlightedText.length() > 0) {
// Append the remainder of text field and return.
// Append the remainder of the text field
highlightedText.append(text.substring(textOffset, text.length()));
if (highlightedText.length() > 0) {
} else {
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
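Two fixes land in this hunk: StringUtils.indexOfIgnoreCase makes the manual scan case-insensitive, and dropping the stray "+ 1" stops the offset from skipping the character after every hit. A self-contained sketch of the corrected loop; appending the original-case slice of the text (rather than the keyword) is an assumption, since the hunk elides that append:

import org.apache.commons.lang.StringUtils;

// Sketch of the corrected manual-highlighting loop.
static String highlight(String text, String keyword, String pre, String post) {
    StringBuilder out = new StringBuilder();
    int textOffset = 0;
    int hitOffset;
    while ((hitOffset = StringUtils.indexOfIgnoreCase(text, keyword, textOffset)) != -1) {
        out.append(text, textOffset, hitOffset);        // text before the hit
        out.append(pre);
        out.append(text, hitOffset, hitOffset + keyword.length()); // the hit itself
        out.append(post);
        textOffset = hitOffset + keyword.length();      // no "+ 1": keep the next char
    }
    out.append(text.substring(textOffset));             // remainder of the text
    return out.toString();
}

With the old arithmetic, the character after each hit never reached the output, and a hit ending at the very last character pushed the offset one past the string length, breaking the final substring.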

View File: KeywordSearchFilterNode.java

@@ -45,7 +45,7 @@ import org.sleuthkit.datamodel.File;
*/
class KeywordSearchFilterNode extends FilterNode {
KeywordSearchFilterNode(HighlightedText highlights, Node original) {
KeywordSearchFilterNode(QueryResults highlights, Node original) {
super(original, null, new ProxyLookup(Lookups.singleton(highlights), original.getLookup()));
}
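With this change the filter node exposes the raw QueryResults in its lookup, merged with the original node's lookup by the ProxyLookup, instead of a pre-built HighlightedText; the content viewer now builds HighlightedText itself (see the ExtractedContentViewer hunks above). A sketch of the consuming side, using the standard NetBeans Lookup API:

import org.openide.nodes.Node;

// Sketch: retrieve the query results the ProxyLookup above exposes.
static QueryResults resultsFor(Node node) {
    // null when the node was not produced by a keyword search
    return node.getLookup().lookup(QueryResults.class);
}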

View File: KeywordSearchResultFactory.java

@@ -147,7 +147,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
int hitNumber = 0;
List<KeyValueQueryContent> tempList = new ArrayList<>();
// final SetMultimap<Long, KeywordHit> orgnizeByDocID = orgnizeByDocID(queryResults);
for (KeywordHit hit : getOneHitPerObject(queryResults)) {
/**
@@ -169,12 +168,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
properties.put(TSK_KEYWORD_PREVIEW.getDisplayName(), hit.getSnippet());
}
//JMTODO: I don't understand this comment or the below code...
//@@@ USE ConentHit in UniqueFileMap instead of the below search
//get unique match result files
// BC: @@@ This is really inefficient. We should keep track of this when
// we flattened the list of files to the unique files.
// final String highlightQueryEscaped = getHighlightQuery(queryRequest, queryRequest.isLiteral(), queryResults, content);
String hitName = hit.isArtifactHit()
? hit.getArtifact().getDisplayName() + " Artifact" //NON-NLS
: contentName;
@@ -220,18 +213,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
return hits.values();
}
SetMultimap<Long, KeywordHit> orgnizeByDocID(QueryResults queryResults) {
SetMultimap<Long, KeywordHit> hits = TreeMultimap.create(Long::compare, Comparator.comparing(KeywordHit::getChunkId));
for (Keyword keyWord : queryResults.getKeywords()) {
for (KeywordHit hit : queryResults.getResults(keyWord)) {
hits.put(hit.getSolrObjectId(), hit);
}
}
return hits;
}
@Override
protected Node createNodeForKey(KeyValueQueryContent key) {
final Content content = key.getContent();
@@ -240,9 +221,7 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
Node kvNode = new KeyValueNode(key, Children.LEAF, Lookups.singleton(content));
//wrap in KeywordSearchFilterNode for the markup content, might need to override FilterNode for more customization
// store the data in HighlightedMatchesSource so that it can be looked up (in content viewer)
HighlightedText highlights = new HighlightedText(key.getSolrObjectId(), hits);
return new KeywordSearchFilterNode(highlights, kvNode);
return new KeywordSearchFilterNode(hits, kvNode);
}
/**
@@ -277,8 +256,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
this.hits = hits;
this.query = query;
// boolean isRegex = hits.getQuery().isLiteral() == false;
// this.chunkIDs = chunkIDs;
}
Content getContent() {

View File: LuceneQuery.java

@@ -239,7 +239,7 @@ class LuceneQuery implements KeywordSearchQuery {
for (Object content_obj : content) {
String content_str = (String) content_obj;
//for new schemas, check that the hit is before the chunk/window boundary.
int firstOccurence = StringUtils.indexOf(content_str.toLowerCase(), strippedQueryString.toLowerCase());
int firstOccurence = StringUtils.indexOfIgnoreCase(content_str, strippedQueryString);
//there is no chunksize field for "parent" entries in the index
if (chunkSize == null || chunkSize == 0 || (firstOccurence > -1 && firstOccurence < chunkSize)) {
matches.add(createKeywordtHit(highlightResponse, docId));
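Index chunks overlap by a small window, so a hit whose first occurrence falls past the chunk boundary belongs to the next chunk and must be filtered out. Switching to StringUtils.indexOfIgnoreCase also avoids allocating two lowercased copies of the strings per document. The boundary test as a standalone predicate (the method name is hypothetical):

import org.apache.commons.lang.StringUtils;

// Sketch: accept a hit only if it starts before the chunk/window boundary.
// chunkSize is null or 0 for un-chunked "parent" entries, which always pass.
static boolean hitIsInThisChunk(String content, String query, Integer chunkSize) {
    int firstOccurrence = StringUtils.indexOfIgnoreCase(content, query);
    return chunkSize == null || chunkSize == 0
            || (firstOccurrence > -1 && firstOccurrence < chunkSize);
}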
@@ -324,7 +324,7 @@ class LuceneQuery implements KeywordSearchQuery {
}
}
return new KeywordHit(docId, snippet);
return new KeywordHit(docId, snippet, keywordString);
}
/**