Finish implementing all the cases for highlighting

commit 6abe26d6de
Author: millmanorama
Date:   2017-02-15 13:48:43 +01:00
Parent: edd03a66c1

5 changed files with 102 additions and 156 deletions

ExtractedContentViewer.java

@@ -110,15 +110,13 @@ public class ExtractedContentViewer implements DataContentViewer {
         BlackboardArtifact artifact = nodeLookup.lookup(BlackboardArtifact.class);
         if (hits != null) {
             highlightedHitText = new HighlightedText(content.getId(), hits);
-        } else {
-            if (artifact != null && artifact.getArtifactTypeID()
-                    == BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT.getTypeID()) {
-                // if the artifact is an account artifact, get an account text .
-                highlightedHitText = getAccountsText(content, nodeLookup);
-            } else if (artifact != null && artifact.getArtifactTypeID()
-                    == BlackboardArtifact.ARTIFACT_TYPE.TSK_KEYWORD_HIT.getTypeID()) {
-                highlightedHitText = new HighlightedText(artifact);
-            }
+        } else if (artifact != null && artifact.getArtifactTypeID()
+                == BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT.getTypeID()) {
+            // if the artifact is an account artifact, get an account text .
+            highlightedHitText = getAccountsText(content, nodeLookup);
+        } else if (artifact != null && artifact.getArtifactTypeID()
+                == BlackboardArtifact.ARTIFACT_TYPE.TSK_KEYWORD_HIT.getTypeID()) {
+            highlightedHitText = new HighlightedText(artifact);
         }
         if (highlightedHitText != null) {
             indexedTextSources.add(highlightedHitText);
@@ -298,16 +296,16 @@ public class ExtractedContentViewer implements DataContentViewer {
             return false;
         }

-        /**
-         * Is there any marked up indexed text in the look up of this node? This
-         * will be the case if the node is for a keyword hit artifact produced
-         * by either an ad hoc keyword search result (keyword search toolbar
-         * widgets) or a keyword search by the keyword search ingest module.
-         */
-        Collection<? extends IndexedText> sources = node.getLookup().lookupAll(IndexedText.class);
-        if (sources.isEmpty() == false) {
-            return true;
-        }
+//        /**
+//         * Is there any marked up indexed text in the look up of this node? This
+//         * will be the case if the node is for a keyword hit artifact produced
+//         * by either an ad hoc keyword search result (keyword search toolbar
+//         * widgets) or a keyword search by the keyword search ingest module.
+//         */
+//        Collection<? extends IndexedText> sources = node.getLookup().lookupAll(IndexedText.class);
+//        if (sources.isEmpty() == false) {
+//            return true;
+//        }

         /*
          * Is there a credit card artifact in the lookup
@@ -315,7 +313,9 @@ public class ExtractedContentViewer implements DataContentViewer {
         Collection<? extends BlackboardArtifact> artifacts = node.getLookup().lookupAll(BlackboardArtifact.class);
         if (artifacts != null) {
             for (BlackboardArtifact art : artifacts) {
-                if (art.getArtifactTypeID() == BlackboardArtifact.ARTIFACT_TYPE.TSK_ACCOUNT.getTypeID()) {
+                final int artifactTypeID = art.getArtifactTypeID();
+                if (artifactTypeID == TSK_ACCOUNT.getTypeID()
+                        || artifactTypeID == TSK_KEYWORD_HIT.getTypeID()) {
                     return true;
                 }
             }
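The rewritten check reads the artifact type ID once and then accepts either of the two supported types; the unqualified TSK_ACCOUNT / TSK_KEYWORD_HIT references imply static imports of those ARTIFACT_TYPE constants added elsewhere in the file, outside this hunk. A minimal standalone sketch of the same membership test, with hypothetical type IDs rather than the real TSK values, shows how a third supported type becomes a one-line change:

    // SupportedTypesSketch.java -- standalone illustration, not Autopsy code.
    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;

    public class SupportedTypesSketch {

        // Hypothetical type IDs standing in for TSK_ACCOUNT / TSK_KEYWORD_HIT.
        private static final int ACCOUNT_TYPE_ID = 1;
        private static final int KEYWORD_HIT_TYPE_ID = 2;

        private static final Set<Integer> SUPPORTED_TYPE_IDS = new HashSet<>(
                Arrays.asList(ACCOUNT_TYPE_ID, KEYWORD_HIT_TYPE_ID));

        // Same shape as the loop in the hunk: fetch the type ID once, then
        // test membership, so supporting another type means adding one ID.
        static boolean anySupported(Iterable<Integer> artifactTypeIDs) {
            for (int artifactTypeID : artifactTypeIDs) {
                if (SUPPORTED_TYPE_IDS.contains(artifactTypeID)) {
                    return true;
                }
            }
            return false;
        }
    }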

HighlightedText.java

@@ -18,7 +18,6 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;

-import com.ibm.icu.text.UnicodeSet;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -28,7 +27,6 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
-import java.util.function.Function;
 import java.util.logging.Level;
 import java.util.stream.Collectors;
 import org.apache.commons.lang.StringEscapeUtils;
@@ -40,14 +38,13 @@ import org.apache.solr.common.SolrDocumentList;
 import org.openide.util.Exceptions;
 import org.openide.util.NbBundle;
 import org.openide.util.NbBundle.Messages;
-import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
 import org.sleuthkit.autopsy.coreutils.Version;
 import org.sleuthkit.autopsy.keywordsearch.KeywordQueryFilter.FilterType;
+import org.sleuthkit.autopsy.keywordsearch.KeywordSearch.QueryType;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.BlackboardAttribute;
-import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskCoreException;

 /**
@@ -89,6 +86,7 @@ class HighlightedText implements IndexedText {
     private boolean isPageInfoLoaded = false;
     private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
     private BlackboardArtifact artifact;
+    private KeywordSearch.QueryType qt;

     /**
      * This constructor is used when keyword hits are accessed from the ad-hoc
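The new qt field caches the query type recorded on the artifact. It is decoded in loadPageInfoFromArtifact() below by indexing KeywordSearch.QueryType.values() with the attribute's integer value, guarded against a missing attribute. A standalone sketch of that ordinal-decoding pattern, using a hypothetical enum (the real constants may differ) and adding a bounds check the commit itself does not include:

    // OrdinalDecodeSketch.java -- standalone illustration, not Autopsy code.
    public class OrdinalDecodeSketch {

        // Hypothetical stand-in for KeywordSearch.QueryType.
        enum QueryType { LITERAL, REGEX }

        // Decode an enum persisted by ordinal, tolerating a missing attribute
        // (null) and an ordinal written by an unknown schema version.
        static QueryType decode(Integer storedOrdinal) {
            QueryType[] values = QueryType.values();
            if (storedOrdinal == null || storedOrdinal < 0 || storedOrdinal >= values.length) {
                return null;
            }
            return values[storedOrdinal];
        }

        public static void main(String[] args) {
            System.out.println(decode(1));    // REGEX
            System.out.println(decode(null)); // null: attribute absent
            System.out.println(decode(7));    // null: out of range
        }
    }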
@@ -123,24 +121,51 @@ class HighlightedText implements IndexedText {
     }

     private void loadPageInfoFromArtifact() throws TskCoreException, NumberFormatException {
-        KeywordSearch.QueryType qt = KeywordSearch.QueryType.values()[artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE).getValueInt()];
-        this.keywords.add(artifact.getAttribute(TSK_KEYWORD).getValueString());
-        String chunkIDsString = artifact.getAttribute(TSK_KEYWORD_HIT_DOCUMENT_IDS).getValueString();
-        Set<String> chunkIDs = Arrays.stream(chunkIDsString.split(",")).map(StringUtils::strip).collect(Collectors.toSet());
-        for (String solrDocumentId : chunkIDs) {
-            int chunkID;
-            final int separatorIndex = solrDocumentId.indexOf(Server.CHUNK_ID_SEPARATOR);
-            if (-1 != separatorIndex) {
-                chunkID = Integer.parseInt(solrDocumentId.substring(separatorIndex + 1));
-            } else {
-                chunkID = 0;
-            }
-            pages.add(chunkID);
-            numberOfHitsPerPage.put(chunkID, 0);
-            currentHitPerPage.put(chunkID, 0);
-        }
+        final String keyword = artifact.getAttribute(TSK_KEYWORD).getValueString();
+        this.keywords.add(keyword);
+        final BlackboardAttribute qtAttribute = artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE);
+        qt = (qtAttribute != null)
+                ? KeywordSearch.QueryType.values()[qtAttribute.getValueInt()] : null;
+        final BlackboardAttribute docIDsArtifact = artifact.getAttribute(TSK_KEYWORD_HIT_DOCUMENT_IDS);
+
+        if (qt == QueryType.REGEX && docIDsArtifact != null) {
+            //regex search records the chunks in the artifact
+            String chunkIDsString = docIDsArtifact.getValueString();
+            Set<String> chunkIDs = Arrays.stream(chunkIDsString.split(",")).map(StringUtils::strip).collect(Collectors.toSet());
+            for (String solrDocumentId : chunkIDs) {
+                int chunkID;
+                final int separatorIndex = solrDocumentId.indexOf(Server.CHUNK_ID_SEPARATOR);
+                if (-1 != separatorIndex) {
+                    chunkID = Integer.parseInt(solrDocumentId.substring(separatorIndex + 1));
+                } else {
+                    chunkID = 0;
+                }
+                pages.add(chunkID);
+                numberOfHitsPerPage.put(chunkID, 0);
+                currentHitPerPage.put(chunkID, 0);
+            }
+            this.currentPage = pages.stream().sorted().findFirst().orElse(1);
+            isPageInfoLoaded = true;
+        } else {
+            /*
+             * non-regex searches don't record the chunks in the artifacts, so
+             * we need to look them up
+             */
+            Keyword keywordQuery = new Keyword(keyword, true);
+            KeywordSearchQuery chunksQuery
+                    = new LuceneQuery(new KeywordList(Arrays.asList(keywordQuery)), keywordQuery);
+            chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId));
+            try {
+                hits = chunksQuery.performQuery();
+                loadPageInfoFromHits();
+            } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
+                logger.log(Level.SEVERE, "Could not perform the query to get chunk info and get highlights:" + keywordQuery.getSearchTerm(), ex); //NON-NLS
+                MessageNotifyUtil.Notify.error(Bundle.HighlightedText_query_exception_msg() + keywordQuery.getSearchTerm(), ex.getCause().getMessage());
+            }
+        }
     }
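The regex branch parses the chunk list stored on the artifact: a comma-separated set of Solr document IDs, where the chunk number follows Server.CHUNK_ID_SEPARATOR and a bare object ID means the unchunked parent document (chunk 0). The non-regex branch instead re-runs a LuceneQuery with a CHUNK filter and defers to loadPageInfoFromHits(). A standalone sketch of the parsing step, assuming an underscore separator and an "<objectId><sep><chunkId>" layout:

    // ChunkIdSketch.java -- standalone illustration, not Autopsy code.
    import java.util.Arrays;
    import java.util.Set;
    import java.util.TreeSet;
    import java.util.stream.Collectors;

    public class ChunkIdSketch {

        // Assumption: stand-in for Server.CHUNK_ID_SEPARATOR.
        static final String CHUNK_ID_SEPARATOR = "_";

        // Mirror of the loop above: strip each ID, take the suffix after the
        // separator as the chunk number, and treat an unchunked ID as chunk 0.
        static Set<Integer> parseChunkIds(String chunkIDsString) {
            return Arrays.stream(chunkIDsString.split(","))
                    .map(String::trim)
                    .map(id -> {
                        int sep = id.indexOf(CHUNK_ID_SEPARATOR);
                        return (sep == -1) ? 0 : Integer.parseInt(id.substring(sep + 1));
                    })
                    .collect(Collectors.toCollection(TreeSet::new));
        }

        public static void main(String[] args) {
            // e.g. hits in chunks 2 and 5 of object 8103, plus the parent doc
            System.out.println(parseChunkIds("8103_2, 8103_5, 8103")); // [0, 2, 5]
        }
    }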
@@ -154,45 +179,6 @@ class HighlightedText implements IndexedText {
      *
      * @return
      */
-    static private String getHighlightQuery(KeywordSearchQuery query, boolean literal_query, QueryResults queryResults, Content content) {
-        if (literal_query) {
-            //literal, treat as non-regex, non-term component query
-            return constructEscapedSolrQuery(query.getQueryString());
-        } else //construct a Solr query using aggregated terms to get highlighting
-        //the query is executed later on demand
-        {
-            if (queryResults.getKeywords().size() == 1) {
-                //simple case, no need to process subqueries and do special escaping
-                Keyword keyword = queryResults.getKeywords().iterator().next();
-                return constructEscapedSolrQuery(keyword.getSearchTerm());
-            } else {
-                //find terms for this content hit
-                List<Keyword> hitTerms = new ArrayList<>();
-                for (Keyword keyword : queryResults.getKeywords()) {
-                    for (KeywordHit hit : queryResults.getResults(keyword)) {
-                        if (hit.getContent().equals(content)) {
-                            hitTerms.add(keyword);
-                            break; //go to next term
-                        }
-                    }
-                }
-                StringBuilder highlightQuery = new StringBuilder();
-                final int lastTerm = hitTerms.size() - 1;
-                int curTerm = 0;
-                for (Keyword term : hitTerms) {
-                    //escape subqueries, MAKE SURE they are not escaped again later
-                    highlightQuery.append(constructEscapedSolrQuery(term.getSearchTerm()));
-                    if (lastTerm != curTerm) {
-                        highlightQuery.append(" "); //acts as OR ||
-                    }
-                    ++curTerm;
-                }
-                return highlightQuery.toString();
-            }
-        }
-    }

     /**
      * Constructs a complete, escaped Solr query that is ready to be used.
@@ -236,9 +222,7 @@ class HighlightedText implements IndexedText {
              */ loadPageInfoFromArtifact();
         } else if (hasChunks) {
             // if the file has chunks, get pages with hits, sorted
-            if (loadPageInfoFromHits()) {
-                //JMTOD: look at error handeling and return values...
-            }
+            loadPageInfoFromHits();
         } else {
             //non-regex, no chunks
             this.numberPages = 1;
@@ -246,29 +230,12 @@ class HighlightedText implements IndexedText {
             numberOfHitsPerPage.put(1, 0);
             pages.add(1);
             currentHitPerPage.put(1, 0);
-            isPageInfoLoaded = true;
         }
+        isPageInfoLoaded = true;
     }

-    private boolean loadPageInfoFromHits() {
-//        /*
-//         * If this is being called from the artifacts / dir tree, then we need
-//         * to perform the search to get the highlights.
-//         */
-//        if (hits == null) {
-//
-//            Keyword keywordQuery = new Keyword(keywordHitQuery, true);
-//            KeywordSearchQuery chunksQuery
-//                    = new LuceneQuery(new KeywordList(Arrays.asList(keywordQuery)), keywordQuery);
-//            chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId));
-//            try {
-//                hits = chunksQuery.performQuery();
-//            } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
-//                logger.log(Level.SEVERE, "Could not perform the query to get chunk info and get highlights:" + keywordQuery.getSearchTerm(), ex); //NON-NLS
-//                MessageNotifyUtil.Notify.error(Bundle.HighlightedText_query_exception_msg() + keywordQuery.getSearchTerm(), ex.getCause().getMessage());
-//                return true;
-//            }
-////        }
+    private void loadPageInfoFromHits() {
         //organize the hits by page, filter as needed
         TreeSet<Integer> pagesSorted = new TreeSet<>();
@@ -277,8 +244,9 @@ class HighlightedText implements IndexedText {
                 int chunkID = hit.getChunkId();
                 if (chunkID != 0 && this.objectId == hit.getSolrObjectId()) {
                     pagesSorted.add(chunkID);
-                    this.keywords.add(hit.getHit());
+                    if (StringUtils.isNotBlank(hit.getHit())) {
+                        this.keywords.add(hit.getHit());
+                    }
                 }
             }
         }
@@ -293,7 +261,7 @@ class HighlightedText implements IndexedText {
             pages.add(page);
             currentHitPerPage.put(page, 0); //set current hit to 0th
         }
-        return false;
+        isPageInfoLoaded = true;
     }

     @Override
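loadPageInfoFromHits() now returns void and records completion in the isPageInfoLoaded flag; its old boolean return carried no information, and one call site even wrapped it in an empty if with a "JMTOD" note. A minimal sketch of one way the resulting flag-guarded pattern can be used, under the assumption that callers may invoke the loader more than once:

    // LazyPageInfoSketch.java -- standalone illustration, not Autopsy code.
    public class LazyPageInfoSketch {

        private boolean isPageInfoLoaded = false;

        // Completion is recorded in the flag rather than a return value, so
        // callers can invoke this unconditionally and repeat calls are cheap.
        void loadPageInfo() {
            if (isPageInfoLoaded) {
                return; // already populated
            }
            // ... populate pages, per-page hit counts, current page ...
            isPageInfoLoaded = true;
        }
    }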
@@ -410,26 +378,29 @@ class HighlightedText implements IndexedText {
         }

         final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
-//        if (isRegex) {
+        if (artifact != null && qt == QueryType.REGEX) {
             q.setQuery(filterQuery);
             q.addField(Server.Schema.CONTENT_STR.toString());
-//        } else {
-//            // input query has already been properly constructed and escaped
-//            q.setQuery(keywordHitQuery);
-//            q.addField(Server.Schema.TEXT.toString());
-//            q.addFilterQuery(filterQuery);
-//            q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD);
-//            q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
-//
-//            //tune the highlighter
-//            q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
-//            q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
-//            q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
-//            q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
-//
-//            //docs says makes sense for the original Highlighter only, but not really
-//            q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
-//        }
+        } else {
+            final String highlightQuery = keywords.stream()
+                    .map(HighlightedText::constructEscapedSolrQuery)
+                    .collect(Collectors.joining(" "));
+
+            q.setQuery(highlightQuery);
+            q.addField(Server.Schema.TEXT.toString());
+            q.addFilterQuery(filterQuery);
+            q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD);
+            q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
+
+            //tune the highlighter
+            q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
+            q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
+            q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
+            q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
+
+            //docs says makes sense for the original Highlighter only, but not really
+            q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
+        }

         try {
             QueryResponse response = solrServer.query(q, METHOD.POST);
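In the else branch the highlight query is rebuilt from this.keywords: each term goes through constructEscapedSolrQuery() and the results are joined with spaces, which Solr's default OR operator treats as "any of these terms". A standalone sketch of that aggregation, with a simplified quoting helper standing in for the real escaping routine:

    // HighlightQuerySketch.java -- standalone illustration, not Autopsy code.
    import java.util.Arrays;
    import java.util.List;
    import java.util.stream.Collectors;

    public class HighlightQuerySketch {

        // Stand-in for constructEscapedSolrQuery(): quote the term so Lucene
        // operators inside it are treated literally. The real helper may
        // escape differently and target a specific field.
        static String escapeTerm(String term) {
            return "\"" + term.replace("\"", "\\\"") + "\"";
        }

        // Join the per-hit terms with spaces; with Solr's default OR
        // operator, matching any one term produces highlighting.
        static String buildHighlightQuery(List<String> hitTerms) {
            return hitTerms.stream()
                    .map(HighlightQuerySketch::escapeTerm)
                    .collect(Collectors.joining(" "));
        }

        public static void main(String[] args) {
            System.out.println(buildHighlightQuery(Arrays.asList("jazz", "rock and roll")));
            // "jazz" "rock and roll"
        }
    }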
@@ -532,8 +503,7 @@ class HighlightedText implements IndexedText {
         for (String unquotedKeyword : keywords) {
             int textOffset = 0;
             int hitOffset;
-            while ((hitOffset = text.indexOf(unquotedKeyword, textOffset)) != -1) {
+            while ((hitOffset = StringUtils.indexOfIgnoreCase(text, unquotedKeyword, textOffset)) != -1) {
                 // Append the portion of text up to (but not including) the hit.
                 highlightedText.append(text.substring(textOffset, hitOffset));
                 // Add in the highlighting around the keyword.
@@ -542,12 +512,11 @@ class HighlightedText implements IndexedText {
                 highlightedText.append(HIGHLIGHT_POST);
                 // Advance the text offset past the keyword.
-                textOffset = hitOffset + unquotedKeyword.length() + 1;
+                textOffset = hitOffset + unquotedKeyword.length();
             }
+            // Append the remainder of text field
+            highlightedText.append(text.substring(textOffset, text.length()));

             if (highlightedText.length() > 0) {
-                // Append the remainder of text field and return.
-                highlightedText.append(text.substring(textOffset, text.length()));
             } else {
                 return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
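These two hunks fix three defects in the manual (non-Solr) highlighter: matching is now case-insensitive via StringUtils.indexOfIgnoreCase, the offset advance drops a stray "+ 1" that skipped (and omitted from the output) the character right after every hit, and the tail of the text after the last hit is appended unconditionally rather than only inside the success branch. A standalone sketch combining the three fixes, with hypothetical markup tags:

    // ManualHighlightSketch.java -- standalone illustration, not Autopsy code.
    import org.apache.commons.lang.StringUtils;

    public class ManualHighlightSketch {

        static final String PRE = "<span class=\"highlight\">"; // hypothetical markup
        static final String POST = "</span>";

        // Case-insensitive in-text highlighting: advance by exactly the
        // keyword length (the old "+ 1" dropped the next character), and
        // always append the tail of the text after the last hit.
        static String highlight(String text, String keyword) {
            StringBuilder out = new StringBuilder();
            int textOffset = 0;
            int hitOffset;
            while ((hitOffset = StringUtils.indexOfIgnoreCase(text, keyword, textOffset)) != -1) {
                out.append(text, textOffset, hitOffset);
                out.append(PRE);
                out.append(text, hitOffset, hitOffset + keyword.length()); // keep original casing
                out.append(POST);
                textOffset = hitOffset + keyword.length();
            }
            out.append(text.substring(textOffset));
            return out.toString();
        }

        public static void main(String[] args) {
            System.out.println(highlight("Visa and VISA and visas", "visa"));
        }
    }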

KeywordSearchFilterNode.java

@@ -45,7 +45,7 @@ import org.sleuthkit.datamodel.File;
  */
 class KeywordSearchFilterNode extends FilterNode {

-    KeywordSearchFilterNode(HighlightedText highlights, Node original) {
+    KeywordSearchFilterNode(QueryResults highlights, Node original) {
         super(original, null, new ProxyLookup(Lookups.singleton(highlights), original.getLookup()));
     }

KeywordSearchResultFactory.java

@@ -147,7 +147,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
         int hitNumber = 0;
         List<KeyValueQueryContent> tempList = new ArrayList<>();

-//        final SetMultimap<Long, KeywordHit> orgnizeByDocID = orgnizeByDocID(queryResults);
         for (KeywordHit hit : getOneHitPerObject(queryResults)) {

             /**
@@ -169,12 +168,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
                 properties.put(TSK_KEYWORD_PREVIEW.getDisplayName(), hit.getSnippet());
             }

-            //JMTODO: I don't understand this comment or the below code...
-            //@@@ USE ConentHit in UniqueFileMap instead of the below search
-            //get unique match result files
-            // BC: @@@ THis is really ineffecient. We should keep track of this when
-            // we flattened the list of files to the unique files.
-            // final String highlightQueryEscaped = getHighlightQuery(queryRequest, queryRequest.isLiteral(), queryResults, content);
             String hitName = hit.isArtifactHit()
                     ? hit.getArtifact().getDisplayName() + " Artifact" //NON-NLS
                     : contentName;
@@ -220,18 +213,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
         return hits.values();
     }

-    SetMultimap<Long, KeywordHit> orgnizeByDocID(QueryResults queryResults) {
-        SetMultimap<Long, KeywordHit> hits = TreeMultimap.create(Long::compare, Comparator.comparing(KeywordHit::getChunkId));
-        for (Keyword keyWord : queryResults.getKeywords()) {
-            for (KeywordHit hit : queryResults.getResults(keyWord)) {
-                hits.put(hit.getSolrObjectId(), hit);
-            }
-        }
-        return hits;
-    }
-
     @Override
     protected Node createNodeForKey(KeyValueQueryContent key) {
         final Content content = key.getContent();
@@ -240,9 +221,7 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
         Node kvNode = new KeyValueNode(key, Children.LEAF, Lookups.singleton(content));
         //wrap in KeywordSearchFilterNode for the markup content, might need to override FilterNode for more customization
-        // store the data in HighlightedMatchesSource so that it can be looked up (in content viewer)
-        HighlightedText highlights = new HighlightedText(key.getSolrObjectId(), hits);
-        return new KeywordSearchFilterNode(highlights, kvNode);
+        return new KeywordSearchFilterNode(hits, kvNode);
     }

     /**
@@ -277,8 +256,6 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
             this.hits = hits;
             this.query = query;
-//            boolean isRegex = hits.getQuery().isLiteral() == false;
-//            this.chunkIDs = chunkIDs;
         }

         Content getContent() {

LuceneQuery.java

@@ -239,7 +239,7 @@ class LuceneQuery implements KeywordSearchQuery {
         for (Object content_obj : content) {
             String content_str = (String) content_obj;
             //for new schemas, check that the hit is before the chunk/window boundary.
-            int firstOccurence = StringUtils.indexOf(content_str.toLowerCase(), strippedQueryString.toLowerCase());
+            int firstOccurence = StringUtils.indexOfIgnoreCase(content_str, strippedQueryString);
             //there is no chunksize field for "parent" entries in the index
             if (chunkSize == null || chunkSize == 0 || (firstOccurence > -1 && firstOccurence < chunkSize)) {
                 matches.add(createKeywordtHit(highlightResponse, docId));
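Replacing toLowerCase()+indexOf with StringUtils.indexOfIgnoreCase avoids two throwaway lowercased copies and, more subtly, guarantees the returned index is an offset into the original string: case mapping can change a string's length, and firstOccurence is compared against chunkSize, so an index into a lowercased copy can drift. A small demonstration of the length change:

    // OffsetDriftSketch.java -- why lowercasing before indexOf can corrupt offsets.
    public class OffsetDriftSketch {
        public static void main(String[] args) {
            String s = "İstanbul chunk boundary test"; // starts with U+0130, dotted capital I
            // In a non-Turkish default locale, lowercasing U+0130 yields TWO
            // chars ("i" plus a combining dot), shifting every later index by one:
            System.out.println(s.length());               // 28
            System.out.println(s.toLowerCase().length()); // 29
            // An index found in s.toLowerCase() is therefore not a valid index
            // into s -- bad news when comparing firstOccurence against chunkSize.
        }
    }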
@@ -324,7 +324,7 @@ class LuceneQuery implements KeywordSearchQuery {
             }
         }

-        return new KeywordHit(docId, snippet);
+        return new KeywordHit(docId, snippet, keywordString);
     }

     /**