Merge pull request #3187 from esaunders/release-4.5.1

Reduce memory consumed by keyword search hits
commit bb0f25ae02 by Richard Cordovano, 2017-11-13 13:26:28 -05:00 (committed by GitHub)
6 changed files with 93 additions and 41 deletions
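
The memory reduction in the keyword-search diffs below comes from two changes: KeywordHit no longer stores the Solr document id string or eagerly resolves the underlying content id (the solrDocumentId, contentID, and hitOnArtifact fields are dropped), and RegexQuery interns hit strings and records at most one hit per term per document. The remaining diffs add a UI regression-test step that expands the Data Sources tree. A rough sketch of the slimmed-down hit object, using a made-up SlimKeywordHit name and only the fields this commit keeps:

// Illustrative only (SlimKeywordHit is not a real class): the shape of a hit after
// this commit, keeping just the parsed Solr ids, the snippet, and the interned term.
final class SlimKeywordHit {

    private final long solrObjectId; // negative for hits on artifact text
    private final int chunkId;
    private final String snippet;
    private final String hit;        // interned by the code that creates the hit

    SlimKeywordHit(long solrObjectId, int chunkId, String snippet, String hit) {
        this.solrObjectId = solrObjectId;
        this.chunkId = chunkId;
        this.snippet = snippet;
        this.hit = hit;
    }

    boolean isArtifactHit() {
        return solrObjectId < 0;
    }

    String getSolrDocumentId() {
        // Rebuilt on demand; "_" stands in for Server.CHUNK_ID_SEPARATOR.
        return solrObjectId + "_" + chunkId;
    }
}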


@@ -205,7 +205,13 @@ class HighlightedText implements IndexedText {
*/
synchronized private void loadPageInfoFromHits() {
isLiteral = hits.getQuery().isLiteral();
//organize the hits by page, filter as needed
/**
* Organize the hits by page, filter as needed.
* We process *every* keyword here because in the case of a regular
* expression search there may be multiple different keyword
* hits located in different chunks for the same file/artifact.
*/
for (Keyword k : hits.getKeywords()) {
for (KeywordHit hit : hits.getResults(k)) {
int chunkID = hit.getChunkId();
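
The new comment in loadPageInfoFromHits() is the substantive part of this hunk: for regular-expression searches a single file or artifact can produce hits on several different terms, each landing in a different Solr chunk, so every keyword's result list has to be walked to build the page map. A minimal sketch of that grouping (not the real method), assuming the QueryResults, Keyword, and KeywordHit types used in the loop above and java.util.HashMap/Map:

// Sketch only: tally hits per chunk so that each chunk can later be presented
// as one page of highlighted text.
static Map<Integer, Integer> countHitsPerChunk(QueryResults hits) {
    Map<Integer, Integer> hitsPerChunk = new HashMap<>();
    for (Keyword k : hits.getKeywords()) {
        for (KeywordHit hit : hits.getResults(k)) {
            hitsPerChunk.merge(hit.getChunkId(), 1, Integer::sum);
        }
    }
    return hitsPerChunk;
}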


@@ -38,12 +38,9 @@ class KeywordHit implements Comparable<KeywordHit> {
private static final String GET_CONTENT_ID_FROM_ARTIFACT_ID = "SELECT obj_id FROM blackboard_artifacts WHERE artifact_id = ";
private final String solrDocumentId;
private final long solrObjectId;
private final int chunkId;
private final String snippet;
private final long contentID;
private final boolean hitOnArtifact;
private final String hit;
/**
@@ -56,14 +53,10 @@ class KeywordHit implements Comparable<KeywordHit> {
* For some searches (i.e. substring, regex) this will be
* different than the search term.
*
* @throws TskCoreException If there is a problem getting the underlying
* content associated with a hit on the text of an
* artifact.
*/
KeywordHit(String solrDocumentId, String snippet, String hit) throws TskCoreException {
KeywordHit(String solrDocumentId, String snippet, String hit) {
this.snippet = StringUtils.stripToEmpty(snippet);
this.hit = hit;
this.solrDocumentId = solrDocumentId;
/*
* Parse the Solr document id to get the Solr object id and chunk id.
@@ -83,28 +76,6 @@ class KeywordHit implements Comparable<KeywordHit> {
this.solrObjectId = Long.parseLong(split[0]);
this.chunkId = Integer.parseInt(split[1]);
}
//artifacts have negative obj ids
hitOnArtifact = this.solrObjectId < 0;
if (hitOnArtifact) {
// If the hit was in an artifact, look up the source content for the artifact.
SleuthkitCase caseDb = Case.getCurrentCase().getSleuthkitCase();
try (SleuthkitCase.CaseDbQuery executeQuery =
caseDb.executeQuery(GET_CONTENT_ID_FROM_ARTIFACT_ID + this.solrObjectId);
ResultSet resultSet = executeQuery.getResultSet();) {
if (resultSet.next()) {
contentID = resultSet.getLong("obj_id");
} else {
throw new TskCoreException("Failed to get obj_id for artifact with artifact_id =" + this.solrObjectId + ". No matching artifact was found.");
}
} catch (SQLException ex) {
throw new TskCoreException("Error getting obj_id for artifact with artifact_id =" + this.solrObjectId, ex);
}
} else {
//else the object id is for content.
contentID = this.solrObjectId;
}
}
String getHit() {
@@ -112,7 +83,7 @@ class KeywordHit implements Comparable<KeywordHit> {
}
String getSolrDocumentId() {
return this.solrDocumentId;
return Long.toString(solrObjectId) + Server.CHUNK_ID_SEPARATOR + Long.toString(chunkId);
}
long getSolrObjectId() {
@@ -131,8 +102,36 @@ class KeywordHit implements Comparable<KeywordHit> {
return this.snippet;
}
long getContentID() {
return this.contentID;
/**
* Get the content id associated with the content underlying the hit.
* For hits on files this will be the same as the object id associated
* with the file. For hits on artifacts we look up the id of the object
* that produced the artifact.
*
* @return The id of the underlying content associated with the hit.
* @throws TskCoreException If there is a problem getting the underlying
* content associated with a hit on the text of an
* artifact.
*/
long getContentID() throws TskCoreException {
if (isArtifactHit()) {
// If the hit was in an artifact, look up the source content for the artifact.
SleuthkitCase caseDb = Case.getCurrentCase().getSleuthkitCase();
try (SleuthkitCase.CaseDbQuery executeQuery =
caseDb.executeQuery(GET_CONTENT_ID_FROM_ARTIFACT_ID + this.solrObjectId);
ResultSet resultSet = executeQuery.getResultSet();) {
if (resultSet.next()) {
return resultSet.getLong("obj_id");
} else {
throw new TskCoreException("Failed to get obj_id for artifact with artifact_id =" + this.solrObjectId + ". No matching artifact was found.");
}
} catch (SQLException ex) {
throw new TskCoreException("Error getting obj_id for artifact with artifact_id =" + this.solrObjectId, ex);
}
} else {
//else the object id is for content.
return this.solrObjectId;
}
}
/**
@@ -141,7 +140,8 @@ class KeywordHit implements Comparable<KeywordHit> {
* @return
*/
boolean isArtifactHit() {
return hitOnArtifact;
// artifacts have negative obj ids
return this.solrObjectId < 0;
}
/**
@@ -150,7 +150,7 @@ class KeywordHit implements Comparable<KeywordHit> {
* @return The artifact whose indexed text this hit is in.
*/
Optional<Long> getArtifactID() {
if (hitOnArtifact) {
if (isArtifactHit()) {
return Optional.of(solrObjectId);
} else {
return Optional.empty();


@@ -226,6 +226,8 @@ final class RegexQuery implements KeywordSearchQuery {
private List<KeywordHit> createKeywordHits(SolrDocument solrDoc) throws TskCoreException {
final HashMap<String, String> keywordsFoundInThisDocument = new HashMap<>();
List<KeywordHit> hits = new ArrayList<>();
final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();
final Integer chunkSize = (Integer) solrDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
@@ -273,6 +275,23 @@ final class RegexQuery implements KeywordSearchQuery {
hit = hit.replaceAll("[^0-9]$", "");
}
/**
* The use of String interning is an optimization to ensure
* that we reuse the same keyword hit String object across
* all hits. Even though we benefit from G1GC String
* deduplication, the overhead associated with creating a
* new String object for every KeywordHit can be significant
* when the number of hits gets large.
*/
hit = hit.intern();
// We will only create one KeywordHit instance per document for
// a given hit.
if (keywordsFoundInThisDocument.containsKey(hit)) {
continue;
}
keywordsFoundInThisDocument.put(hit, hit);
if (artifactAttributeType == null) {
hits.add(new KeywordHit(docId, makeSnippet(content, hitMatcher, hit), hit));
} else {
@@ -303,7 +322,7 @@ final class RegexQuery implements KeywordSearchQuery {
final String group = ccnMatcher.group("ccn");
if (CreditCardValidator.isValidCCN(group)) {
hits.add(new KeywordHit(docId, makeSnippet(content, hitMatcher, hit), hit));
};
}
}
}
@@ -316,8 +335,6 @@ final class RegexQuery implements KeywordSearchQuery {
}
}
} catch (TskCoreException ex) {
throw ex;
} catch (Throwable error) {
/*
* NOTE: Matcher.find() is known to throw StackOverflowError in rare
@@ -447,7 +464,7 @@ final class RegexQuery implements KeywordSearchQuery {
if (hit.isArtifactHit()) {
LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for artifact keyword hit: term = %s, snippet = '%s', artifact id = %d", foundKeyword.getSearchTerm(), hit.getSnippet(), hit.getArtifactID().get())); //NON-NLS
} else {
LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for content keyword hit: term = %s, snippet = '%s', object id = %d", foundKeyword.getSearchTerm(), hit.getSnippet(), hit.getContentID())); //NON-NLS
LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for content keyword hit: term = %s, snippet = '%s', object id = %d", foundKeyword.getSearchTerm(), hit.getSnippet(), hit.getSolrObjectId())); //NON-NLS
}
return null;
}
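
Two allocations are avoided in the RegexQuery hunk above: hit.intern() makes every occurrence of the same matched term share one canonical String across all documents, and the keywordsFoundInThisDocument map short-circuits the loop so that only the first occurrence of a term in a document produces a KeywordHit. A compact sketch of the same idea (a HashSet serves the same purpose as the HashMap used in the diff; requires java.util.HashSet, java.util.Set, java.util.regex.Matcher):

// Sketch only: one interned String per distinct term, at most one hit per term
// per document.
static Set<String> collectDistinctTerms(Matcher hitMatcher) {
    Set<String> termsSeen = new HashSet<>();
    while (hitMatcher.find()) {
        String term = hitMatcher.group().intern(); // canonical, shared instance
        if (termsSeen.add(term)) {
            // First time this term is seen in the document: create the hit here.
        }
    }
    return termsSeen;
}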


@@ -366,7 +366,7 @@ final class TermsComponentQuery implements KeywordSearchQuery {
if (hit.isArtifactHit()) {
LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for artifact keyword hit: term = %s, snippet = '%s', artifact id = %d", searchTerm, hit.getSnippet(), hit.getArtifactID().get())); //NON-NLS
} else {
LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for content keyword hit: term = %s, snippet = '%s', object id = %d", searchTerm, hit.getSnippet(), hit.getContentID())); //NON-NLS
LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for content keyword hit: term = %s, snippet = '%s', object id = %d", searchTerm, hit.getSnippet(), hit.getSolrObjectId())); //NON-NLS
}
return null;
}


@@ -36,6 +36,7 @@ import java.util.logging.Logger;
import javax.imageio.ImageIO;
import javax.swing.JDialog;
import javax.swing.text.JTextComponent;
import javax.swing.tree.TreePath;
import org.netbeans.jellytools.MainWindowOperator;
import org.netbeans.jellytools.NbDialogOperator;
import org.netbeans.jellytools.WizardOperator;
@@ -53,6 +54,8 @@ import org.netbeans.jemmy.operators.JTabbedPaneOperator;
import org.netbeans.jemmy.operators.JTableOperator;
import org.netbeans.jemmy.operators.JTextFieldOperator;
import org.netbeans.jemmy.operators.JToggleButtonOperator;
import org.netbeans.jemmy.operators.JTreeOperator;
import org.netbeans.jemmy.operators.JTreeOperator.NoSuchPathException;
import org.sleuthkit.autopsy.core.UserPreferences;
import org.sleuthkit.autopsy.core.UserPreferencesException;
import org.sleuthkit.autopsy.events.MessageServiceConnectionInfo;
@@ -290,6 +293,16 @@ public class AutopsyTestCases {
}
public void testExpandDataSourcesTree() {
logger.info("Data Sources Node");
MainWindowOperator mwo = MainWindowOperator.getDefault();
JTreeOperator jto = new JTreeOperator(mwo, "Data Sources");
String [] nodeNames = {"Data Sources"};
TreePath tp = jto.findPath(nodeNames);
expandNodes(jto, tp);
screenshot("Data Sources Tree");
}
public void testGenerateReportToolbar() {
logger.info("Generate Report Toolbars");
MainWindowOperator mwo = MainWindowOperator.getDefault();
@@ -380,4 +393,15 @@ public class AutopsyTestCases {
logger.log(Level.SEVERE, "Error saving messaging service connection info", ex); //NON-NLS
}
}
private void expandNodes (JTreeOperator jto, TreePath tp) {
try {
jto.expandPath(tp);
for (TreePath t : jto.getChildPaths(tp)) {
expandNodes(jto, t);
}
} catch (NoSuchPathException ne) {
logger.log(Level.SEVERE, "Error expanding tree path", ne);
}
}
}
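
The new expandNodes() helper above recurses over jto.getChildPaths(tp) and logs, rather than fails, when a path disappears mid-expansion (NoSuchPathException). An equivalent iterative form using the same Jemmy calls, sketched only to show that the recursion could be flattened if depth ever became a concern; it omits the exception handling and needs java.util.ArrayDeque, java.util.Deque, javax.swing.tree.TreePath, and org.netbeans.jemmy.operators.JTreeOperator:

// Sketch: the same expansion written with an explicit stack instead of recursion.
private void expandAll(JTreeOperator jto, TreePath root) {
    Deque<TreePath> toExpand = new ArrayDeque<>();
    toExpand.push(root);
    while (!toExpand.isEmpty()) {
        TreePath current = toExpand.pop();
        jto.expandPath(current);
        for (TreePath child : jto.getChildPaths(current)) {
            toExpand.push(child);
        }
    }
}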


@@ -69,6 +69,7 @@ public class RegressionTest extends TestCase {
"testConfigureSearch",
"testAddSourceWizard1",
"testIngest",
"testExpandDataSourcesTree", //After do ingest, before generate report, we expand Data Sources node
"testGenerateReportToolbar",
"testGenerateReportButton");
}
@@ -83,6 +84,7 @@ public class RegressionTest extends TestCase {
"testConfigureSearch",
"testAddSourceWizard1",
"testIngest",
"testExpandDataSourcesTree",
"testGenerateReportToolbar",
"testGenerateReportButton");
}
@@ -147,6 +149,9 @@ public class RegressionTest extends TestCase {
autopsyTests.testIngest();
}
public void testExpandDataSourcesTree() {
autopsyTests.testExpandDataSourcesTree();
}
public void testGenerateReportToolbar() {
autopsyTests.testGenerateReportToolbar();
}