Improvements, fixes, and caching

This commit is contained in:
eugene.livis 2023-06-06 16:02:27 -04:00
parent e3da0cae14
commit cd83205382
4 changed files with 71 additions and 30 deletions

View File

@ -99,7 +99,7 @@ public class TextContentViewerPanel extends javax.swing.JPanel implements DataCo
/**
* Determine the isPreffered score for the content viewer which is
* displaying this panel. Score is depenedent on the score of the supported
* displaying this panel. Score is dependent on the score of the supported
* TextViewers which exist.
*
* @param node

View File

@ -56,7 +56,15 @@ class ExtractedText implements IndexedText {
/**
 * Constructs an ExtractedText source for the given file by extracting its
 * text (via Tika or the plain-text extractor) and wrapping the result in a
 * Chunker for page-by-page display.
 *
 * @param file the file whose text will be extracted
 *
 * @throws TextExtractorFactory.NoTextExtractorFound if no extractor supports the file
 * @throws TextExtractor.InitReaderException         if the extractor cannot open a reader
 */
ExtractedText(AbstractFile file) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
    this.abstractFile = file;
    this.numPages = -1; // We don't know how many pages there are until we reach end of the document
    // NOTE(review): the original span contained both a leftover call to the
    // removed initialize() helper and the inlined code below, which would
    // have performed the extraction setup twice; only the inlined version
    // is kept here.
    TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null);
    Map<String, String> extractedMetadata = new HashMap<>();
    Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata);
    //Get a reader for the content of the given source
    BufferedReader reader = new BufferedReader(sourceReader);
    this.chunker = new Chunker(reader);
}
@Override
@ -164,17 +172,6 @@ class ExtractedText implements IndexedText {
return numPages;
}
/**
 * Sets up text extraction for {@code abstractFile} and creates the
 * {@code chunker} used to page through the extracted text.
 *
 * NOTE(review): this helper's body is duplicated inline in the constructor
 * above; once the constructor performs the setup itself, this method is
 * redundant and can be removed.
 *
 * @throws TextExtractorFactory.NoTextExtractorFound if no extractor supports the file
 * @throws TextExtractor.InitReaderException         if the extractor cannot open a reader
 */
private void initialize() throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null);
Map<String, String> extractedMetadata = new HashMap<>();
Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata);
//Get a reader for the content of the given source
BufferedReader reader = new BufferedReader(sourceReader);
chunker = new Chunker(reader);
}
/**
* Extract text from abstractFile
*

View File

@ -68,6 +68,9 @@ public class ExtractedTextViewer implements TextViewer {
// The text source currently shown by this viewer, if any.
private IndexedText currentSource = null;
// Lazily-initialized detector used to determine file MIME types.
private FileTypeDetector fileTypeDetector = null;
// Object ID whose "fully indexed" decision is cached below; -1 = no cache.
private long cachedObjId = -1;
// Cached result of the last solrHasFullyIndexedContent() decision.
// NOTE(review): name contains a typo ("chached"); renaming requires
// updating every usage, which lives outside this span.
private boolean chachedIsFullyIndexed = false;
/**
* Constructs a text viewer that displays the indexed text associated with a
* file or an artifact, possibly marked up with HTML to highlight keyword
@ -426,25 +429,39 @@ public class ExtractedTextViewer implements TextViewer {
}
/**
* Check if Solr has extracted content for a given node
* Check if Solr has indexed ALL of the content for a given node. Note that
* in some situations Solr only indexes parts of a file. This happens when
* an in-line KWS finds a KW hit in the file - only the chunks with the KW
* hit (+/- 1 chunk) get indexed by Solr. That is not enough for the
* purposes of this text viewer as we need to display all of the text in the
* file.
*
* @param objectId
*
* @return true if Solr has content, else false
*/
private boolean solrHasFullyIndexedContent(Long objectId) {
final Server solrServer = KeywordSearch.getServer();
if (solrServer.coreIsOpen() == false) {
return false;
// check if we have cached this decision
if (objectId == cachedObjId) {
return chachedIsFullyIndexed;
}
// ELTODO get total number of chunks in the file, and verify that
// all of the chunks have been indexed.
cachedObjId = objectId;
final Server solrServer = KeywordSearch.getServer();
if (solrServer.coreIsOpen() == false) {
chachedIsFullyIndexed = false;
return chachedIsFullyIndexed;
}
// verify that all of the chunks in the file have been indexed.
try {
return solrServer.queryIsIndexed(objectId);
chachedIsFullyIndexed = solrServer.queryIsFullyIndexed(objectId);
return chachedIsFullyIndexed;
} catch (NoOpenCoreException | KeywordSearchModuleException ex) {
logger.log(Level.SEVERE, "Error querying Solr server", ex); //NON-NLS
return false;
chachedIsFullyIndexed = false;
return chachedIsFullyIndexed;
}
}

View File

@ -1635,23 +1635,29 @@ public class Server {
}
/**
* Return true if the file is indexed (either as a whole as a chunk)
* Return true if the file is fully indexed (no chunks are missing)
*
* @param contentID
*
* @return true if it is indexed
* @return true if it is fully indexed
*
* @throws KeywordSearchModuleException
* @throws NoOpenCoreException
*/
public boolean queryIsIndexed(long contentID) throws KeywordSearchModuleException, NoOpenCoreException {
public boolean queryIsFullyIndexed(long contentID) throws KeywordSearchModuleException, NoOpenCoreException {
currentCoreLock.readLock().lock();
try {
if (null == currentCollection) {
throw new NoOpenCoreException();
}
try {
return currentCollection.queryIsIndexed(contentID);
int totalNumChunks = currentCollection.queryTotalNumFileChunks(contentID);
if (totalNumChunks == 0) {
return false;
}
int numIndexedChunks = currentCollection.queryNumIndexedChunks(contentID);
return numIndexedChunks == totalNumChunks;
} catch (Exception ex) {
// intentional "catch all" as Solr is known to throw all kinds of Runtime exceptions
throw new KeywordSearchModuleException(NbBundle.getMessage(this.getClass(), "Server.queryIsIdxd.exception.msg"), ex);
@ -1680,7 +1686,7 @@ public class Server {
throw new NoOpenCoreException();
}
try {
return currentCollection.queryNumFileChunks(fileID);
return currentCollection.queryTotalNumFileChunks(fileID);
} catch (Exception ex) {
// intentional "catch all" as Solr is known to throw all kinds of Runtime exceptions
throw new KeywordSearchModuleException(NbBundle.getMessage(this.getClass(), "Server.queryNumFileChunks.exception.msg"), ex);
@ -2484,7 +2490,7 @@ public class Server {
}
/**
* Return true if the file is indexed (either as a whole as a chunk)
* Return true if the file is indexed (either as a whole or as a chunk)
*
* @param contentID
*
@ -2502,17 +2508,20 @@ public class Server {
}
/**
* Execute query that gets number of indexed file chunks for a file
* Execute query that gets total number of file chunks for a file. NOTE:
* this does not imply that all of the chunks have been indexed. This
value simply reflects the total number of chunks that the file had
* (as determined during chunking).
*
* @param contentID file id of the original file broken into chunks and
* indexed
*
* @return int representing number of indexed file chunks, 0 if there is
* no chunks
@return int representing number of file chunks, 0 if there are no
* chunks
*
* @throws SolrServerException
*/
private int queryNumFileChunks(long contentID) throws SolrServerException, IOException {
private int queryTotalNumFileChunks(long contentID) throws SolrServerException, IOException {
final SolrQuery q = new SolrQuery();
q.setQuery("*:*");
String filterQuery = Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(Long.toString(contentID));
@ -2537,6 +2546,24 @@ public class Server {
logger.log(Level.SEVERE, "Error getting content from Solr. Solr document id " + contentID + ", query: " + filterQuery); //NON-NLS
return 0;
}
/**
 * Execute query that gets number of indexed chunks for a specific Solr
 * document, without actually returning the content.
 *
 * @param contentID file id of the original file broken into chunks and
 *                  indexed
 *
 * @return int representing number of indexed chunks
 *
 * @throws SolrServerException
 * @throws IOException
 */
int queryNumIndexedChunks(long contentID) throws SolrServerException, IOException {
    // Match only chunk documents ("<id><separator>*"), not the parent document.
    SolrQuery q = new SolrQuery(Server.Schema.ID + ":" + contentID + Server.CHUNK_ID_SEPARATOR + "*");
    // Only the hit count is needed, not the documents themselves.
    q.setRows(0);
    return (int) query(q).getResults().getNumFound();
}
}
class ServerAction extends AbstractAction {