First cut

Commit 0068d3acfd by eugene.livis, 2023-06-05 17:23:56 -04:00
Parent: 99e08e8dbe
4 changed files with 393 additions and 27 deletions

ExtractedText.java (new file)

@@ -0,0 +1,270 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2023 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import com.google.common.io.CharSource;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.datamodel.AbstractFile;

/**
 * A "source" for the extracted content viewer that displays "raw" (not
 * highlighted) text extracted on the fly from a file, as opposed to text
 * read back from the Solr index.
 */
class ExtractedText implements IndexedText { // ELTODO

    private int numPages = 0;
    private int currentPage = 0;
    private final AbstractFile abstractFile;
    private final long objectId;
    // keep the last extracted chunk cached
    private String cachedString;
    private int cachedChunk;
    private Chunker chunker = null;
    private static final Logger logger = Logger.getLogger(ExtractedText.class.getName());

    /**
     * Construct a new ExtractedText object for the given content and object
     * id. This constructor needs both a content object and an object id
     * because the ExtractedText implementation attempts to provide useful
     * messages in the text content viewer for (a) the case where a file has
     * not been indexed because known files are being skipped and (b) the case
     * where the file content has not yet been indexed.
     *
     * @param file     Abstract file.
     * @param objectId Either a file id or an artifact id.
     */
    ExtractedText(AbstractFile file, long objectId) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
        this.abstractFile = file;
        this.objectId = objectId;
        this.currentPage = 0; // ELTODO
        this.numPages = 1;
        initialize();
    }

    /**
     * Return the ID that this object is associated with -- to help with
     * caching.
     *
     * @return The object id (either a file id or an artifact id).
     */
    public long getObjectId() {
        return this.objectId;
    }

    @Override
    public int getCurrentPage() {
        return this.currentPage;
    }

    @Override
    public boolean hasNextPage() {
        return true;
    }

    @Override
    public boolean hasPreviousPage() {
        return false;
    }

    @Override
    public int nextPage() {
        if (!hasNextPage()) {
            throw new IllegalStateException(
                    NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextPage.exception.msg"));
        }
        ++currentPage;
        return currentPage;
    }

    @Override
    public int previousPage() {
        if (!hasPreviousPage()) {
            throw new IllegalStateException(
                    NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousPage.exception.msg"));
        }
        --currentPage;
        return currentPage;
    }

    @Override
    public boolean hasNextItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasNextItem.exception.msg"));
    }

    @Override
    public boolean hasPreviousItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasPreviousItem.exception.msg"));
    }

    @Override
    public int nextItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextItem.exception.msg"));
    }

    @Override
    public int previousItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousItem.exception.msg"));
    }

    @Override
    public int currentItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.currentItem.exception.msg"));
    }

    @Override
    public String getText() {
        try {
            return getContentText(currentPage + 1); // ELTODO
        } catch (Exception ex) {
            logger.log(Level.SEVERE, "Couldn't get extracted text", ex); //NON-NLS
        }
        return Bundle.IndexedText_errorMessage_errorGettingText();
    }

    @NbBundle.Messages({
        "ExtractedText.FileText=File Text"})
    @Override
    public String toString() {
        return Bundle.ExtractedText_FileText();
    }

    @Override
    public boolean isSearchable() {
        return false;
    }

    @Override
    public String getAnchorPrefix() {
        return "";
    }

    @Override
    public int getNumberHits() {
        return 0;
    }

    @Override
    public int getNumberPages() {
        return numPages;
    }

    /**
     * Set the internal values, such as the text extractor and the chunker
     * for the file.
     */
    private void initialize() throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
        TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null);
        Map<String, String> extractedMetadata = new HashMap<>();
        Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata);

        // get a reader for the content of the given source
        BufferedReader reader = new BufferedReader(sourceReader);
        this.chunker = new Chunker(reader);
    }

    /**
     * Extract a page (chunk) of text from the abstract file.
     *
     * @param currentPage the chunk number to extract
     *
     * @return the extracted text
     */
    private String getContentText(int currentPage) throws TextExtractor.InitReaderException, IOException, Exception {
        // ELTODO
        // check if the chunk is cached
        if (cachedString != null && cachedChunk == currentPage) {
            return cachedString;
        }

        String indexedText;
        if (chunker.hasNext()) {
            Chunker.Chunk chunk = chunker.next();
            chunk.setChunkId(currentPage);

            if (chunker.hasException()) {
                logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException());
                throw chunker.getException();
            }
            indexedText = chunk.toString();
        } else {
            return Bundle.IndexedText_errorMessage_errorGettingText();
        }

        cachedString = EscapeUtil.escapeHtml(indexedText).trim();
        StringBuilder sb = new StringBuilder(cachedString.length() + 20);
        sb.append("<pre>").append(cachedString).append("</pre>"); //NON-NLS
        cachedString = sb.toString();
        cachedChunk = currentPage;
        return cachedString;
    }

    private Reader getTikaOrTextExtractor(TextExtractor extractor, AbstractFile aFile,
            Map<String, String> extractedMetadata) throws TextExtractor.InitReaderException {

        Reader fileText = extractor.getReader();
        Reader finalReader;
        try {
            Map<String, String> metadata = extractor.getMetadata();
            if (!metadata.isEmpty()) {
                // Creating the metadata artifact here causes occasional problems
                // when indexing the text, so we save the metadata map to
                // use after this method is complete.
                extractedMetadata.putAll(metadata);
            }
            CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
            // append the metadata to the end of the file text
            finalReader = CharSource.concat(new CharSource() {
                // wrap the fileText reader for concatenation
                @Override
                public Reader openStream() throws IOException {
                    return fileText;
                }
            }, formattedMetadata).openStream();
        } catch (IOException ex) {
            logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
                    aFile.getName(), aFile.getId()), ex);
            // just send the file text
            finalReader = fileText;
        }
        // the caller divides this reader's content into chunks
        return finalReader;
    }
}
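
For orientation, here is a minimal sketch (not part of the commit) of how a caller such as the text viewer's panel might drive ExtractedText: construct it, render the first chunk, then advance to the next one. The helper name readFirstTwoChunks is hypothetical; only the ExtractedText API above is assumed.

    // Sketch only: paging through an ExtractedText source. getText() extracts
    // and caches chunk currentPage + 1, and nextPage() advances the counter.
    static String readFirstTwoChunks(AbstractFile file) {
        try {
            ExtractedText source = new ExtractedText(file, file.getId());
            StringBuilder text = new StringBuilder(source.getText()); // chunk 1
            if (source.hasNextPage()) {
                source.nextPage();                  // advance to chunk 2
                text.append(source.getText());
            }
            return text.toString();
        } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
            return Bundle.IndexedText_errorMessage_errorGettingText();
        }
    }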

ExtractedTextViewer.java

@@ -26,7 +26,9 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.logging.Level;
+import org.apache.tika.mime.MimeTypes;
 import org.openide.nodes.Node;
+import org.openide.util.Exceptions;
 import org.openide.util.Lookup;
 import org.openide.util.NbBundle;
 import org.openide.util.lookup.ServiceProvider;
@@ -34,7 +36,11 @@ import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
 import org.sleuthkit.autopsy.corecomponentinterfaces.TextViewer;
 import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.autopsy.ingest.IngestModule;
 import org.sleuthkit.autopsy.keywordsearch.AdHocSearchChildFactory.AdHocQueryResult;
+import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
+import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.Account;
 import org.sleuthkit.datamodel.BlackboardArtifact;
@@ -45,6 +51,7 @@ import static org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.Report;
 import org.sleuthkit.datamodel.TskCoreException;
+import org.sleuthkit.datamodel.TskData;

 /**
  * A text viewer that displays the indexed text associated with a file or an
@@ -61,14 +68,20 @@ public class ExtractedTextViewer implements TextViewer {
     private ExtractedContentPanel panel;
     private volatile Node currentNode = null;
     private IndexedText currentSource = null;
+    private FileTypeDetector fileTypeDetector = null;

     /**
      * Constructs a text viewer that displays the indexed text associated with a
      * file or an artifact, possibly marked up with HTML to highlight keyword
-     * hits.
+     * hits. If text for the Content has not been fully indexed by Solr, then
+     * attempt to extract text using one of the text extractors.
      */
     public ExtractedTextViewer() {
-        // This constructor is intentionally empty.
+        try {
+            fileTypeDetector = new FileTypeDetector();
+        } catch (FileTypeDetector.FileTypeDetectorInitException ex) {
+            logger.log(Level.SEVERE, "Failed to initialize FileTypeDetector", ex); //NON-NLS
+        }
     }
@@ -155,16 +168,31 @@ public class ExtractedTextViewer implements TextViewer {
          */
         IndexedText rawContentText = null;
         if (file != null) {
-            rawContentText = new RawText(file, file.getId());
-            sources.add(rawContentText);
+            // see if Solr has fully indexed this file
+            if (solrHasFullyIndexedContent(file.getId())) {
+                rawContentText = new SolrIndexedText(file, file.getId());
+                sources.add(rawContentText);
+            }
+
+            // Solr does not have fully indexed content;
+            // see if it's a file type for which we can extract text
+            if (ableToExtractTextFromFile(file)) {
+                try {
+                    rawContentText = new ExtractedText(file, file.getId());
+                    sources.add(rawContentText);
+                } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
+                    // do nothing
+                }
+            }
         }

         /*
          * Add the "raw" (not highlighted) text, if any, for any report
          * associated with the node.
          */
         if (report != null) {
-            rawContentText = new RawText(report, report.getId());
+            rawContentText = new SolrIndexedText(report, report.getId());
             sources.add(rawContentText);
         }
@@ -222,12 +250,11 @@ public class ExtractedTextViewer implements TextViewer {
                 if (attribute != null) {
                     long artifactId = attribute.getValueLong();
                     BlackboardArtifact associatedArtifact = Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboardArtifact(artifactId);
-                    rawArtifactText = new RawText(associatedArtifact, associatedArtifact.getArtifactID());
+                    rawArtifactText = new SolrIndexedText(associatedArtifact, associatedArtifact.getArtifactID());
                 }
             } else {
-                rawArtifactText = new RawText(artifact, artifact.getArtifactID());
+                rawArtifactText = new SolrIndexedText(artifact, artifact.getArtifactID());
             }
         }
         return rawArtifactText;
@@ -340,10 +367,20 @@ public class ExtractedTextViewer implements TextViewer {
          * data source instead of a file.
          */
         AbstractFile file = node.getLookup().lookup(AbstractFile.class);
-        if (file != null && solrHasContent(file.getId())) {
-            return true;
+        if (file != null) {
+            // see if Solr has fully indexed this file
+            if (solrHasFullyIndexedContent(file.getId())) {
+                return true;
+            }
+
+            // Solr does not have fully indexed content;
+            // see if it's a file type for which we can extract text
+            if (ableToExtractTextFromFile(file)) {
+                return true;
+            }
         }

         /*
          * If the lookup of the node contains an artifact that is neither a
          * keyword hit artifact nor a credit card account artifact, and the
@@ -351,7 +388,7 @@ public class ExtractedTextViewer implements TextViewer {
          * indexed text for the artifact.
          */
         if (artifact != null) {
-            return solrHasContent(artifact.getArtifactID());
+            return solrHasFullyIndexedContent(artifact.getArtifactID());
         }

         /*
@@ -361,7 +398,7 @@ public class ExtractedTextViewer implements TextViewer {
          */
         Report report = node.getLookup().lookup(Report.class);
         if (report != null) {
-            return solrHasContent(report.getId());
+            return solrHasFullyIndexedContent(report.getId());
         }

         /*
@@ -397,12 +434,14 @@ public class ExtractedTextViewer implements TextViewer {
      *
      * @return true if Solr has content, else false
      */
-    private boolean solrHasContent(Long objectId) {
+    private boolean solrHasFullyIndexedContent(Long objectId) {
         final Server solrServer = KeywordSearch.getServer();
         if (solrServer.coreIsOpen() == false) {
             return false;
         }

+        // ELTODO get the total number of chunks in the file, and verify that
+        // all of the chunks have been indexed.
         try {
             return solrServer.queryIsIndexed(objectId);
         } catch (NoOpenCoreException | KeywordSearchModuleException ex) {
@@ -411,6 +450,63 @@ public class ExtractedTextViewer implements TextViewer {
         }
     }

+    /**
+     * Check whether we can extract text for this file type.
+     *
+     * @param file Abstract file.
+     *
+     * @return true if text can be extracted from the file, else false
+     */
+    private boolean ableToExtractTextFromFile(AbstractFile file) {
+        TskData.TSK_DB_FILES_TYPE_ENUM fileType = file.getType();
+        if (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
+            return false;
+        }
+        /*
+         * Unallocated and unused blocks, as well as carved files, may contain
+         * multiple encodings, so during ingest they get Unicode string
+         * extraction rather than one of the more specialized text extractors.
+         * Skip them here.
+         */
+        if ((fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
+                || fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
+                || (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED))) {
+            return false;
+        }
+        // skip directories and zero-length files
+        final long size = file.getSize();
+        if (file.isDir() || size == 0) {
+            return false;
+        }
+        // ELTODO do we need to skip text files here? probably not.
+        if (file.getNameExtension().equalsIgnoreCase("txt")) {
+            return false;
+        }
+        // ELTODO do we need to skip known files here? probably not.
+        if (KeywordSearchSettings.getSkipKnown() && file.getKnown().equals(TskData.FileKnown.KNOWN)) {
+            return false;
+        }
+        String mimeType = fileTypeDetector.getMIMEType(file).trim().toLowerCase();
+        if (KeywordSearchIngestModule.ARCHIVE_MIME_TYPES.contains(mimeType)) {
+            return false;
+        }
+        if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
+            return false;
+        }
+        return true;
+    }

     /**
      * Listener to select the next match found in the text
      */
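
The ELTODO inside solrHasFullyIndexedContent() flags the known gap: queryIsIndexed() only proves that some document with the given id exists in the index, not that every chunk made it in. One possible shape for the stricter check is sketched below. It assumes a per-file chunk count query on Server (Autopsy's Server class does expose queryNumFileChunks()); the expectedChunkCount parameter is hypothetical, since deriving it is exactly what the ELTODO leaves open.

    // Sketch only: a stricter "fully indexed" test for the ELTODO above.
    // expectedChunkCount is a hypothetical input -- computing it (e.g. from
    // the file size and the chunker's chunk size) is the unresolved part.
    private boolean solrHasAllChunks(long objectId, int expectedChunkCount) {
        final Server solrServer = KeywordSearch.getServer();
        if (solrServer.coreIsOpen() == false) {
            return false;
        }
        try {
            int indexedChunks = solrServer.queryNumFileChunks(objectId);
            return indexedChunks > 0 && indexedChunks >= expectedChunkCount;
        } catch (NoOpenCoreException | KeywordSearchModuleException ex) {
            logger.log(Level.SEVERE, "Error querying chunk count for object " + objectId, ex); //NON-NLS
            return false;
        }
    }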

KeywordSearchIngestModule.java

@@ -96,7 +96,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * generally text extractors should ignore archives and let unpacking
      * modules take care of them
      */
-    private static final List<String> ARCHIVE_MIME_TYPES
+    static final List<String> ARCHIVE_MIME_TYPES
             = ImmutableList.of(
                     //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                     "application/x-7z-compressed", //NON-NLS
@@ -683,7 +683,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     @NbBundle.Messages({
         "KeywordSearchIngestModule.metadataTitle=METADATA"
     })
-    private CharSource getMetaDataCharSource(Map<String, String> metadata) {
+    static CharSource getMetaDataCharSource(Map<String, String> metadata) {
         return CharSource.wrap(new StringBuilder(
                 String.format("\n\n------------------------------%s------------------------------\n\n",
                         Bundle.KeywordSearchIngestModule_metadataTitle()))
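
Widening these two members from private to package-private is what lets the new code above reuse them: ableToExtractTextFromFile() consults KeywordSearchIngestModule.ARCHIVE_MIME_TYPES, and ExtractedText.getTikaOrTextExtractor() calls getMetaDataCharSource() to append a metadata section to the extracted text. A quick illustration, with invented metadata values:

    // Sketch only: calling the now package-visible helper from another class
    // in org.sleuthkit.autopsy.keywordsearch. The map contents are invented.
    Map<String, String> metadata = new HashMap<>();
    metadata.put("Author", "jdoe"); // hypothetical value
    CharSource header = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
    // the returned source begins with the banner shown above:
    //
    // ------------------------------METADATA------------------------------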

RawText.java → SolrIndexedText.java

@@ -1,7 +1,7 @@
 /*
  * Autopsy Forensic Browser
  *
- * Copyright 2011-2018 Basis Technology Corp.
+ * Copyright 2011-2023 Basis Technology Corp.
  * Contact: carrier <at> sleuthkit <dot> org
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -30,9 +30,9 @@ import org.sleuthkit.datamodel.TskData;
 /**
  * A "source" for the extracted content viewer that displays "raw" (not
- * highlighted) indexed text for a file or an artifact.
+ * highlighted) Solr indexed text for a file or an artifact.
  */
-class RawText implements IndexedText {
+class SolrIndexedText implements IndexedText {

     private int numPages = 0;
     private int currentPage = 0;
@@ -43,12 +43,12 @@ class RawText implements IndexedText {
     //keep last content cached
     private String cachedString;
     private int cachedChunk;
-    private static final Logger logger = Logger.getLogger(RawText.class.getName());
+    private static final Logger logger = Logger.getLogger(SolrIndexedText.class.getName());

     /**
-     * Construct a new RawText object for the given content and object id. This
+     * Construct a new SolrIndexedText object for the given content and object id. This
      * constructor needs both a content object and an object id because the
-     * RawText implementation attempts to provide useful messages in the text
+     * SolrIndexedText implementation attempts to provide useful messages in the text
      * content viewer for (a) the case where a file has not been indexed because
      * known files are being skipped and (b) the case where the file content has
      * not yet been indexed.
@@ -56,14 +56,14 @@ class RawText implements IndexedText {
      * @param content  Used to get access to file names and "known" status.
      * @param objectId Either a file id or an artifact id.
      */
-    RawText(Content content, long objectId) {
+    SolrIndexedText(Content content, long objectId) {
         this.content = content;
         this.blackboardArtifact = null;
         this.objectId = objectId;
         initialize();
     }

-    RawText(BlackboardArtifact bba, long objectId) {
+    SolrIndexedText(BlackboardArtifact bba, long objectId) {
         this.content = null;
         this.blackboardArtifact = bba;
         this.objectId = objectId;
@@ -159,14 +159,14 @@ class RawText implements IndexedText {
     }

     @NbBundle.Messages({
-        "RawText.FileText=File Text",
-        "RawText.ResultText=Result Text"})
+        "SolrIndexedText.FileText=File Text",
+        "SolrIndexedText.ResultText=Result Text"})
     @Override
     public String toString() {
         if (null != content) {
-            return Bundle.RawText_FileText();
+            return Bundle.SolrIndexedText_FileText();
         } else {
-            return Bundle.RawText_ResultText();
+            return Bundle.SolrIndexedText_ResultText();
         }
     }