From 0068d3acfdefb46793d3d5a5cffe2ac73e161c44 Mon Sep 17 00:00:00 2001
From: "eugene.livis"
Date: Mon, 5 Jun 2023 17:23:56 -0400
Subject: [PATCH] First cut

---
 .../autopsy/keywordsearch/ExtractedText.java  | 270 ++++++++++++++++++
 .../keywordsearch/ExtractedTextViewer.java    | 122 +++++++-
 .../KeywordSearchIngestModule.java            |   4 +-
 .../{RawText.java => SolrIndexedText.java}    |  24 +-
 4 files changed, 393 insertions(+), 27 deletions(-)
 create mode 100755 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java
 rename KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/{RawText.java => SolrIndexedText.java} (92%)

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java
new file mode 100755
index 0000000000..c12b34e93b
--- /dev/null
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java
@@ -0,0 +1,270 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2023 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import com.google.common.io.CharSource;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.logging.Level;
+import org.openide.util.NbBundle;
+import org.sleuthkit.autopsy.coreutils.EscapeUtil;
+import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
+import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
+import org.sleuthkit.datamodel.AbstractFile;
+
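For orientation while reviewing, and not part of the patch itself: the new class plugs into the text content viewer as another IndexedText "source". The sketch below reconstructs that interface surface from the methods this class (and SolrIndexedText, further down) overrides; the real IndexedText interface in the codebase may differ in documentation, defaults, and ordering.

    // Sketch only: inferred from the @Override methods appearing in this patch.
    interface IndexedText {
        String getText();           // text of the current page/chunk
        String getAnchorPrefix();   // prefix for hit anchors; "" when not searchable
        boolean isSearchable();     // true when keyword-hit navigation is supported
        int getNumberHits();
        int getNumberPages();
        int getCurrentPage();
        boolean hasNextPage();
        int nextPage();
        boolean hasPreviousPage();
        int previousPage();
        boolean hasNextItem();
        int nextItem();
        boolean hasPreviousItem();
        int previousItem();
        int currentItem();
    }
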
+/**
+ * A "source" for the extracted content viewer that displays "raw" (not
+ * highlighted) text extracted from a file, rather than text retrieved from
+ * the Solr index.
+ */
+class ExtractedText implements IndexedText { // ELTODO
+
+    private int numPages = 0;
+    private int currentPage = 0;
+    private final AbstractFile abstractFile;
+    private final long objectId;
+    //keep the last extracted chunk cached
+    private String cachedString;
+    private int cachedChunk;
+    private Chunker chunker = null;
+    private static final Logger logger = Logger.getLogger(ExtractedText.class.getName());
+
+    /**
+     * Construct a new ExtractedText object for the given content and object id.
+     * This constructor needs both a content object and an object id because the
+     * ExtractedText implementation attempts to provide useful messages in the
+     * text content viewer for (a) the case where a file has not been indexed
+     * because known files are being skipped and (b) the case where the file
+     * content has not yet been indexed.
+     *
+     * @param file     Abstract file.
+     * @param objectId Either a file id or an artifact id.
+     */
+    ExtractedText(AbstractFile file, long objectId) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
+        this.abstractFile = file;
+        this.objectId = objectId;
+        this.currentPage = 0; // ELTODO
+        this.numPages = 1;
+        initialize();
+    }
+
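A minimal usage sketch, not part of the patch: how a caller might page through this source once it is constructed. The helper name and the two-chunk limit are invented for illustration; only methods defined in this class are used.

    // Hypothetical helper: pull the first two chunks of extracted text from a file.
    private static String readFirstTwoChunks(AbstractFile file)
            throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
        IndexedText source = new ExtractedText(file, file.getId());
        StringBuilder text = new StringBuilder(source.getText()); // extracts chunk 1 on demand
        if (source.hasNextPage()) {
            source.nextPage();
            text.append(source.getText());                        // extracts chunk 2
        }
        return text.toString();
    }
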
+    /**
+     * Return the ID that this object is associated with -- to help with caching
+     *
+     * @return
+     */
+    public long getObjectId() {
+        return this.objectId;
+    }
+
+    @Override
+    public int getCurrentPage() {
+        return this.currentPage;
+    }
+
+    @Override
+    public boolean hasNextPage() {
+        return true;
+    }
+
+    @Override
+    public boolean hasPreviousPage() {
+        return false;
+    }
+
+    @Override
+    public int nextPage() {
+        if (!hasNextPage()) {
+            throw new IllegalStateException(
+                    NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextPage.exception.msg"));
+        }
+        ++currentPage;
+        return currentPage;
+    }
+
+    @Override
+    public int previousPage() {
+        if (!hasPreviousPage()) {
+            throw new IllegalStateException(
+                    NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousPage.exception.msg"));
+        }
+        --currentPage;
+        return currentPage;
+    }
+
+    @Override
+    public boolean hasNextItem() {
+        throw new UnsupportedOperationException(
+                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasNextItem.exception.msg"));
+    }
+
+    @Override
+    public boolean hasPreviousItem() {
+        throw new UnsupportedOperationException(
+                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasPreviousItem.exception.msg"));
+    }
+
+    @Override
+    public int nextItem() {
+        throw new UnsupportedOperationException(
+                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextItem.exception.msg"));
+    }
+
+    @Override
+    public int previousItem() {
+        throw new UnsupportedOperationException(
+                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousItem.exception.msg"));
+    }
+
+    @Override
+    public int currentItem() {
+        throw new UnsupportedOperationException(
+                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.currentItem.exception.msg"));
+    }
+
+    @Override
+    public String getText() {
+        try {
+            return getContentText(currentPage + 1); // ELTODO
+        } catch (Exception ex) {
+            logger.log(Level.SEVERE, "Couldn't get extracted text", ex); //NON-NLS
+        }
+        return Bundle.IndexedText_errorMessage_errorGettingText();
+    }
+
+    @NbBundle.Messages({
+        "ExtractedText.FileText=File Text"})
+    @Override
+    public String toString() {
+        return Bundle.ExtractedText_FileText();
+    }
+
+    @Override
+    public boolean isSearchable() {
+        return false;
+    }
+
+    @Override
+    public String getAnchorPrefix() {
+        return "";
+    }
+
+    @Override
+    public int getNumberHits() {
+        return 0;
+    }
+
+    @Override
+    public int getNumberPages() {
+        return numPages;
+    }
+
+    /**
+     * Set the internal values, such as pages
+     */
+    private void initialize() throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
+        TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null);
+
+        Map<String, String> extractedMetadata = new HashMap<>();
+        Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata);
+
+        //Get a reader for the content of the given source
+        BufferedReader reader = new BufferedReader(sourceReader);
+        chunker = new Chunker(reader);
+    }
+
+    /**
+     * Extract text from abstractFile
+     *
+     * @param currentPage currently used page
+     *
+     * @return the extracted text
+     */
+    private String getContentText(int currentPage) throws TextExtractor.InitReaderException, IOException, Exception {
+
+        // ELTODO
+        //check if cached
+        if (cachedString != null) {
+            if (cachedChunk == currentPage) {
+                return cachedString;
+            }
+        }
+
+        String indexedText;
+        if (chunker.hasNext()) {
+            Chunker.Chunk chunk = chunker.next();
+            chunk.setChunkId(currentPage);
+
+            if (chunker.hasException()) {
+                logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException());
+                throw chunker.getException();
+            }
+
+            indexedText = chunk.toString();
+        } else {
+            return Bundle.IndexedText_errorMessage_errorGettingText();
+        }
+
+        cachedString = EscapeUtil.escapeHtml(indexedText).trim();
+        StringBuilder sb = new StringBuilder(cachedString.length() + 20);
+        sb.append("<pre>").append(cachedString).append("</pre>"); //NON-NLS
+        cachedString = sb.toString();
+        cachedChunk = currentPage;
+
+        return cachedString;
+    }
+
+    private Reader getTikaOrTextExtractor(TextExtractor extractor, AbstractFile aFile,
+            Map<String, String> extractedMetadata) throws TextExtractor.InitReaderException {
+
+        Reader fileText = extractor.getReader();
+        Reader finalReader;
+        try {
+            Map<String, String> metadata = extractor.getMetadata();
+            if (!metadata.isEmpty()) {
+                // Creating the metadata artifact here causes occasional problems
+                // when indexing the text, so we save the metadata map to
+                // use after this method is complete.
+                extractedMetadata.putAll(metadata);
+            }
+            CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
+            //Append the metadata to end of the file text
+            finalReader = CharSource.concat(new CharSource() {
+                //Wrap fileText reader for concatenation
+                @Override
+                public Reader openStream() throws IOException {
+                    return fileText;
+                }
+            }, formattedMetadata).openStream();
+        } catch (IOException ex) {
+            logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
+                    aFile.getName(), aFile.getId()), ex);
+            //Just send file text.
+            finalReader = fileText;
+        }
+        //divide into chunks and index
+        return finalReader;
+
+    }
+
+}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java
index aac5757fc0..a33105e4fc 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java
@@ -26,7 +26,9 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.logging.Level;
+import org.apache.tika.mime.MimeTypes;
 import org.openide.nodes.Node;
+import org.openide.util.Exceptions;
 import org.openide.util.Lookup;
 import org.openide.util.NbBundle;
 import org.openide.util.lookup.ServiceProvider;
@@ -34,7 +36,11 @@ import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
 import org.sleuthkit.autopsy.corecomponentinterfaces.TextViewer;
 import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.autopsy.ingest.IngestModule;
 import org.sleuthkit.autopsy.keywordsearch.AdHocSearchChildFactory.AdHocQueryResult;
+import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
+import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.Account;
 import org.sleuthkit.datamodel.BlackboardArtifact;
@@ -45,6 +51,7 @@ import static org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASS
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.Report;
 import org.sleuthkit.datamodel.TskCoreException;
+import org.sleuthkit.datamodel.TskData;
 
 /**
  * A text viewer that displays the indexed text associated with a file or an
@@ -61,14 +68,20 @@ public class ExtractedTextViewer implements TextViewer {
     private ExtractedContentPanel panel;
     private volatile Node currentNode = null;
     private IndexedText currentSource = null;
+    private FileTypeDetector fileTypeDetector = null;
 
     /**
      * Constructs a text viewer that displays the indexed text associated with a
     * file or an artifact, possibly marked up with HTML to highlight keyword
-     * hits.
+     * hits. If the text for the Content has not been fully indexed by Solr,
+     * the viewer attempts to extract the text using one of the text extractors.
      */
     public ExtractedTextViewer() {
-        // This constructor is intentionally empty.
+        try {
+            fileTypeDetector = new FileTypeDetector();
+        } catch (FileTypeDetector.FileTypeDetectorInitException ex) {
+            logger.log(Level.SEVERE, "Failed to initialize FileTypeDetector", ex); //NON-NLS
+        }
     }
 
     /**
@@ -155,8 +168,23 @@
          */
         IndexedText rawContentText = null;
         if (file != null) {
-            rawContentText = new RawText(file, file.getId());
-            sources.add(rawContentText);
+
+            // see if Solr has fully indexed this file
+            if (solrHasFullyIndexedContent(file.getId())) {
+                rawContentText = new SolrIndexedText(file, file.getId());
+                sources.add(rawContentText);
+            }
+
+            // Solr does not have fully indexed content.
+            // see if it's a file type for which we can extract text
+            if (ableToExtractTextFromFile(file)) {
+                try {
+                    rawContentText = new ExtractedText(file, file.getId());
+                    sources.add(rawContentText);
+                } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
+                    // do nothing
+                }
+            }
         }
 
         /*
@@ -164,7 +192,7 @@
          * associated with the node.
          */
         if (report != null) {
-            rawContentText = new RawText(report, report.getId());
+            rawContentText = new SolrIndexedText(report, report.getId());
             sources.add(rawContentText);
         }
 
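Taken together, the getSources() changes above implement a simple fallback: prefer text that Solr has already indexed, otherwise try on-the-fly extraction. The sketch below shows only that preference order as a standalone helper; unlike the patch, it returns a single source instead of appending each match to the sources list, and the helper name is invented.

    // Illustrative only: preference order when picking a text source for a file.
    private IndexedText pickFileSource(AbstractFile file) {
        if (solrHasFullyIndexedContent(file.getId())) {
            return new SolrIndexedText(file, file.getId());   // fully indexed: read it back from Solr
        }
        if (ableToExtractTextFromFile(file)) {
            try {
                return new ExtractedText(file, file.getId()); // not indexed: extract text on demand
            } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
                // fall through: no extractor available for this file type
            }
        }
        return null; // no text source available
    }
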
@@ -222,12 +250,11 @@
             if (attribute != null) {
                 long artifactId = attribute.getValueLong();
                 BlackboardArtifact associatedArtifact = Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboardArtifact(artifactId);
-                rawArtifactText = new RawText(associatedArtifact, associatedArtifact.getArtifactID());
-
+                rawArtifactText = new SolrIndexedText(associatedArtifact, associatedArtifact.getArtifactID());
             }
         } else {
-            rawArtifactText = new RawText(artifact, artifact.getArtifactID());
+            rawArtifactText = new SolrIndexedText(artifact, artifact.getArtifactID());
         }
     }
     return rawArtifactText;
@@ -340,8 +367,18 @@
          * data source instead of a file.
          */
         AbstractFile file = node.getLookup().lookup(AbstractFile.class);
-        if (file != null && solrHasContent(file.getId())) {
-            return true;
+        if (file != null) {
+
+            // see if Solr has fully indexed this file
+            if (solrHasFullyIndexedContent(file.getId())) {
+                return true;
+            }
+
+            // Solr does not have fully indexed content.
+            // see if it's a file type for which we can extract text
+            if (ableToExtractTextFromFile(file)) {
+                return true;
+            }
         }
 
         /*
@@ -351,7 +388,7 @@
          * indexed text for the artifact.
          */
         if (artifact != null) {
-            return solrHasContent(artifact.getArtifactID());
+            return solrHasFullyIndexedContent(artifact.getArtifactID());
         }
 
         /*
@@ -361,7 +398,7 @@
          */
         Report report = node.getLookup().lookup(Report.class);
         if (report != null) {
-            return solrHasContent(report.getId());
+            return solrHasFullyIndexedContent(report.getId());
         }
 
         /*
@@ -397,12 +434,14 @@
      *
      * @return true if Solr has content, else false
      */
-    private boolean solrHasContent(Long objectId) {
+    private boolean solrHasFullyIndexedContent(Long objectId) {
         final Server solrServer = KeywordSearch.getServer();
         if (solrServer.coreIsOpen() == false) {
             return false;
         }
 
+        // ELTODO get total number of chunks in the file, and verify that
+        // all of the chunks have been indexed.
         try {
            return solrServer.queryIsIndexed(objectId);
         } catch (NoOpenCoreException | KeywordSearchModuleException ex) {
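The ELTODO in this hunk asks for a stronger check than queryIsIndexed(). One possible shape for it, shown only as a sketch: it reuses Server.queryNumFileChunks(), which the existing RawText/SolrIndexedText code already calls for paging, and assumes a hypothetical expectedChunkCount supplied by the caller (for example, recorded when the file was indexed).

    // Hypothetical sketch of the ELTODO check: are all expected chunks in the index?
    private boolean solrHasAllChunks(long objectId, int expectedChunkCount) {
        final Server solrServer = KeywordSearch.getServer();
        if (solrServer.coreIsOpen() == false) {
            return false;
        }
        try {
            return solrServer.queryNumFileChunks(objectId) >= expectedChunkCount;
        } catch (NoOpenCoreException | KeywordSearchModuleException ex) {
            logger.log(Level.SEVERE, "Error querying Solr for chunk count, objectId = " + objectId, ex); //NON-NLS
            return false;
        }
    }
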
@@ -411,6 +450,63 @@
         }
     }
 
+    /**
+     * Check if we can extract text for this file type.
+     *
+     * @param file Abstract File
+     *
+     * @return true if text can be extracted from file, else false
+     */
+    private boolean ableToExtractTextFromFile(AbstractFile file) {
+
+        TskData.TSK_DB_FILES_TYPE_ENUM fileType = file.getType();
+
+        if (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
+            return false;
+        }
+
+        /**
+         * Unicode strings are extracted from unallocated and unused blocks and
+         * carved text files at ingest time, because they may contain multiple
+         * encodings which can cause text to be missed by the more specialized
+         * text extractors. Skip them here.
+         */
+        if ((fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
+                || fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
+                || (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED))) {
+            return false;
+        }
+
+        final long size = file.getSize();
+        //if not to index content, or a dir, or 0 content, index meta data only
+
+        if (file.isDir() || size == 0) {
+            return false;
+        }
+
+        // ELTODO do we need to skip text files here? probably not.
+        if (file.getNameExtension().equalsIgnoreCase("txt")) {
+            return false;
+        }
+
+        // ELTODO do we need to skip known files here? probably not.
+        if (KeywordSearchSettings.getSkipKnown() && file.getKnown().equals(TskData.FileKnown.KNOWN)) {
+            return false;
+        }
+
+        String mimeType = fileTypeDetector.getMIMEType(file).trim().toLowerCase();
+
+        if (KeywordSearchIngestModule.ARCHIVE_MIME_TYPES.contains(mimeType)) {
+            return false;
+        }
+
+        if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
+            return false;
+        }
+
+        return true;
+    }
+
     /**
      * Listener to select the next match found in the text
      */
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index e3f9582fdf..782f966616 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -96,7 +96,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * generally text extractors should ignore archives and let unpacking
      * modules take care of them
      */
-    private static final List<String> ARCHIVE_MIME_TYPES
+    static final List<String> ARCHIVE_MIME_TYPES
             = ImmutableList.of(
                     //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                     "application/x-7z-compressed", //NON-NLS
@@ -683,7 +683,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     @NbBundle.Messages({
         "KeywordSearchIngestModule.metadataTitle=METADATA"
     })
-    private CharSource getMetaDataCharSource(Map<String, String> metadata) {
+    static CharSource getMetaDataCharSource(Map<String, String> metadata) {
         return CharSource.wrap(new StringBuilder(
                 String.format("\n\n------------------------------%s------------------------------\n\n",
                         Bundle.KeywordSearchIngestModule_metadataTitle()))
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RawText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java
similarity index 92%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RawText.java
rename to KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java
index 789de3fd50..6745e0c5d7 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RawText.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java
@@ -1,7 +1,7 @@
 /*
  * Autopsy Forensic Browser
  *
- * Copyright 2011-2018 Basis Technology Corp.
+ * Copyright 2011-2023 Basis Technology Corp.
  * Contact: carrier <at> sleuthkit <dot> org
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -30,9 +30,9 @@ import org.sleuthkit.datamodel.TskData;
 
 /**
  * A "source" for the extracted content viewer that displays "raw" (not
- * highlighted) indexed text for a file or an artifact.
+ * highlighted) Solr indexed text for a file or an artifact.
  */
-class RawText implements IndexedText {
+class SolrIndexedText implements IndexedText {
 
     private int numPages = 0;
     private int currentPage = 0;
@@ -43,12 +43,12 @@ class RawText implements IndexedText {
     //keep last content cached
     private String cachedString;
     private int cachedChunk;
-    private static final Logger logger = Logger.getLogger(RawText.class.getName());
+    private static final Logger logger = Logger.getLogger(SolrIndexedText.class.getName());
 
     /**
-     * Construct a new RawText object for the given content and object id. This
+     * Construct a new SolrIndexedText object for the given content and object id. This
      * constructor needs both a content object and an object id because the
-     * RawText implementation attempts to provide useful messages in the text
+     * SolrIndexedText implementation attempts to provide useful messages in the text
      * content viewer for (a) the case where a file has not been indexed because
      * known files are being skipped and (b) the case where the file content has
      * not yet been indexed.
@@ -56,14 +56,14 @@ class RawText implements IndexedText {
      * @param content  Used to get access to file names and "known" status.
      * @param objectId Either a file id or an artifact id.
      */
-    RawText(Content content, long objectId) {
+    SolrIndexedText(Content content, long objectId) {
         this.content = content;
         this.blackboardArtifact = null;
         this.objectId = objectId;
         initialize();
     }
 
-    RawText(BlackboardArtifact bba, long objectId) {
+    SolrIndexedText(BlackboardArtifact bba, long objectId) {
         this.content = null;
         this.blackboardArtifact = bba;
         this.objectId = objectId;
@@ -159,14 +159,14 @@ class RawText implements IndexedText {
     }
 
     @NbBundle.Messages({
-        "RawText.FileText=File Text",
-        "RawText.ResultText=Result Text"})
+        "SolrIndexedText.FileText=File Text",
+        "SolrIndexedText.ResultText=Result Text"})
     @Override
     public String toString() {
         if (null != content) {
-            return Bundle.RawText_FileText();
+            return Bundle.SolrIndexedText_FileText();
         } else {
-            return Bundle.RawText_ResultText();
+            return Bundle.SolrIndexedText_ResultText();
         }
     }