More work

This commit is contained in:
eugene.livis 2023-06-06 14:14:37 -04:00
parent 8494453a09
commit e3da0cae14
4 changed files with 22 additions and 67 deletions

View File

@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2021 Basis Technology Corp.
* Copyright 2011-2023 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -659,14 +659,8 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP
int totalPages = source.getNumberPages();
updateTotalPagesDisplay(totalPages);
// ELTODO
//if (totalPages < 2) {
// enableNextPageControl(false);
// enablePrevPageControl(false);
//} else {
enableNextPageControl(source.hasNextPage());
enablePrevPageControl(source.hasPreviousPage());
//}
enableNextPageControl(source.hasNextPage());
enablePrevPageControl(source.hasPreviousPage());
}
/**

View File

@ -32,46 +32,33 @@ import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.datamodel.AbstractFile;
/** ELTODO
* A "source" for the extracted abstractFile viewer that displays "raw" (not
* highlighted) indexed text for a file or an artifact.
/**
* A "source" for abstractFile viewer that displays "raw" extracted text for a
* file. Only supports file types for which there are text extractors. Uses
the chunking algorithm used by KeywordSearchIngestModule. The readers used in
chunking don't have the ability to go backwards or to fast-forward to a specific
offset. Therefore there is no way to scroll pages back, or to determine how
* many total pages there are.
*/
class ExtractedText implements IndexedText {
private int numPages = 0;
private int currentPage = 0;
private final AbstractFile abstractFile;
private final long objectId;
private Chunker chunker = null;
private static final Logger logger = Logger.getLogger(ExtractedText.class.getName());
/**
* Construct a new ExtractedText object for the given content and object id.
* This constructor needs both a content object and an object id because the
* ExtractedText implementation attempts to provide useful messages in the
* text content viewer for (a) the case where a file has not been indexed
* because known files are being skipped and (b) the case where the file
* content has not yet been indexed.
* Construct a new ExtractedText object for the given abstract file.
*
* @param file Abstract file.
* @param objectId Either a file id or an artifact id.
* @param file Abstract file.
*/
ExtractedText(AbstractFile file, long objectId) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
ExtractedText(AbstractFile file) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
this.abstractFile = file;
this.objectId = objectId;
this.numPages = -1; // We don't know how many pages there are until we reach end of the document
initialize();
}
/**
* Return the ID that this object is associated with -- to help with caching
*
* @return
*/
public long getObjectId() {
return this.objectId;
}
@Override
public int getCurrentPage() {
return this.currentPage;
@ -177,9 +164,6 @@ class ExtractedText implements IndexedText {
return numPages;
}
/**
* Set the internal values, such as pages
*/
private void initialize() throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null);
@ -194,7 +178,6 @@ class ExtractedText implements IndexedText {
/**
* Extract text from abstractFile
*
* @param node a node that has extracted abstractFile
* @param currentPage currently used page
*
* @return the extracted text
@ -209,7 +192,7 @@ class ExtractedText implements IndexedText {
logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException());
throw chunker.getException();
}
indexedText = chunk.toString();
} else {
return Bundle.IndexedText_errorMessage_errorGettingText();
@ -229,9 +212,7 @@ class ExtractedText implements IndexedText {
try {
Map<String, String> metadata = extractor.getMetadata();
if (!metadata.isEmpty()) {
// Creating the metadata artifact here causes occasional problems
// when indexing the text, so we save the metadata map to
// use after this method is complete.
// save the metadata map to use after this method is complete.
extractedMetadata.putAll(metadata);
}
CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
@ -249,9 +230,8 @@ class ExtractedText implements IndexedText {
//Just send file text.
finalReader = fileText;
}
//divide into chunks and index
//divide into chunks
return finalReader;
}
}

View File

@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2019 Basis Technology Corp.
* Copyright 2011-2023 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -28,7 +28,6 @@ import java.util.List;
import java.util.logging.Level;
import org.apache.tika.mime.MimeTypes;
import org.openide.nodes.Node;
import org.openide.util.Exceptions;
import org.openide.util.Lookup;
import org.openide.util.NbBundle;
import org.openide.util.lookup.ServiceProvider;
@ -36,7 +35,6 @@ import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.corecomponentinterfaces.TextViewer;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.ingest.IngestModule;
import org.sleuthkit.autopsy.keywordsearch.AdHocSearchChildFactory.AdHocQueryResult;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
@ -178,7 +176,7 @@ public class ExtractedTextViewer implements TextViewer {
// see if it's a file type for which we can extract text
if (ableToExtractTextFromFile(file)) {
try {
rawContentText = new ExtractedText(file, file.getId());
rawContentText = new ExtractedText(file);
sources.add(rawContentText);
} catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
// do nothing
@ -451,7 +449,9 @@ public class ExtractedTextViewer implements TextViewer {
}
/**
* Check if we can extract text for this file type.
* Check if we can extract text for this file type using one of our text extractors.
* NOTE: the logic in this method should be similar to, and based on, the
* logic of how KeywordSearchIngestModule decides which files to index.
*
* @param file Abstract File
*
@ -465,12 +465,6 @@ public class ExtractedTextViewer implements TextViewer {
return false;
}
/**
* Extract unicode strings from unallocated and unused blocks and carved
* text files. The reason for performing string extraction on these is
* because they all may contain multiple encodings which can cause text
* to be missed by the more specialized text extractors.
*/
if ((fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
|| fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
|| (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED))) {
@ -478,22 +472,10 @@ public class ExtractedTextViewer implements TextViewer {
}
final long size = file.getSize();
//if not to index content, or a dir, or 0 content, index meta data only
if (file.isDir() || size == 0) {
return false;
}
// ELTODO do we need to skip text files here? probably not.
if (file.getNameExtension().equalsIgnoreCase("txt")) {
return false;
}
// ELTODO do we need to skip known files here? probably not.
if (KeywordSearchSettings.getSkipKnown() && file.getKnown().equals(TskData.FileKnown.KNOWN)) {
return false;
}
String mimeType = fileTypeDetector.getMIMEType(file).trim().toLowerCase();
if (KeywordSearchIngestModule.ARCHIVE_MIME_TYPES.contains(mimeType)) {
@ -501,7 +483,7 @@ public class ExtractedTextViewer implements TextViewer {
}
if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
// ELTODO return false;
return false;
}
return true;

View File

@ -38,7 +38,6 @@ import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.stream.Collectors;
import org.apache.tika.mime.MimeTypes;
import org.openide.util.Exceptions;
import org.openide.util.Lookup;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;