Merge pull request #6720 from wschaeferB/6737-EmbeddedFileExtractorCancellation

6737 embedded file extractor cancellation
2025-07-19 11:07:43 +00:00 · 2021-02-18 07:38:13 -05:00 · 2021-02-18 07:38:13 -05:00 · 8055f1c656
commit 8055f1c656
parent fd188daa5a ed0789bb4f
2 changed files with 112 additions and 18 deletions
--- a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/DocumentEmbeddedContentExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/DocumentEmbeddedContentExtractor.java
@ -1,7 +1,7 @@
 /*
 * Autopsy Forensic Browser
 *
- * Copyright 2015 Basis Technology Corp.
+ * Copyright 2015-2021 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@ -32,6 +32,7 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.logging.Level;
 import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.io.IOUtils;
@ -146,6 +147,9 @@ class DocumentEmbeddedContentExtractor {
    boolean isContentExtractionSupported(AbstractFile abstractFile) {
        String abstractFileMimeType = fileTypeDetector.getMIMEType(abstractFile);
        for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
            if (checkForIngestCancellation(abstractFile)) {
                break;
            }
            if (s.toString().equals(abstractFileMimeType)) {
                abstractFileExtractionFormat = s;
                return true;
@ -154,6 +158,25 @@ class DocumentEmbeddedContentExtractor {
        return false;
    }
    /**
     * Private helper method to standardize the cancellation check that is
     * performed when running ingest. Will return false if the
     * DocumentEmbeddedContentExtractor is being used without an
     * IngestJobContext.
     *
     * @param file The file being extracted, this is only used for logging
     *             purposes.
     *
     * @return True if ingest has been cancelled, false otherwise.
     */
    private boolean checkForIngestCancellation(AbstractFile file) {
        // The cancellation state is only consulted when both the file task
        // executor and the ingest job context are available; otherwise this
        // check always reports "not cancelled".
        if (fileTaskExecutor != null && context != null && context.fileIngestIsCancelled()) {
            // Keep the file name and the object ID visibly separated so the
            // two values do not run together in the log output.
            LOGGER.log(Level.INFO, "Ingest was cancelled. Results extracted from the following document file may be incomplete. Name: {0}, Object ID: {1}", new Object[]{file.getName(), file.getId()});
            return true;
        }
        return false;
    }
    /**
     * This method selects the appropriate process of extracting embedded
     * content from files using either Tika or POI classes. Once the content has
@ -189,7 +212,9 @@ class DocumentEmbeddedContentExtractor {
            LOGGER.log(Level.SEVERE, String.format("Error checking if %s (objID = %d) has already has been processed, skipping", abstractFile.getName(), abstractFile.getId()), e); //NON-NLS
            return;
        }
-
+        if (checkForIngestCancellation(abstractFile)) {
            return;
        }
        // Call the appropriate extraction method based on mime type
        switch (abstractFileExtractionFormat) {
            case DOCX:
@ -219,6 +244,9 @@ class DocumentEmbeddedContentExtractor {
        // the common task of adding abstractFile to derivedfiles is performed.
        listOfExtractedImageAbstractFiles = new ArrayList<>();
        for (ExtractedFile extractedImage : listOfExtractedImages) {
            if (checkForIngestCancellation(abstractFile)) {
                return;
            }
            try {
                listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
                        extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
@ -258,11 +286,12 @@ class DocumentEmbeddedContentExtractor {
        officeParserConfig.setUseSAXPptxExtractor(true);
        officeParserConfig.setUseSAXDocxExtractor(true);
        parseContext.set(OfficeParserConfig.class, officeParserConfig);
        EmbeddedDocumentExtractor extractor = new EmbeddedContentExtractor(parseContext);
        parseContext.set(EmbeddedDocumentExtractor.class, extractor);
        ReadContentInputStream stream = new ReadContentInputStream(abstractFile);
-
+        if (checkForIngestCancellation(abstractFile)) {
            return null; //null will cause the calling method to return.
        }
        try {
            parser.parse(stream, contentHandler, metadata, parseContext);
        } catch (IOException | SAXException | TikaException ex) {
@ -322,6 +351,9 @@ class DocumentEmbeddedContentExtractor {
        byte[] data = null;
        int pictureNumber = 0; //added to ensure uniqueness in cases where suggestFullFileName returns duplicates
        for (Picture picture : listOfAllPictures) {
            if (checkForIngestCancellation(af)) {
                return null; //null will cause the calling method to return.
            }
            String fileName = UNKNOWN_IMAGE_NAME_PREFIX + pictureNumber + "." + picture.suggestFileExtension();
            try {
                data = picture.getContent();
@ -385,7 +417,9 @@ class DocumentEmbeddedContentExtractor {
        List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
        byte[] data = null;
        for (HSLFPictureData pictureData : listOfAllPictures) {
-
+            if (checkForIngestCancellation(af)) {
                return null; //null will cause the calling method to return.
            }
            // Get image extension, generate image name, write image to the module
            // output folder, add it to the listOfExtractedImageAbstractFiles
            PictureType type = pictureData.getType();
@ -475,6 +509,9 @@ class DocumentEmbeddedContentExtractor {
        List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
        byte[] data = null;
        for (org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
            if (checkForIngestCancellation(af)) {
                return null; //null will cause the calling method to return.
            }
            String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + "." + pictureData.suggestFileExtension(); //NON-NLS
            try {
                data = pictureData.getData();
@ -510,15 +547,17 @@ class DocumentEmbeddedContentExtractor {
            //Convert output to hook into the existing logic for creating derived files
            List<ExtractedFile> extractedFiles = new ArrayList<>();
-            extractedAttachments.entrySet().forEach((pathEntry) -> {
+            for (Entry<String, PDFAttachmentExtractor.NewResourceData> pathEntry : extractedAttachments.entrySet()) {
                if (checkForIngestCancellation(abstractFile)) {
                    return null; //null will cause the calling method to return.
                }
                String fileName = pathEntry.getKey();
                Path writeLocation = pathEntry.getValue().getPath();
                int fileSize = pathEntry.getValue().getLength();
                extractedFiles.add(new ExtractedFile(fileName,
                        getFileRelativePath(writeLocation.getFileName().toString()),
                        fileSize));
-            });
+            }
            return extractedFiles;
        } catch (IOException | SAXException | TikaException | InvalidPathException ex) {
            LOGGER.log(Level.WARNING, "Error attempting to extract attachments from PDFs for file Name: " + abstractFile.getName() + " ID: " + abstractFile.getId(), ex); //NON-NLS         
--- a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/SevenZipExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/SevenZipExtractor.java
@ -1,7 +1,7 @@
 /*
 * Autopsy Forensic Browser
 *
- * Copyright 2015-2020 Basis Technology Corp.
+ * Copyright 2015-2021 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@ -143,7 +143,7 @@ class SevenZipExtractor {
    }
    /**
-     * Contructs an embedded file extractor that uses 7Zip via Java bindings to
+     * Constructs an embedded file extractor that uses 7Zip via Java bindings to
     * extract the contents of an archive file to a directory named for the
     * archive file.
     *
@ -184,6 +184,9 @@ class SevenZipExtractor {
    boolean isSevenZipExtractionSupported(AbstractFile file) {
        String fileMimeType = fileTypeDetector.getMIMEType(file);
        for (SupportedArchiveExtractionFormats mimeType : SupportedArchiveExtractionFormats.values()) {
            if (checkForIngestCancellation(file)) {
                break;
            }
            if (mimeType.toString().equals(fileMimeType)) {
                return true;
            }
@ -191,6 +194,24 @@ class SevenZipExtractor {
        return false;
    }
    /**
     * Private helper method to standardize the cancellation check that is
     * performed when running ingest. Will return false if the SevenZipExtractor
     * is being used without an IngestJobContext.
     *
     * @param file The file being extracted, this is only used for logging
     *             purposes.
     *
     * @return True if ingest has been cancelled, false otherwise.
     */
    private boolean checkForIngestCancellation(AbstractFile file) {
        // The cancellation state is only consulted when both the file task
        // executor and the ingest job context are available; otherwise this
        // check always reports "not cancelled".
        if (fileTaskExecutor != null && context != null && context.fileIngestIsCancelled()) {
            // Keep the file name and the object ID visibly separated so the
            // two values do not run together in the log output.
            logger.log(Level.INFO, "Ingest was cancelled. Results extracted from the following archive file may be incomplete. Name: {0}, Object ID: {1}", new Object[]{file.getName(), file.getId()});
            return true;
        }
        return false;
    }
    /**
     * Check if the item inside archive is a potential zipbomb
     *
@ -567,7 +588,9 @@ class SevenZipExtractor {
            unpackSuccessful = false;
            return unpackSuccessful;
        }
-
+        if (checkForIngestCancellation(archiveFile)) {
            return false;
        }
        try {
            List<AbstractFile> existingFiles = getAlreadyExtractedFiles(archiveFile, archiveFilePath);
            for (AbstractFile file : existingFiles) {
@ -578,7 +601,9 @@ class SevenZipExtractor {
            unpackSuccessful = false;
            return unpackSuccessful;
        }
-
+        if (checkForIngestCancellation(archiveFile)) {
            return false;
        }
        parentAr = depthMap.get(archiveFile.getId());
        if (parentAr == null) {
            parentAr = new Archive(0, archiveFile.getId(), archiveFile);
@ -598,6 +623,9 @@ class SevenZipExtractor {
                return unpackSuccessful;
            }
        }
        if (checkForIngestCancellation(archiveFile)) {
            return false;
        }
        IInArchive inArchive = null;
        try {
            stream = new SevenZipContentReadStream(new ReadContentInputStream(archiveFile));
@ -605,6 +633,9 @@ class SevenZipExtractor {
            // it will be opened incorrectly when using 7zip's built-in auto-detect functionality.
            // All other archive formats are still opened using 7zip built-in auto-detect functionality.
            ArchiveFormat options = get7ZipOptions(archiveFile);
            if (checkForIngestCancellation(archiveFile)) {
                return false;
            }
            if (password == null) {
                inArchive = SevenZip.openInArchive(options, stream);
            } else {
@ -613,7 +644,9 @@ class SevenZipExtractor {
            numItems = inArchive.getNumberOfItems();
            progress.start(numItems);
            progressStarted = true;
-
+            if (checkForIngestCancellation(archiveFile)) {
                return false;
            }
            //setup the archive local root folder
            final String uniqueArchiveFileName = FileUtil.escapeFileName(EmbeddedFileExtractorIngestModule.getUniqueName(archiveFile));
            if (!makeExtractedFilesDirectory(uniqueArchiveFileName)) {
@ -634,6 +667,9 @@ class SevenZipExtractor {
            Map<Integer, InArchiveItemDetails> archiveDetailsMap = new HashMap<>();
            for (int inArchiveItemIndex = 0; inArchiveItemIndex < numItems; inArchiveItemIndex++) {
                if (checkForIngestCancellation(archiveFile)) {
                    return false;
                }
                progress.progress(String.format("%s: Analyzing archive metadata and creating local files (%d of %d)", currentArchiveName, inArchiveItemIndex + 1, numItems), 0);
                if (isZipBombArchiveItemCheck(archiveFile, inArchive, inArchiveItemIndex, depthMap, escapedArchiveFilePath)) {
                    unpackSuccessful = false;
@ -643,7 +679,9 @@ class SevenZipExtractor {
                String pathInArchive = getPathInArchive(inArchive, inArchiveItemIndex, archiveFile);
                byte[] pathBytesInArchive = getPathBytesInArchive(inArchive, inArchiveItemIndex, archiveFile);
                UnpackedTree.UnpackedNode unpackedNode = unpackedTree.addNode(pathInArchive, pathBytesInArchive);
-
+                if (checkForIngestCancellation(archiveFile)) {
                    return false;
                }
                final boolean isEncrypted = (Boolean) inArchive.getProperty(inArchiveItemIndex, PropID.ENCRYPTED);
                if (isEncrypted && password == null) {
@ -681,6 +719,9 @@ class SevenZipExtractor {
                        freeDiskSpace = newDiskSpace;
                    }
                }
                if (checkForIngestCancellation(archiveFile)) {
                    return false;
                }
                final String uniqueExtractedName = FileUtil.escapeFileName(uniqueArchiveFileName + File.separator + (inArchiveItemIndex / 1000) + File.separator + inArchiveItemIndex + "_" + new File(pathInArchive).getName());
                final String localAbsPath = moduleDirAbsolute + File.separator + uniqueExtractedName;
                final String localRelPath = moduleDirRelative + File.separator + uniqueExtractedName;
@ -699,7 +740,9 @@ class SevenZipExtractor {
                    localFileExists = false;
                    logger.log(Level.SEVERE, String.format("Error fiding or creating %s", localFile.getAbsolutePath()), ex); //NON-NLS
                }
-
+                if (checkForIngestCancellation(archiveFile)) {
                    return false;
                }
                // skip the rest of this loop if we couldn't create the file
                //continue will skip details from being added to the map
                if (!localFileExists) {
@ -716,7 +759,9 @@ class SevenZipExtractor {
            }
            int[] extractionIndices = getExtractableFilesFromDetailsMap(archiveDetailsMap);
-
+            if (checkForIngestCancellation(archiveFile)) {
                return false;
            }
            StandardIArchiveExtractCallback archiveCallBack
                    = new StandardIArchiveExtractCallback(
                            inArchive, archiveFile, progress,
@ -726,7 +771,9 @@ class SevenZipExtractor {
            //for efficiency. Hence, the HashMap and linear processing of 
            //inArchiveItemIndex. False indicates non-test mode
            inArchive.extract(extractionIndices, false, archiveCallBack);
-
+            if (checkForIngestCancellation(archiveFile)) {
                return false;
            }
            unpackSuccessful &= archiveCallBack.wasSuccessful();
            archiveDetailsMap = null;
@ -735,9 +782,15 @@ class SevenZipExtractor {
            // intermediate nodes since the order is not guaranteed
            try {
                unpackedTree.updateOrAddFileToCaseRec(statusMap, archiveFilePath);
                if (checkForIngestCancellation(archiveFile)) {
                    return false;
                }
                unpackedFiles = unpackedTree.getAllFileObjects();
                //check if children are archives, update archive depth tracking
                for (int i = 0; i < unpackedFiles.size(); i++) {
                    if (checkForIngestCancellation(archiveFile)) {
                        return false;
                    }
                    progress.progress(String.format("%s: Searching for nested archives (%d of %d)", currentArchiveName, i + 1, unpackedFiles.size()));
                    AbstractFile unpackedFile = unpackedFiles.get(i);
                    if (unpackedFile == null) {
@ -792,7 +845,9 @@ class SevenZipExtractor {
                progress.finish();
            }
        }
-
+        if (checkForIngestCancellation(archiveFile)) {
            return false;
        }
        //create artifact and send user message
        if (hasEncrypted) {
            String encryptionType = fullEncryption ? ENCRYPTION_FULL : ENCRYPTION_FILE_LEVEL;