Merge pull request #4758 from dannysmyda/4975-PDF-Attachment-Extractor

4975 pdf attachment extractor
2025-07-13 08:26:15 +00:00 · 2019-05-17 09:23:46 -04:00 · 2019-05-17 09:23:46 -04:00 · 0cff372c7d
commit 0cff372c7d
parent b096a58c61 dcce33aaeb
4 changed files with 253 additions and 16 deletions
--- a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/DocumentEmbeddedContentExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/DocumentEmbeddedContentExtractor.java
@ -22,8 +22,10 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@ -33,13 +35,11 @@ import org.apache.commons.io.IOUtils;
 import org.apache.poi.hwpf.usermodel.Picture;
 import org.apache.poi.hslf.usermodel.HSLFPictureData;
 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
 import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.model.PicturesTable;
 import org.apache.poi.sl.usermodel.PictureData.PictureType;
 import org.apache.poi.ss.usermodel.Workbook;
 import org.apache.poi.util.RecordFormatException;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
@ -72,13 +72,13 @@ import org.xml.sax.SAXException;
 /**
 * Extracts embedded content (e.g. images, audio, video) from Microsoft Office
- * documents (both original and OOXML forms).
+ * documents (both original and OOXML forms) and PDF documents.
 */
-class MSOfficeEmbeddedContentExtractor {
+class DocumentEmbeddedContentExtractor {
    private final FileManager fileManager;
    private final IngestServices services;
-    private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
+    private static final Logger LOGGER = Logger.getLogger(DocumentEmbeddedContentExtractor.class.getName());
    private final IngestJobContext context;
    private String parentFileName;
    private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
@ -101,7 +101,8 @@ class MSOfficeEmbeddedContentExtractor {
        PPT("application/vnd.ms-powerpoint"), //NON-NLS
        PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
        XLS("application/vnd.ms-excel"), //NON-NLS
-        XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); //NON-NLS
+        XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), //NON-NLS
        PDF("application/pdf"); //NON-NLS
        private final String mimeType;
@ -116,7 +117,7 @@ class MSOfficeEmbeddedContentExtractor {
    }
    private SupportedExtractionFormats abstractFileExtractionFormat;
-    MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
+    DocumentEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
        this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
        this.services = IngestServices.getInstance();
@ -190,6 +191,9 @@ class MSOfficeEmbeddedContentExtractor {
            case XLS:
                listOfExtractedImages = extractImagesFromXls(abstractFile);
                break;
            case PDF:
                listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
                break;
            default:
                break;
        }
@ -471,6 +475,38 @@ class MSOfficeEmbeddedContentExtractor {
    }
    /**
     * Extracts embedded attachments from PDF files.
     *
     * The attachments are written to disk by PDFAttachmentExtractor; each one
     * is then wrapped in an ExtractedFile so that the existing derived-file
     * logic can pick it up. Any extraction failure is logged and results in an
     * empty list.
     *
     * @param abstractFile Input PDF file
     * @return List of extracted files to be made into derived file instances;
     *         empty if extraction failed.
     */
    private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
        PDFAttachmentExtractor pdfExtractor = new PDFAttachmentExtractor(parser);
        //try-with-resources so the content stream is closed even when parsing fails.
        try (InputStream pdfStream = new ReadContentInputStream(abstractFile)) {
            Path outputDirectory = Paths.get(getOutputFolderPath(parentFileName));

            //Get map of attachment name -> location on disk.
            Map<String, Path> extractedAttachments = pdfExtractor.extract(
                    pdfStream, abstractFile.getId(), outputDirectory);

            //Convert output to hook into the existing logic for creating derived files
            List<ExtractedFile> extractedFiles = new ArrayList<>(extractedAttachments.size());
            for (Map.Entry<String, Path> pathEntry : extractedAttachments.entrySet()) {
                String fileName = pathEntry.getKey();
                Path writeLocation = pathEntry.getValue();
                //NOTE(review): the attachment is written through an
                //EncodedFileOutputStream, so the on-disk length used here may
                //differ from the decoded attachment size - confirm.
                extractedFiles.add(new ExtractedFile(fileName,
                        getFileRelativePath(writeLocation.getFileName().toString()),
                        writeLocation.toFile().length()));
            }
            return extractedFiles;
        } catch (IOException | SAXException | TikaException ex) {
            LOGGER.log(Level.WARNING, "Error attempting to extract attachments from PDFs", ex); //NON-NLS
        }
        return Collections.emptyList();
    }
    /**
     * Writes image to the module output location.
     *
--- a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/EmbeddedFileExtractorIngestModule.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/EmbeddedFileExtractorIngestModule.java
@ -50,7 +50,7 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
    //Outer concurrent hashmap with keys of JobID, inner concurrentHashmap with keys of objectID
    private static final ConcurrentHashMap<Long, ConcurrentHashMap<Long, Archive>> mapOfDepthTrees = new ConcurrentHashMap<>();
    private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
-    private MSOfficeEmbeddedContentExtractor officeExtractor;
+    private DocumentEmbeddedContentExtractor documentExtractor;
    private SevenZipExtractor archiveExtractor;
    private FileTypeDetector fileTypeDetector;
    private long jobId;
@ -115,10 +115,10 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
        }
        /*
         * Construct an embedded content extractor for processing Microsoft
-         * Office documents.
+         * Office documents and PDF documents.
         */
        try {
-            this.officeExtractor = new MSOfficeEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
+            this.documentExtractor = new DocumentEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
        } catch (NoCurrentCaseException ex) {
            throw new IngestModuleException(Bundle.EmbeddedFileExtractorIngestModule_UnableToGetMSOfficeExtractor_errMsg(), ex);
        }
@ -155,8 +155,8 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
         */
        if (archiveExtractor.isSevenZipExtractionSupported(abstractFile)) {
            archiveExtractor.unpack(abstractFile, mapOfDepthTrees.get(jobId));
-        } else if (officeExtractor.isContentExtractionSupported(abstractFile)) {
+        } else if (documentExtractor.isContentExtractionSupported(abstractFile)) {
-            officeExtractor.extractEmbeddedContent(abstractFile);
+            documentExtractor.extractEmbeddedContent(abstractFile);
        }
        return ProcessResult.OK;
    }
--- a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/PDFAttachmentExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/PDFAttachmentExtractor.java
@ -0,0 +1,183 @@
 /*
 * Autopsy Forensic Browser
 *
 * Copyright 2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.logging.Level;
 import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.EncodedFileOutputStream;
 import org.sleuthkit.datamodel.TskData;
 /**
 * Facility for extracting and storing attachments from PDF documents.
 * Implementation specifics, however, are generic enough to be used on any
 * document with embedded resources. The current name reflects the only known
 * use case for this class.
 */
 final class PDFAttachmentExtractor {
    private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
    private final AutoDetectParser parser;

    /**
     * Creates an extractor backed by a new AutoDetectParser.
     */
    public PDFAttachmentExtractor() {
        parser = new AutoDetectParser();
    }

    /**
     * Creates an extractor that parses input with the supplied parser.
     *
     * @param parser Parser used to process the input document
     */
    public PDFAttachmentExtractor(AutoDetectParser parser) {
        this.parser = parser;
    }

    /**
     * Extracts PDF attachments from a given input and writes them to the supplied
     * output directory.
     *
     * The input stream is not closed by this method; that remains the
     * caller's responsibility.
     *
     * @param input Input PDF to extract attachments from
     * @param parentID ID for unique extraction names
     * @param outputDir Directory to write attachments
     * @return Map containing file name -> location on disk
     * @throws IOException If the output directory does not exist, or if raised
     *                     while parsing the input
     * @throws SAXException If raised while parsing the input
     * @throws TikaException If raised while parsing the input
     */
    public Map<String, Path> extract(InputStream input, long parentID, Path outputDir) throws IOException, SAXException, TikaException {
        ExtractionPreconditions.checkArgument(Files.exists(outputDir),
                String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS

        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);

        //Keep track of the attachment files as they are being extracted and written to disk.
        NewResourceWatcher watcher = new NewResourceWatcher();
        parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));

        //Parse input with default params, except for our ParseContext
        parser.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext);

        return watcher.getSnapshot();
    }

    /**
     * Internal Tika class that is invoked upon encountering an embedded
     * resource. Every resource is written, XOR1 encoded, to the output
     * directory and reported to the watcher.
     */
    static class EmbeddedAttachmentHandler implements EmbeddedDocumentExtractor {
        private final Path outputDirectory;
        private final NewResourceWatcher watcher;
        private final long parentID;
        //Running index of attachments seen so far; part of each output file name.
        private int attachmentCount;

        /**
         * @param outputDirectory Directory the attachments are written to
         * @param parentID ID prefixed to every output file name
         * @param watcher Receives the name and path of each written attachment
         */
        public EmbeddedAttachmentHandler(Path outputDirectory, long parentID, NewResourceWatcher watcher) {
            this.outputDirectory = outputDirectory;
            this.watcher = watcher;
            this.parentID = parentID;
            attachmentCount = 0;
        }

        @Override
        public boolean shouldParseEmbedded(Metadata mtdt) {
            //Grab every available attachment
            return true;
        }

        /**
         * Writes a single embedded resource to the output directory and
         * registers it with the watcher. Write failures are logged rather
         * than propagated, so one bad attachment does not abort the parse.
         *
         * @param in Content of the embedded resource
         * @param ch Unused by this implementation
         * @param mtdt Metadata of the embedded resource; supplies its name
         * @param bln Unused by this implementation
         */
        @Override
        public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, boolean bln) throws SAXException, IOException {
            //Resource naming scheme is used internally in autopsy, therefore we can guarantee uniqueness.
            String uniqueExtractedName = parentID + "_attch_" + attachmentCount++; //NON-NLS

            String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
            //getExtension yields null only when the resource has no name at all.
            String ext = FilenameUtils.getExtension(name);

            //Append the extension if we can.
            if (ext == null) {
                name = uniqueExtractedName;
            } else if (!ext.isEmpty()) {
                uniqueExtractedName += "." + ext;
            }

            //Two attachments may share a resource name. The watcher is keyed by
            //name, so fall back to the unique name instead of silently
            //replacing (and thereby losing track of) the earlier attachment.
            if (watcher.isTracked(name)) {
                name = uniqueExtractedName;
            }

            Path outputFile = outputDirectory.resolve(uniqueExtractedName);

            try (EncodedFileOutputStream outputStream = new EncodedFileOutputStream(
                    new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)) {
                IOUtils.copy(in, outputStream);
                watcher.notify(name, outputFile);
            } catch (IOException ex) {
                logger.log(Level.WARNING, String.format("Could not extract attachment %s into directory %s", //NON-NLS
                        uniqueExtractedName, outputDirectory), ex);
            }
        }
    }

    /**
     * Convenient wrapper for keeping track of new resource paths and the display
     * name for each of these resources.
     *
     * It is necessary to maintain a snapshot of only our changes when the
     * output directory is shared among other processes/threads.
     */
    static class NewResourceWatcher {

        private final Map<String, Path> newResourcePaths;

        public NewResourceWatcher() {
            newResourcePaths = new HashMap<>();
        }

        /**
         * Records a newly written resource under its display name. An
         * existing entry with the same name is replaced.
         *
         * @param name Display name of the resource
         * @param newResource Location of the resource on disk
         */
        public void notify(String name, Path newResource) {
            newResourcePaths.put(name, newResource);
        }

        /**
         * Reports whether a resource has already been recorded under the
         * given display name.
         *
         * @param name Display name to look up
         * @return True if a resource with this name has been recorded
         */
        public boolean isTracked(String name) {
            return newResourcePaths.containsKey(name);
        }

        /**
         * Returns a copy of the resources recorded so far, so that callers
         * cannot modify the watcher's internal state and later notifications
         * do not alter the returned map.
         *
         * @return Map containing display name -> location on disk
         */
        public Map<String, Path> getSnapshot() {
            return new HashMap<>(newResourcePaths);
        }
    }

    /**
     * Static convenience methods that ensure the PDF extractor is being invoked
     * correctly.
     */
    static class ExtractionPreconditions {

        /**
         * Throws if the given condition does not hold.
         *
         * @param expression Condition that must be true
         * @param msg Message of the exception thrown when it is not
         * @throws IOException If expression is false
         */
        public static void checkArgument(boolean expression, String msg) throws IOException {
            if (!expression) {
                throw new IOException(msg);
            }
        }

        //Utility class, not meant to be instantiated.
        private ExtractionPreconditions() {
        }
    }
 }
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@ -46,6 +46,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParsingReader;
@ -125,6 +126,16 @@ final class TikaTextExtractor implements TextExtractor {
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS
    //Tika should ignore types with embedded files that can be handled by the unpacking modules
    private static final List<String> EMBEDDED_FILE_MIME_TYPES
            = ImmutableList.of("application/msword", //NON-NLS
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document", //NON-NLS
                    "application/vnd.ms-powerpoint", //NON-NLS
                    "application/vnd.openxmlformats-officedocument.presentationml.presentation", //NON-NLS
                    "application/vnd.ms-excel", //NON-NLS
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", //NON-NLS
                    "application/pdf"); //NON-NLS
    private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
    private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
@ -184,7 +195,14 @@ final class TikaTextExtractor implements TextExtractor {
        InputStream stream = null;
        ParseContext parseContext = new ParseContext();
-        parseContext.set(Parser.class, parser);
+
        //Disable appending embedded file text to output for EFE supported types
        //JIRA-4975
        if(content instanceof AbstractFile && EMBEDDED_FILE_MIME_TYPES.contains(((AbstractFile)content).getMIMEType())) {
            parseContext.set(Parser.class, new EmptyParser());
        } else {
            parseContext.set(Parser.class, parser);
        }
        if (ocrEnabled() && content instanceof AbstractFile) {
            AbstractFile file = ((AbstractFile) content);
@ -516,11 +534,11 @@ final class TikaTextExtractor implements TextExtractor {
        if (context != null) {
            ImageConfig configInstance = context.lookup(ImageConfig.class);
            if (configInstance != null) {
-                if(Objects.nonNull(configInstance.getOCREnabled())) {
+                if (Objects.nonNull(configInstance.getOCREnabled())) {
                    this.tesseractOCREnabled = configInstance.getOCREnabled();
                }
-                if(Objects.nonNull(configInstance.getOCRLanguages())) {
+                if (Objects.nonNull(configInstance.getOCRLanguages())) {
                    this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
                }
            }