Merge pull request #4758 from dannysmyda/4975-PDF-Attachment-Extractor

4975 pdf attachment extractor
2025-07-12 16:06:15 +00:00 · 2019-05-17 09:23:46 -04:00 · 2019-05-17 09:23:46 -04:00 · 0cff372c7d
commit 0cff372c7d
parent b096a58c61 dcce33aaeb
4 changed files with 253 additions and 16 deletions
--- a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/DocumentEmbeddedContentExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/DocumentEmbeddedContentExtractor.java
@ -22,8 +22,10 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@ -33,13 +35,11 @@ import org.apache.commons.io.IOUtils;
 import org.apache.poi.hwpf.usermodel.Picture;
 import org.apache.poi.hslf.usermodel.HSLFPictureData;
 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
-import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.model.PicturesTable;
 import org.apache.poi.sl.usermodel.PictureData.PictureType;
 import org.apache.poi.ss.usermodel.Workbook;
-import org.apache.poi.util.RecordFormatException;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
@ -72,13 +72,13 @@ import org.xml.sax.SAXException;

 /**
 * Extracts embedded content (e.g. images, audio, video) from Microsoft Office
- * documents (both original and OOXML forms).
+ * documents (both original and OOXML forms) and PDF documents.
 */
-class MSOfficeEmbeddedContentExtractor {
+class DocumentEmbeddedContentExtractor {

    private final FileManager fileManager;
    private final IngestServices services;
-    private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
+    private static final Logger LOGGER = Logger.getLogger(DocumentEmbeddedContentExtractor.class.getName());
    private final IngestJobContext context;
    private String parentFileName;
    private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
@ -101,7 +101,8 @@ class MSOfficeEmbeddedContentExtractor {
        PPT("application/vnd.ms-powerpoint"), //NON-NLS
        PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
        XLS("application/vnd.ms-excel"), //NON-NLS
-        XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); //NON-NLS
+        XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), //NON-NLS
+        PDF("application/pdf"); //NON-NLS

        private final String mimeType;

@ -116,7 +117,7 @@ class MSOfficeEmbeddedContentExtractor {
    }
    private SupportedExtractionFormats abstractFileExtractionFormat;

-    MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
+    DocumentEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {

        this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
        this.services = IngestServices.getInstance();
@ -190,6 +191,9 @@ class MSOfficeEmbeddedContentExtractor {
            case XLS:
                listOfExtractedImages = extractImagesFromXls(abstractFile);
                break;
+            case PDF:
+                listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
+                break;
            default:
                break;
        }
@ -470,6 +474,38 @@ class MSOfficeEmbeddedContentExtractor {
        return listOfExtractedImages;

    }
+    
+    /**
+     * Extracts embedded attachments from PDF files.
+     *
+     * The attachments are written beneath the output folder resolved for the
+     * current parent file name, and each one is converted into an
+     * ExtractedFile so the existing derived-file logic can pick it up.
+     *
+     * @param abstractFile Input PDF file
+     *
+     * @return List of extracted files to be made into derived file instances;
+     *         an empty list if the extraction failed.
+     */
+    private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
+        PDFAttachmentExtractor pdfExtractor = new PDFAttachmentExtractor(parser);
+        //Own the input stream here so it is always closed, whether the
+        //extraction finishes or fails part way through.
+        try (InputStream pdfStream = new ReadContentInputStream(abstractFile)) {
+            Path outputDirectory = Paths.get(getOutputFolderPath(parentFileName));
+            //Get map of attachment name -> location on disk.
+            Map<String, Path> extractedAttachments = pdfExtractor.extract(
+                    pdfStream, abstractFile.getId(), outputDirectory);
+
+            //Convert output to hook into the existing logic for creating derived files
+            List<ExtractedFile> extractedFiles = new ArrayList<>(extractedAttachments.size());
+            for (Map.Entry<String, Path> attachment : extractedAttachments.entrySet()) {
+                Path writeLocation = attachment.getValue();
+                //NOTE(review): the size recorded here is the length of the file as
+                //written to disk by the extractor (which writes it encoded), so it
+                //may differ from the attachment's logical size - TODO confirm.
+                extractedFiles.add(new ExtractedFile(attachment.getKey(),
+                        getFileRelativePath(writeLocation.getFileName().toString()),
+                        writeLocation.toFile().length()));
+            }
+
+            return extractedFiles;
+        } catch (IOException | SAXException | TikaException ex) {
+            LOGGER.log(Level.WARNING, "Error attempting to extract attachments from PDFs", ex); //NON-NLS
+        }
+        return Collections.emptyList();
+    }

    /**
     * Writes image to the module output location.
--- a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/EmbeddedFileExtractorIngestModule.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/EmbeddedFileExtractorIngestModule.java
@ -50,7 +50,7 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
    //Outer concurrent hashmap with keys of JobID, inner concurrentHashmap with keys of objectID
    private static final ConcurrentHashMap<Long, ConcurrentHashMap<Long, Archive>> mapOfDepthTrees = new ConcurrentHashMap<>();
    private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
-    private MSOfficeEmbeddedContentExtractor officeExtractor;
+    private DocumentEmbeddedContentExtractor documentExtractor;
    private SevenZipExtractor archiveExtractor;
    private FileTypeDetector fileTypeDetector;
    private long jobId;
@ -115,10 +115,10 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
        }
        /*
         * Construct an embedded content extractor for processing Microsoft
-         * Office documents.
+         * Office documents and PDF documents.
         */
        try {
-            this.officeExtractor = new MSOfficeEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
+            this.documentExtractor = new DocumentEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
        } catch (NoCurrentCaseException ex) {
            throw new IngestModuleException(Bundle.EmbeddedFileExtractorIngestModule_UnableToGetMSOfficeExtractor_errMsg(), ex);
        }
@ -155,8 +155,8 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
         */
        if (archiveExtractor.isSevenZipExtractionSupported(abstractFile)) {
            archiveExtractor.unpack(abstractFile, mapOfDepthTrees.get(jobId));
-        } else if (officeExtractor.isContentExtractionSupported(abstractFile)) {
-            officeExtractor.extractEmbeddedContent(abstractFile);
+        } else if (documentExtractor.isContentExtractionSupported(abstractFile)) {
+            documentExtractor.extractEmbeddedContent(abstractFile);
        }
        return ProcessResult.OK;
    }
--- a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/PDFAttachmentExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/PDFAttachmentExtractor.java
@ -0,0 +1,183 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2019 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.modules.embeddedfileextractor;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.logging.Level;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.datamodel.EncodedFileOutputStream;
+import org.sleuthkit.datamodel.TskData;
+
+/**
+ * Facility for extracting and storing attachments from PDF documents.
+ * Implementation specifics, however, are generic enough to be used on any
+ * document with embedded resources. The current name reflects the only known
+ * use case for this class.
+ */
+final class PDFAttachmentExtractor {
+
+    private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
+    private final AutoDetectParser parser;
+
+    /**
+     * Creates an extractor that parses with a new AutoDetectParser.
+     */
+    public PDFAttachmentExtractor() {
+        parser = new AutoDetectParser();
+    }
+
+    /**
+     * Creates an extractor that parses with the supplied parser.
+     *
+     * @param parser Parser used to process the input document
+     */
+    public PDFAttachmentExtractor(AutoDetectParser parser) {
+        this.parser = parser;
+    }
+
+    /**
+     * Extracts PDF attachments from a given input and writes them to the supplied
+     * output directory.
+     *
+     * @param input Input PDF to extract attachments from
+     * @param parentID ID for unique extraction names
+     * @param outputDir Directory to write attachments
+     * @return Map containing file name -> location on disk
+     * @throws IOException If the output directory does not exist or the input
+     *                     cannot be read
+     * @throws SAXException
+     * @throws TikaException
+     */
+    public Map<String, Path> extract(InputStream input, long parentID, Path outputDir) throws IOException, SAXException, TikaException {
+        ExtractionPreconditions.checkArgument(Files.exists(outputDir),
+                String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(Parser.class, parser);
+
+        //Keep track of the attachment files as they are being extracted and written to disk.
+        NewResourceWatcher watcher = new NewResourceWatcher();
+        parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));
+
+        //Parse input with default params, except for our ParseContext
+        parser.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext);
+
+        return watcher.getSnapshot();
+    }
+
+    /**
+     * Internal Tika class that is invoked upon encountering an embedded
+     * resource. Every resource is written, XOR1 encoded, into the output
+     * directory and then reported to the watcher.
+     */
+    static class EmbeddedAttachmentHandler implements EmbeddedDocumentExtractor {
+
+        private final Path outputDirectory;
+        private final NewResourceWatcher watcher;
+        private final long parentID;
+        private int attachmentCount;
+
+        public EmbeddedAttachmentHandler(Path outputDirectory, long parentID, NewResourceWatcher watcher) {
+            this.outputDirectory = outputDirectory;
+            this.watcher = watcher;
+            this.parentID = parentID;
+            attachmentCount = 0;
+        }
+
+        @Override
+        public boolean shouldParseEmbedded(Metadata mtdt) {
+            //Grab every available attachment
+            return true;
+        }
+
+        @Override
+        public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, boolean bln) throws SAXException, IOException {
+            //Resource naming scheme is used internally in autopsy, therefore we can guarantee uniqueness.
+            String uniqueExtractedName = parentID + "_attch_" + attachmentCount++; //NON-NLS
+
+            String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
+            String ext = FilenameUtils.getExtension(name);
+
+            //Append the extension if we can. The extension comes from the document
+            //itself, so anything that is not a letter, digit, '_' or '-' is replaced
+            //to keep the generated name usable as a file name.
+            if (ext != null && !ext.isEmpty()) {
+                uniqueExtractedName += "." + ext.replaceAll("[^\\p{L}\\p{N}_-]", "_");
+            }
+
+            //Unnamed resources are reported under their generated name.
+            if (name == null || name.isEmpty()) {
+                name = uniqueExtractedName;
+            }
+
+            Path outputFile = outputDirectory.resolve(uniqueExtractedName);
+
+            //Both streams are resources so the file handle is released even if the
+            //encoding stream cannot be constructed.
+            try (FileOutputStream fileStream = new FileOutputStream(outputFile.toFile());
+                    EncodedFileOutputStream outputStream = new EncodedFileOutputStream(
+                            fileStream, TskData.EncodingType.XOR1)) {
+                IOUtils.copy(in, outputStream);
+            } catch (IOException ex) {
+                logger.log(Level.WARNING, String.format("Could not extract attachment %s into directory %s", //NON-NLS
+                        uniqueExtractedName, outputDirectory), ex);
+                return;
+            }
+
+            //Only report the attachment once it has been written and the stream closed.
+            watcher.notify(name, outputFile);
+        }
+    }
+
+    /**
+     * Convenient wrapper for keeping track of new resource paths and the display
+     * name for each of these resources.
+     *
+     * It is necessary to maintain a snapshot of only our changes when the
+     * output directory is shared among other processes/threads.
+     */
+    static class NewResourceWatcher {
+
+        private final Map<String, Path> newResourcePaths;
+
+        public NewResourceWatcher() {
+            newResourcePaths = new HashMap<>();
+        }
+
+        /**
+         * Records a newly written resource.
+         *
+         * @param name Display name of the resource
+         * @param newResource Location of the resource on disk
+         */
+        public void notify(String name, Path newResource) {
+            //Several resources may share a display name. Key the later ones by
+            //their on-disk file name instead of replacing the earlier entry,
+            //which would leave that file on disk but unreported.
+            String key = newResourcePaths.containsKey(name)
+                    ? newResource.getFileName().toString() : name;
+            newResourcePaths.put(key, newResource);
+        }
+
+        /**
+         * @return A copy of the resources recorded so far; later notifications
+         *         do not change the returned map.
+         */
+        public Map<String, Path> getSnapshot() {
+            return new HashMap<>(newResourcePaths);
+        }
+    }
+
+    /**
+     * Static convenience methods that ensure the PDF extractor is being invoked
+     * correctly.
+     */
+    static class ExtractionPreconditions {
+
+        /**
+         * Fails with the given message when the expression does not hold.
+         *
+         * @param expression Condition that must be true
+         * @param msg Message of the exception thrown when it is not
+         * @throws IOException If expression is false
+         */
+        public static void checkArgument(boolean expression, String msg) throws IOException {
+            if (!expression) {
+                throw new IOException(msg);
+            }
+        }
+
+        private ExtractionPreconditions(){
+        }
+    }
+}
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@ -46,6 +46,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParsingReader;
@ -125,6 +126,16 @@ final class TikaTextExtractor implements TextExtractor {
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS

+    //MIME types of documents whose embedded files can be handled by the
+    //unpacking modules. Tika should ignore the embedded files of these types:
+    //when an AbstractFile has one of these MIME types, an EmptyParser is put in
+    //the ParseContext instead of the regular parser, so the text of embedded
+    //files is not appended to the extracted output (JIRA-4975).
+    private static final List<String> EMBEDDED_FILE_MIME_TYPES
+            = ImmutableList.of("application/msword", //NON-NLS
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document", //NON-NLS
+                    "application/vnd.ms-powerpoint", //NON-NLS
+                    "application/vnd.openxmlformats-officedocument.presentationml.presentation", //NON-NLS
+                    "application/vnd.ms-excel", //NON-NLS
+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", //NON-NLS
+                    "application/pdf"); //NON-NLS
+
    private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
    private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());

@ -184,7 +195,14 @@ final class TikaTextExtractor implements TextExtractor {
        InputStream stream = null;

        ParseContext parseContext = new ParseContext();
-        parseContext.set(Parser.class, parser);
+
+        //Disable appending embedded file text to output for EFE supported types
+        //JIRA-4975
+        if(content instanceof AbstractFile && EMBEDDED_FILE_MIME_TYPES.contains(((AbstractFile)content).getMIMEType())) {
+            parseContext.set(Parser.class, new EmptyParser());
+        } else {
+            parseContext.set(Parser.class, parser);
+        }

        if (ocrEnabled() && content instanceof AbstractFile) {
            AbstractFile file = ((AbstractFile) content);
@ -516,11 +534,11 @@ final class TikaTextExtractor implements TextExtractor {
        if (context != null) {
            ImageConfig configInstance = context.lookup(ImageConfig.class);
            if (configInstance != null) {
-                if(Objects.nonNull(configInstance.getOCREnabled())) {
+                if (Objects.nonNull(configInstance.getOCREnabled())) {
                    this.tesseractOCREnabled = configInstance.getOCREnabled();
                }
-                
-                if(Objects.nonNull(configInstance.getOCRLanguages())) {
+
+                if (Objects.nonNull(configInstance.getOCRLanguages())) {
                    this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
                }
            }