Extract inline images from PDFs

2025-07-12 07:56:16 +00:00 · 2020-10-09 16:47:48 -04:00 · 2020-10-09 16:47:48 -04:00 · 89fcf04cb7
commit 89fcf04cb7
parent 2d239849b1
1 changed files with 7 additions and 0 deletions
--- a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/PDFAttachmentExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/PDFAttachmentExtractor.java
@@ -34,6 +34,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -79,6 +80,12 @@ final class PDFAttachmentExtractor {
        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);

+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+        pdfConfig.setExtractUniqueInlineImagesOnly(true);
+
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+        
        //Keep track of the attachment files as they are being extracted and written to disk.
        NewResourceWatcher watcher = new NewResourceWatcher();
        parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));