From 18ca03e8d1ab57a7d665a0844fcde38e7c1b0813 Mon Sep 17 00:00:00 2001
From: apriestman
Date: Wed, 7 Oct 2020 09:11:20 -0400
Subject: [PATCH] Extract inline images

---
 .../embeddedfileextractor/PDFAttachmentExtractor.java | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/PDFAttachmentExtractor.java b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/PDFAttachmentExtractor.java
index a36b5c365d..154d679596 100755
--- a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/PDFAttachmentExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/PDFAttachmentExtractor.java
@@ -34,6 +34,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -76,9 +77,17 @@ final class PDFAttachmentExtractor {
         ExtractionPreconditions.checkArgument(Files.exists(outputDir),
                 String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS
 
+
+
         ParseContext parseContext = new ParseContext();
         parseContext.set(Parser.class, parser);
 
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+        pdfConfig.setExtractUniqueInlineImagesOnly(true);
+
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+
         //Keep track of the attachment files as they are being extracted and written to disk.
         NewResourceWatcher watcher = new NewResourceWatcher();
         parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));