diff --git a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/MSOfficeEmbeddedContentExtractor.java b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/DocumentEmbeddedContentExtractor.java similarity index 92% rename from Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/MSOfficeEmbeddedContentExtractor.java rename to Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/DocumentEmbeddedContentExtractor.java index d9c142563b..a362c5789f 100644 --- a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/MSOfficeEmbeddedContentExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/DocumentEmbeddedContentExtractor.java @@ -22,8 +22,10 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -33,13 +35,11 @@ import org.apache.commons.io.IOUtils; import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hslf.usermodel.HSLFPictureData; import org.apache.poi.hslf.usermodel.HSLFSlideShow; -import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.model.PicturesTable; import org.apache.poi.sl.usermodel.PictureData.PictureType; import org.apache.poi.ss.usermodel.Workbook; -import org.apache.poi.util.RecordFormatException; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; @@ -72,13 +72,13 @@ import org.xml.sax.SAXException; /** * Extracts embedded content (e.g. images, audio, video) from Microsoft Office - * documents (both original and OOXML forms). + * documents (both original and OOXML forms) and PDF documents. 
*/ -class MSOfficeEmbeddedContentExtractor { +class DocumentEmbeddedContentExtractor { private final FileManager fileManager; private final IngestServices services; - private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName()); + private static final Logger LOGGER = Logger.getLogger(DocumentEmbeddedContentExtractor.class.getName()); private final IngestJobContext context; private String parentFileName; private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS @@ -101,7 +101,8 @@ class MSOfficeEmbeddedContentExtractor { PPT("application/vnd.ms-powerpoint"), //NON-NLS PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS XLS("application/vnd.ms-excel"), //NON-NLS - XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); //NON-NLS + XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), //NON-NLS + PDF("application/pdf"); //NON-NLS private final String mimeType; @@ -116,7 +117,7 @@ class MSOfficeEmbeddedContentExtractor { } private SupportedExtractionFormats abstractFileExtractionFormat; - MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException { + DocumentEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException { this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager(); this.services = IngestServices.getInstance(); @@ -190,6 +191,9 @@ class MSOfficeEmbeddedContentExtractor { case XLS: listOfExtractedImages = extractImagesFromXls(abstractFile); break; + case PDF: + listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile); + break; default: break; } @@ -470,6 +474,38 @@ class MSOfficeEmbeddedContentExtractor { return listOfExtractedImages; } + + /** + * Extracts embedded 
attachments from PDF files. + * + * @param abstractFile Input PDF file + * @return List of extracted files to be made into derived file instances. + */ + private List extractEmbeddedContentFromPDF(AbstractFile abstractFile) { + PDFAttachmentExtractor pdfExtractor = new PDFAttachmentExtractor(parser); + try { + Path outputDirectory = Paths.get(getOutputFolderPath(parentFileName)); + //Get map of attachment name -> location on disk. + Map extractedAttachments = pdfExtractor.extract( + new ReadContentInputStream(abstractFile), abstractFile.getId(), + outputDirectory); + + //Convert output to hook into the existing logic for creating derived files + List extractedFiles = new ArrayList<>(); + extractedAttachments.entrySet().forEach((pathEntry) -> { + String fileName = pathEntry.getKey(); + Path writeLocation = pathEntry.getValue(); + extractedFiles.add(new ExtractedFile(fileName, + getFileRelativePath(writeLocation.getFileName().toString()), + writeLocation.toFile().length())); + }); + + return extractedFiles; + } catch (IOException | SAXException | TikaException ex) { + LOGGER.log(Level.WARNING, "Error attempting to extract attachments from PDFs", ex); //NON-NLS + } + return Collections.emptyList(); + } /** * Writes image to the module output location. 
diff --git a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/EmbeddedFileExtractorIngestModule.java b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/EmbeddedFileExtractorIngestModule.java index 66c7f7030d..fd833b59a7 100644 --- a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/EmbeddedFileExtractorIngestModule.java +++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/EmbeddedFileExtractorIngestModule.java @@ -50,7 +50,7 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda //Outer concurrent hashmap with keys of JobID, inner concurrentHashmap with keys of objectID private static final ConcurrentHashMap> mapOfDepthTrees = new ConcurrentHashMap<>(); private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter(); - private MSOfficeEmbeddedContentExtractor officeExtractor; + private DocumentEmbeddedContentExtractor documentExtractor; private SevenZipExtractor archiveExtractor; private FileTypeDetector fileTypeDetector; private long jobId; @@ -115,10 +115,10 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda } /* * Construct an embedded content extractor for processing Microsoft - * Office documents. + * Office documents and PDF documents. 
*/ try { - this.officeExtractor = new MSOfficeEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute); + this.documentExtractor = new DocumentEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute); } catch (NoCurrentCaseException ex) { throw new IngestModuleException(Bundle.EmbeddedFileExtractorIngestModule_UnableToGetMSOfficeExtractor_errMsg(), ex); } @@ -155,8 +155,8 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda */ if (archiveExtractor.isSevenZipExtractionSupported(abstractFile)) { archiveExtractor.unpack(abstractFile, mapOfDepthTrees.get(jobId)); - } else if (officeExtractor.isContentExtractionSupported(abstractFile)) { - officeExtractor.extractEmbeddedContent(abstractFile); + } else if (documentExtractor.isContentExtractionSupported(abstractFile)) { + documentExtractor.extractEmbeddedContent(abstractFile); } return ProcessResult.OK; } diff --git a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/PDFAttachmentExtractor.java b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/PDFAttachmentExtractor.java new file mode 100755 index 0000000000..a36b5c365d --- /dev/null +++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/PDFAttachmentExtractor.java @@ -0,0 +1,183 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2019 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.sleuthkit.autopsy.modules.embeddedfileextractor; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.Map; +import java.util.logging.Level; +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.io.IOUtils; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.sleuthkit.autopsy.coreutils.Logger; +import org.sleuthkit.datamodel.EncodedFileOutputStream; +import org.sleuthkit.datamodel.TskData; + +/** + * Facility for extracting and storing attachments from PDF documents. + * Implementation specifics, however, are generic enough to be used on any + * document with embedded resources. The current name reflects the only known + * use case for this class. + */ +final class PDFAttachmentExtractor { + + private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName()); + private final AutoDetectParser parser; + + public PDFAttachmentExtractor() { + parser = new AutoDetectParser(); + } + + public PDFAttachmentExtractor(AutoDetectParser parser) { + this.parser = parser; + } + + /** + * Extracts PDF attachments from a given input and writes them to the supplied + * output directory. 
+ * + * @param input Input PDF to extract attachments from + * @param parentID ID for unique extraction names + * @param outputDir Directory to write attachments + * @return Map containing file name -> location on disk + * @throws IOException + * @throws SAXException + * @throws TikaException + */ + public Map extract(InputStream input, long parentID, Path outputDir) throws IOException, SAXException, TikaException { + ExtractionPreconditions.checkArgument(Files.exists(outputDir), + String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS + + ParseContext parseContext = new ParseContext(); + parseContext.set(Parser.class, parser); + + //Keep track of the attachment files as they are being extracted and written to disk. + NewResourceWatcher watcher = new NewResourceWatcher(); + parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher)); + + //Parse input with default params, except for our ParseContext + parser.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext); + + return watcher.getSnapshot(); + } + + /** + * Internal Tika class that is invoked upon encountering an embedded + * resource. 
+ */ + static class EmbeddedAttachmentHandler implements EmbeddedDocumentExtractor { + + private final Path outputDirectory; + private final NewResourceWatcher watcher; + private final Long parentID; + private Integer attachmentCount; + + public EmbeddedAttachmentHandler(Path outputDirectory, long parentID, NewResourceWatcher watcher) { + this.outputDirectory = outputDirectory; + this.watcher = watcher; + this.parentID = parentID; + attachmentCount = 0; + } + + @Override + public boolean shouldParseEmbedded(Metadata mtdt) { + //Grab every available attachment + return true; + } + + @Override + public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, boolean bln) throws SAXException, IOException { + //Resource naming scheme is used internally in Autopsy, therefore we can guarantee uniqueness. + String uniqueExtractedName = parentID + "_attch_" + attachmentCount++; //NON-NLS + + String name = mtdt.get(Metadata.RESOURCE_NAME_KEY); + String ext = FilenameUtils.getExtension(name); + + //Append the extension if we can. + if(ext == null) { + name = uniqueExtractedName; + } else if(!ext.isEmpty()) { + uniqueExtractedName += "." + ext; + } + + Path outputFile = outputDirectory.resolve(uniqueExtractedName); + + try (EncodedFileOutputStream outputStream = new EncodedFileOutputStream( + new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)){ + IOUtils.copy(in, outputStream); + watcher.notify(name, outputFile); + } catch (IOException ex) { + logger.log(Level.WARNING, String.format("Could not extract attachment %s into directory %s", //NON-NLS + uniqueExtractedName, outputFile), ex); + } + } + } + + /** + * Convenient wrapper for keeping track of new resource paths and the display + * name for each of these resources. + * + * It is necessary to maintain a snapshot of only our changes when the + * output directory is shared among other processes/threads. 
+ */ + static class NewResourceWatcher { + + private final Map newResourcePaths; + + public NewResourceWatcher() { + newResourcePaths = new HashMap<>(); + } + + public void notify(String name, Path newResource) { + newResourcePaths.put(name, newResource); + } + + public Map getSnapshot() { + return newResourcePaths; + } + } + + /** + * Static convenience methods that ensure the PDF extractor is being invoked + * correctly. + */ + static class ExtractionPreconditions { + + public static void checkArgument(boolean expression, String msg) throws IOException { + if (!expression) { + throw new IOException(msg); + } + } + + private ExtractionPreconditions(){ + } + } +} diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java index b23ff86442..2d923cc719 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java @@ -46,6 +46,7 @@ import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParsingReader; @@ -125,6 +126,16 @@ final class TikaTextExtractor implements TextExtractor { "application/x-z", //NON-NLS "application/x-compress"); //NON-NLS + //Tika should ignore types with embedded files that can be handled by the unpacking modules + private static final List EMBEDDED_FILE_MIME_TYPES + = ImmutableList.of("application/msword", //NON-NLS + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", //NON-NLS + "application/vnd.ms-powerpoint", //NON-NLS + "application/vnd.openxmlformats-officedocument.presentationml.presentation", //NON-NLS + "application/vnd.ms-excel", //NON-NLS + 
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", //NON-NLS + "application/pdf"); //NON-NLS + private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName()); @@ -184,7 +195,14 @@ final class TikaTextExtractor implements TextExtractor { InputStream stream = null; ParseContext parseContext = new ParseContext(); - parseContext.set(Parser.class, parser); + + //Disable appending embedded file text to output for EFE supported types + //JIRA-4975 + if(content instanceof AbstractFile && EMBEDDED_FILE_MIME_TYPES.contains(((AbstractFile)content).getMIMEType())) { + parseContext.set(Parser.class, new EmptyParser()); + } else { + parseContext.set(Parser.class, parser); + } if (ocrEnabled() && content instanceof AbstractFile) { AbstractFile file = ((AbstractFile) content); @@ -516,11 +534,11 @@ final class TikaTextExtractor implements TextExtractor { if (context != null) { ImageConfig configInstance = context.lookup(ImageConfig.class); if (configInstance != null) { - if(Objects.nonNull(configInstance.getOCREnabled())) { + if (Objects.nonNull(configInstance.getOCREnabled())) { this.tesseractOCREnabled = configInstance.getOCREnabled(); } - - if(Objects.nonNull(configInstance.getOCRLanguages())) { + + if (Objects.nonNull(configInstance.getOCRLanguages())) { this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages()); } }