Merge pull request #4758 from dannysmyda/4975-PDF-Attachment-Extractor

4975 pdf attachment extractor
This commit is contained in:
Richard Cordovano 2019-05-17 09:23:46 -04:00 committed by GitHub
commit 0cff372c7d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 253 additions and 16 deletions

View File

@ -22,8 +22,10 @@ import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -33,13 +35,11 @@ import org.apache.commons.io.IOUtils;
import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hslf.usermodel.HSLFPictureData; import org.apache.poi.hslf.usermodel.HSLFPictureData;
import org.apache.poi.hslf.usermodel.HSLFSlideShow; import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable; import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.sl.usermodel.PictureData.PictureType; import org.apache.poi.sl.usermodel.PictureData.PictureType;
import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.util.RecordFormatException;
import org.apache.tika.config.TikaConfig; import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector; import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaException;
@ -72,13 +72,13 @@ import org.xml.sax.SAXException;
/** /**
* Extracts embedded content (e.g. images, audio, video) from Microsoft Office * Extracts embedded content (e.g. images, audio, video) from Microsoft Office
* documents (both original and OOXML forms). * documents (both original and OOXML forms) and PDF documents.
*/ */
class MSOfficeEmbeddedContentExtractor { class DocumentEmbeddedContentExtractor {
private final FileManager fileManager; private final FileManager fileManager;
private final IngestServices services; private final IngestServices services;
private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName()); private static final Logger LOGGER = Logger.getLogger(DocumentEmbeddedContentExtractor.class.getName());
private final IngestJobContext context; private final IngestJobContext context;
private String parentFileName; private String parentFileName;
private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
@ -101,7 +101,8 @@ class MSOfficeEmbeddedContentExtractor {
PPT("application/vnd.ms-powerpoint"), //NON-NLS PPT("application/vnd.ms-powerpoint"), //NON-NLS
PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
XLS("application/vnd.ms-excel"), //NON-NLS XLS("application/vnd.ms-excel"), //NON-NLS
XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); //NON-NLS XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), //NON-NLS
PDF("application/pdf"); //NON-NLS
private final String mimeType; private final String mimeType;
@ -116,7 +117,7 @@ class MSOfficeEmbeddedContentExtractor {
} }
private SupportedExtractionFormats abstractFileExtractionFormat; private SupportedExtractionFormats abstractFileExtractionFormat;
MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException { DocumentEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager(); this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
this.services = IngestServices.getInstance(); this.services = IngestServices.getInstance();
@ -190,6 +191,9 @@ class MSOfficeEmbeddedContentExtractor {
case XLS: case XLS:
listOfExtractedImages = extractImagesFromXls(abstractFile); listOfExtractedImages = extractImagesFromXls(abstractFile);
break; break;
case PDF:
listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
break;
default: default:
break; break;
} }
@ -471,6 +475,38 @@ class MSOfficeEmbeddedContentExtractor {
} }
/**
 * Extracts embedded attachments from a PDF file and describes each one as
 * an ExtractedFile so the existing derived-file logic can pick them up.
 *
 * Extraction failures are logged and swallowed on purpose: a PDF that
 * cannot be parsed must not abort processing of the remaining files.
 *
 * @param abstractFile Input PDF file
 * @return List of extracted files to be made into derived file instances;
 *         an empty list if the attachments could not be extracted.
 */
private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
    PDFAttachmentExtractor pdfExtractor = new PDFAttachmentExtractor(parser);
    //The content stream is opened here, so it is closed here as well.
    try (InputStream pdfContent = new ReadContentInputStream(abstractFile)) {
        Path outputDirectory = Paths.get(getOutputFolderPath(parentFileName));
        //Get map of attachment name -> location on disk.
        Map<String, Path> extractedAttachments = pdfExtractor.extract(
                pdfContent, abstractFile.getId(), outputDirectory);

        //Convert output to hook into the existing logic for creating derived files
        List<ExtractedFile> extractedFiles = new ArrayList<>(extractedAttachments.size());
        for (Map.Entry<String, Path> attachment : extractedAttachments.entrySet()) {
            String fileName = attachment.getKey();
            Path writeLocation = attachment.getValue();
            extractedFiles.add(new ExtractedFile(fileName,
                    getFileRelativePath(writeLocation.getFileName().toString()),
                    writeLocation.toFile().length()));
        }
        return extractedFiles;
    } catch (IOException | SAXException | TikaException ex) {
        //Include the object id so the failing PDF can be identified from the log.
        LOGGER.log(Level.WARNING, String.format(
                "Error attempting to extract attachments from PDF (object id = %d)", //NON-NLS
                abstractFile.getId()), ex);
    }
    return Collections.emptyList();
}
/** /**
* Writes image to the module output location. * Writes image to the module output location.
* *

View File

@ -50,7 +50,7 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
//Outer concurrent hashmap with keys of JobID, inner concurrentHashmap with keys of objectID //Outer concurrent hashmap with keys of JobID, inner concurrentHashmap with keys of objectID
private static final ConcurrentHashMap<Long, ConcurrentHashMap<Long, Archive>> mapOfDepthTrees = new ConcurrentHashMap<>(); private static final ConcurrentHashMap<Long, ConcurrentHashMap<Long, Archive>> mapOfDepthTrees = new ConcurrentHashMap<>();
private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter(); private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
private MSOfficeEmbeddedContentExtractor officeExtractor; private DocumentEmbeddedContentExtractor documentExtractor;
private SevenZipExtractor archiveExtractor; private SevenZipExtractor archiveExtractor;
private FileTypeDetector fileTypeDetector; private FileTypeDetector fileTypeDetector;
private long jobId; private long jobId;
@ -115,10 +115,10 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
} }
/* /*
* Construct an embedded content extractor for processing Microsoft * Construct an embedded content extractor for processing Microsoft
* Office documents. * Office documents and PDF documents.
*/ */
try { try {
this.officeExtractor = new MSOfficeEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute); this.documentExtractor = new DocumentEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
} catch (NoCurrentCaseException ex) { } catch (NoCurrentCaseException ex) {
throw new IngestModuleException(Bundle.EmbeddedFileExtractorIngestModule_UnableToGetMSOfficeExtractor_errMsg(), ex); throw new IngestModuleException(Bundle.EmbeddedFileExtractorIngestModule_UnableToGetMSOfficeExtractor_errMsg(), ex);
} }
@ -155,8 +155,8 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
*/ */
if (archiveExtractor.isSevenZipExtractionSupported(abstractFile)) { if (archiveExtractor.isSevenZipExtractionSupported(abstractFile)) {
archiveExtractor.unpack(abstractFile, mapOfDepthTrees.get(jobId)); archiveExtractor.unpack(abstractFile, mapOfDepthTrees.get(jobId));
} else if (officeExtractor.isContentExtractionSupported(abstractFile)) { } else if (documentExtractor.isContentExtractionSupported(abstractFile)) {
officeExtractor.extractEmbeddedContent(abstractFile); documentExtractor.extractEmbeddedContent(abstractFile);
} }
return ProcessResult.OK; return ProcessResult.OK;
} }

View File

@ -0,0 +1,183 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2019 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.modules.embeddedfileextractor;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.EncodedFileOutputStream;
import org.sleuthkit.datamodel.TskData;
/**
 * Facility for extracting and storing attachments from PDF documents.
 * Implementation specifics, however, are generic enough to be used on any
 * document with embedded resources. The current name reflects the only known
 * use case for this class.
 */
final class PDFAttachmentExtractor {

    private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
    private final AutoDetectParser parser;

    /**
     * Creates an extractor backed by a default AutoDetectParser.
     */
    public PDFAttachmentExtractor() {
        this(new AutoDetectParser());
    }

    /**
     * Creates an extractor that parses with the supplied parser.
     *
     * @param parser Parser used for the document and its embedded resources
     */
    public PDFAttachmentExtractor(AutoDetectParser parser) {
        this.parser = parser;
    }

    /**
     * Extracts PDF attachments from a given input and writes them to the
     * supplied output directory. The input stream is not closed by this
     * method; that remains the caller's responsibility.
     *
     * @param input     Input PDF to extract attachments from
     * @param parentID  ID for unique extraction names
     * @param outputDir Directory to write attachments
     *
     * @return Map containing display name -> location on disk for every
     *         attachment that was written successfully
     *
     * @throws IOException   If the output directory does not exist or the
     *                       input cannot be read
     * @throws SAXException  If the parser fails while handling content
     * @throws TikaException If the document cannot be parsed
     */
    public Map<String, Path> extract(InputStream input, long parentID, Path outputDir) throws IOException, SAXException, TikaException {
        ExtractionPreconditions.checkArgument(Files.exists(outputDir),
                String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS

        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);

        //Keep track of the attachment files as they are being extracted and written to disk.
        NewResourceWatcher watcher = new NewResourceWatcher();
        parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));

        //Parse input with default params, except for our ParseContext
        parser.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext);

        return watcher.getSnapshot();
    }

    /**
     * Internal Tika class that is invoked upon encountering an embedded
     * resource. Every resource is written, XOR1 encoded, into the output
     * directory and reported to the watcher once the write has completed.
     */
    static class EmbeddedAttachmentHandler implements EmbeddedDocumentExtractor {

        private final Path outputDirectory;
        private final NewResourceWatcher watcher;
        private final long parentID;
        private int attachmentCount;

        public EmbeddedAttachmentHandler(Path outputDirectory, long parentID, NewResourceWatcher watcher) {
            this.outputDirectory = outputDirectory;
            this.watcher = watcher;
            this.parentID = parentID;
            this.attachmentCount = 0;
        }

        @Override
        public boolean shouldParseEmbedded(Metadata mtdt) {
            //Grab every available attachment
            return true;
        }

        @Override
        public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, boolean bln) throws SAXException, IOException {
            //The on-disk name is built from the parent ID and a running counter,
            //so it is unique among the attachments of this parent.
            String uniqueExtractedName = parentID + "_attch_" + attachmentCount++; //NON-NLS

            String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
            String ext = FilenameUtils.getExtension(name);

            //Append the extension if we can.
            if (ext != null && !ext.isEmpty()) {
                uniqueExtractedName += "." + ext;
            }

            //Without a usable resource name, display the generated name instead.
            if (name == null || name.isEmpty()) {
                name = uniqueExtractedName;
            }

            Path outputFile = outputDirectory.resolve(uniqueExtractedName);

            try (EncodedFileOutputStream outputStream = new EncodedFileOutputStream(
                    new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)) {
                IOUtils.copy(in, outputStream);
            } catch (IOException ex) {
                logger.log(Level.WARNING, String.format("Could not extract attachment %s into directory %s", //NON-NLS
                        uniqueExtractedName, outputDirectory), ex);
                return;
            }

            //Only report the attachment after the stream has been closed without error,
            //so a failed or partial write is never handed back to the caller.
            watcher.notify(name, outputFile);
        }
    }

    /**
     * Convenient wrapper for keeping track of new resource paths and the
     * display name for each of these resources.
     *
     * It is necessary to maintain a snapshot of only our changes when the
     * output directory is shared among other processes/threads.
     */
    static class NewResourceWatcher {

        private final Map<String, Path> newResourcePaths;

        public NewResourceWatcher() {
            newResourcePaths = new HashMap<>();
        }

        /**
         * Records a newly written resource under its display name. If a
         * different resource is already recorded under that name, the new
         * one is recorded under its on-disk file name instead, so that
         * neither resource is dropped from the snapshot.
         *
         * @param name        Display name of the resource
         * @param newResource Location of the resource on disk
         */
        public void notify(String name, Path newResource) {
            Path alreadyRecorded = newResourcePaths.putIfAbsent(name, newResource);
            if (alreadyRecorded != null && !alreadyRecorded.equals(newResource)) {
                newResourcePaths.put(newResource.getFileName().toString(), newResource);
            }
        }

        /**
         * @return A copy of the display name -> location mapping recorded so
         *         far; later notifications do not affect the returned map.
         */
        public Map<String, Path> getSnapshot() {
            return new HashMap<>(newResourcePaths);
        }
    }

    /**
     * Static convenience methods that ensure the PDF extractor is being
     * invoked correctly.
     */
    static class ExtractionPreconditions {

        /**
         * Fails with an IOException carrying the given message when the
         * expression is false.
         *
         * @param expression Condition that must hold
         * @param msg        Message for the exception
         *
         * @throws IOException If the expression is false
         */
        public static void checkArgument(boolean expression, String msg) throws IOException {
            if (!expression) {
                throw new IOException(msg);
            }
        }

        private ExtractionPreconditions() {
        }
    }
}

View File

@ -46,6 +46,7 @@ import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser; import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParsingReader; import org.apache.tika.parser.ParsingReader;
@ -125,6 +126,16 @@ final class TikaTextExtractor implements TextExtractor {
"application/x-z", //NON-NLS "application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS "application/x-compress"); //NON-NLS
//Tika should ignore types with embedded files that can be handled by the unpacking modules.
//When the content being read is an AbstractFile whose MIME type is in this list, the parse
//context is given an EmptyParser in place of the regular parser, so the text of embedded
//files is not appended to the output of the containing document.
private static final List<String> EMBEDDED_FILE_MIME_TYPES
= ImmutableList.of("application/msword", //NON-NLS
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", //NON-NLS
"application/vnd.ms-powerpoint", //NON-NLS
"application/vnd.openxmlformats-officedocument.presentationml.presentation", //NON-NLS
"application/vnd.ms-excel", //NON-NLS
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", //NON-NLS
"application/pdf"); //NON-NLS
private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName()); private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
@ -184,7 +195,14 @@ final class TikaTextExtractor implements TextExtractor {
InputStream stream = null; InputStream stream = null;
ParseContext parseContext = new ParseContext(); ParseContext parseContext = new ParseContext();
parseContext.set(Parser.class, parser);
//Disable appending embedded file text to output for EFE supported types
//JIRA-4975
if(content instanceof AbstractFile && EMBEDDED_FILE_MIME_TYPES.contains(((AbstractFile)content).getMIMEType())) {
parseContext.set(Parser.class, new EmptyParser());
} else {
parseContext.set(Parser.class, parser);
}
if (ocrEnabled() && content instanceof AbstractFile) { if (ocrEnabled() && content instanceof AbstractFile) {
AbstractFile file = ((AbstractFile) content); AbstractFile file = ((AbstractFile) content);
@ -516,11 +534,11 @@ final class TikaTextExtractor implements TextExtractor {
if (context != null) { if (context != null) {
ImageConfig configInstance = context.lookup(ImageConfig.class); ImageConfig configInstance = context.lookup(ImageConfig.class);
if (configInstance != null) { if (configInstance != null) {
if(Objects.nonNull(configInstance.getOCREnabled())) { if (Objects.nonNull(configInstance.getOCREnabled())) {
this.tesseractOCREnabled = configInstance.getOCREnabled(); this.tesseractOCREnabled = configInstance.getOCREnabled();
} }
if(Objects.nonNull(configInstance.getOCRLanguages())) { if (Objects.nonNull(configInstance.getOCRLanguages())) {
this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages()); this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
} }
} }