Merge pull request #4758 from dannysmyda/4975-PDF-Attachment-Extractor

4975 pdf attachment extractor
This commit is contained in:
Richard Cordovano 2019-05-17 09:23:46 -04:00 committed by GitHub
commit 0cff372c7d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 253 additions and 16 deletions

View File

@ -22,8 +22,10 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -33,13 +35,11 @@ import org.apache.commons.io.IOUtils;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hslf.usermodel.HSLFPictureData;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.sl.usermodel.PictureData.PictureType;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.util.RecordFormatException;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
@ -72,13 +72,13 @@ import org.xml.sax.SAXException;
/**
* Extracts embedded content (e.g. images, audio, video) from Microsoft Office
* documents (both original and OOXML forms).
* documents (both original and OOXML forms) and PDF documents.
*/
class MSOfficeEmbeddedContentExtractor {
class DocumentEmbeddedContentExtractor {
private final FileManager fileManager;
private final IngestServices services;
private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
private static final Logger LOGGER = Logger.getLogger(DocumentEmbeddedContentExtractor.class.getName());
private final IngestJobContext context;
private String parentFileName;
private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
@ -101,7 +101,8 @@ class MSOfficeEmbeddedContentExtractor {
PPT("application/vnd.ms-powerpoint"), //NON-NLS
PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
XLS("application/vnd.ms-excel"), //NON-NLS
XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); //NON-NLS
XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), //NON-NLS
PDF("application/pdf"); //NON-NLS
private final String mimeType;
@ -116,7 +117,7 @@ class MSOfficeEmbeddedContentExtractor {
}
private SupportedExtractionFormats abstractFileExtractionFormat;
MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
DocumentEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
this.services = IngestServices.getInstance();
@ -190,6 +191,9 @@ class MSOfficeEmbeddedContentExtractor {
case XLS:
listOfExtractedImages = extractImagesFromXls(abstractFile);
break;
case PDF:
listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
break;
default:
break;
}
@ -470,6 +474,38 @@ class MSOfficeEmbeddedContentExtractor {
return listOfExtractedImages;
}
/**
 * Extracts embedded attachments from PDF files.
 *
 * The attachments are written to the output folder derived from the current
 * parent file name and are then described as ExtractedFile instances so the
 * existing derived-file logic can pick them up. Any IOException,
 * SAXException or TikaException raised during extraction is logged as a
 * warning and results in an empty list.
 *
 * @param abstractFile Input PDF file
 *
 * @return List of extracted files to be made into derived file instances;
 *         empty if extraction failed.
 */
private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
    PDFAttachmentExtractor pdfExtractor = new PDFAttachmentExtractor(parser);
    //Close the content stream once parsing is done, whether or not it succeeds.
    try (InputStream pdfContent = new ReadContentInputStream(abstractFile)) {
        Path outputDirectory = Paths.get(getOutputFolderPath(parentFileName));
        //Get map of attachment name -> location on disk.
        Map<String, Path> extractedAttachments = pdfExtractor.extract(
                pdfContent, abstractFile.getId(), outputDirectory);

        //Convert output to hook into the existing logic for creating derived files
        List<ExtractedFile> extractedFiles = new ArrayList<>();
        for (Map.Entry<String, Path> attachment : extractedAttachments.entrySet()) {
            String fileName = attachment.getKey();
            Path writeLocation = attachment.getValue();
            //NOTE(review): the size reported here is the on-disk length of the
            //written file. The extractor writes through an encoding stream, so
            //confirm this matches the attachment's logical size.
            extractedFiles.add(new ExtractedFile(fileName,
                    getFileRelativePath(writeLocation.getFileName().toString()),
                    writeLocation.toFile().length()));
        }
        return extractedFiles;
    } catch (IOException | SAXException | TikaException ex) {
        LOGGER.log(Level.WARNING, "Error attempting to extract attachments from PDFs", ex); //NON-NLS
    }
    return Collections.emptyList();
}
/**
* Writes image to the module output location.

View File

@ -50,7 +50,7 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
//Outer concurrent hashmap with keys of JobID, inner concurrentHashmap with keys of objectID
private static final ConcurrentHashMap<Long, ConcurrentHashMap<Long, Archive>> mapOfDepthTrees = new ConcurrentHashMap<>();
private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
private MSOfficeEmbeddedContentExtractor officeExtractor;
private DocumentEmbeddedContentExtractor documentExtractor;
private SevenZipExtractor archiveExtractor;
private FileTypeDetector fileTypeDetector;
private long jobId;
@ -115,10 +115,10 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
}
/*
* Construct an embedded content extractor for processing Microsoft
* Office documents.
* Office documents and PDF documents.
*/
try {
this.officeExtractor = new MSOfficeEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
this.documentExtractor = new DocumentEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
} catch (NoCurrentCaseException ex) {
throw new IngestModuleException(Bundle.EmbeddedFileExtractorIngestModule_UnableToGetMSOfficeExtractor_errMsg(), ex);
}
@ -155,8 +155,8 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
*/
if (archiveExtractor.isSevenZipExtractionSupported(abstractFile)) {
archiveExtractor.unpack(abstractFile, mapOfDepthTrees.get(jobId));
} else if (officeExtractor.isContentExtractionSupported(abstractFile)) {
officeExtractor.extractEmbeddedContent(abstractFile);
} else if (documentExtractor.isContentExtractionSupported(abstractFile)) {
documentExtractor.extractEmbeddedContent(abstractFile);
}
return ProcessResult.OK;
}

View File

@ -0,0 +1,183 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2019 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.modules.embeddedfileextractor;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.EncodedFileOutputStream;
import org.sleuthkit.datamodel.TskData;
/**
 * Facility for extracting and storing attachments from PDF documents.
 * Implementation specifics, however, are generic enough to be used on any
 * document with embedded resources. The current name reflects the only known
 * use case for this class.
 */
final class PDFAttachmentExtractor {

    private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
    private final AutoDetectParser parser;

    /**
     * Creates an extractor backed by a default AutoDetectParser.
     */
    public PDFAttachmentExtractor() {
        parser = new AutoDetectParser();
    }

    /**
     * Creates an extractor that parses with the supplied parser.
     *
     * @param parser Parser used for the document and registered in the parse
     *               context.
     */
    public PDFAttachmentExtractor(AutoDetectParser parser) {
        this.parser = parser;
    }

    /**
     * Extracts PDF attachments from a given input and writes them to the
     * supplied output directory.
     *
     * The returned map is keyed by the attachment's resource name, so
     * attachments that share a name overwrite each other's entry (the files
     * themselves are still written under distinct generated names).
     *
     * @param input     Input PDF to extract attachments from
     * @param parentID  ID for unique extraction names
     * @param outputDir Directory to write attachments
     *
     * @return Map containing file name -> location on disk
     *
     * @throws IOException   If the output directory does not exist or the
     *                       input cannot be read
     * @throws SAXException  If the parser reports a SAX error
     * @throws TikaException If the parser fails to process the input
     */
    public Map<String, Path> extract(InputStream input, long parentID, Path outputDir) throws IOException, SAXException, TikaException {
        ExtractionPreconditions.checkArgument(Files.exists(outputDir),
                String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS

        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);

        //Keep track of the attachment files as they are being extracted and written to disk.
        NewResourceWatcher watcher = new NewResourceWatcher();
        parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));

        //Parse input with default params, except for our ParseContext
        parser.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext);

        return watcher.getSnapshot();
    }

    /**
     * Internal Tika class that is invoked upon encountering an embedded
     * resource.
     */
    static class EmbeddedAttachmentHandler implements EmbeddedDocumentExtractor {

        private final Path outputDirectory;
        private final NewResourceWatcher watcher;
        //Primitives avoid needless boxing on every attachment.
        private final long parentID;
        private int attachmentCount;

        /**
         * @param outputDirectory Directory the attachments are written to
         * @param parentID        ID prefixed to every generated file name
         * @param watcher         Receives the name and path of each
         *                        attachment that was written successfully
         */
        public EmbeddedAttachmentHandler(Path outputDirectory, long parentID, NewResourceWatcher watcher) {
            this.outputDirectory = outputDirectory;
            this.watcher = watcher;
            this.parentID = parentID;
            attachmentCount = 0;
        }

        @Override
        public boolean shouldParseEmbedded(Metadata mtdt) {
            //Grab every available attachment
            return true;
        }

        /**
         * Writes the embedded resource to the output directory under a
         * generated name and reports it to the watcher. A failure to write
         * is logged and the resource is not reported.
         */
        @Override
        public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, boolean bln) throws SAXException, IOException {
            //Resource naming scheme is used internally in autopsy, therefore we can guarantee uniqueness.
            String uniqueExtractedName = parentID + "_attch_" + attachmentCount++; //NON-NLS

            String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
            String ext = FilenameUtils.getExtension(name);

            if (name == null || name.isEmpty()) {
                //No usable display name in the metadata, fall back to the generated one.
                name = uniqueExtractedName;
            } else if (ext != null && !ext.isEmpty()) {
                //Append the extension if we can.
                uniqueExtractedName += "." + ext;
            }

            Path outputFile = outputDirectory.resolve(uniqueExtractedName);

            try (EncodedFileOutputStream outputStream = new EncodedFileOutputStream(
                    new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)) {
                IOUtils.copy(in, outputStream);
                //Only report the attachment once it has been fully copied.
                watcher.notify(name, outputFile);
            } catch (IOException ex) {
                logger.log(Level.WARNING, String.format("Could not extract attachment %s into directory %s", //NON-NLS
                        uniqueExtractedName, outputDirectory), ex);
            }
        }
    }

    /**
     * Convenient wrapper for keeping track of new resource paths and the
     * display name for each of these resources.
     *
     * It is necessary to maintain a snapshot of only our changes when the
     * output directory is shared among other processes/threads.
     */
    static class NewResourceWatcher {

        private final Map<String, Path> newResourcePaths;

        public NewResourceWatcher() {
            newResourcePaths = new HashMap<>();
        }

        /**
         * Records a newly written resource. A later resource with the same
         * name replaces the earlier entry.
         *
         * @param name        Display name of the resource
         * @param newResource Location the resource was written to
         */
        public void notify(String name, Path newResource) {
            newResourcePaths.put(name, newResource);
        }

        /**
         * @return A copy of the resources recorded so far; later
         *         notifications do not alter a previously returned map.
         */
        public Map<String, Path> getSnapshot() {
            return new HashMap<>(newResourcePaths);
        }
    }

    /**
     * Static convenience methods that ensure the PDF extractor is being
     * invoked correctly.
     */
    static class ExtractionPreconditions {

        /**
         * @param expression Condition that must hold
         * @param msg        Message of the exception thrown otherwise
         *
         * @throws IOException If expression is false
         */
        public static void checkArgument(boolean expression, String msg) throws IOException {
            if (!expression) {
                throw new IOException(msg);
            }
        }

        private ExtractionPreconditions() {
        }
    }
}

View File

@ -46,6 +46,7 @@ import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParsingReader;
@ -125,6 +126,16 @@ final class TikaTextExtractor implements TextExtractor {
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
//Tika should ignore types with embedded files that can be handled by the unpacking modules.
//Covers the legacy and OOXML Word, PowerPoint and Excel MIME types plus PDF.
//For an AbstractFile whose MIME type is in this list, an EmptyParser is
//registered in the ParseContext in place of the regular parser.
private static final List<String> EMBEDDED_FILE_MIME_TYPES
= ImmutableList.of("application/msword", //NON-NLS
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", //NON-NLS
"application/vnd.ms-powerpoint", //NON-NLS
"application/vnd.openxmlformats-officedocument.presentationml.presentation", //NON-NLS
"application/vnd.ms-excel", //NON-NLS
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", //NON-NLS
"application/pdf"); //NON-NLS
private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
@ -184,7 +195,14 @@ final class TikaTextExtractor implements TextExtractor {
InputStream stream = null;
ParseContext parseContext = new ParseContext();
parseContext.set(Parser.class, parser);
//Disable appending embedded file text to output for EFE supported types
//JIRA-4975
if(content instanceof AbstractFile && EMBEDDED_FILE_MIME_TYPES.contains(((AbstractFile)content).getMIMEType())) {
parseContext.set(Parser.class, new EmptyParser());
} else {
parseContext.set(Parser.class, parser);
}
if (ocrEnabled() && content instanceof AbstractFile) {
AbstractFile file = ((AbstractFile) content);
@ -516,11 +534,11 @@ final class TikaTextExtractor implements TextExtractor {
if (context != null) {
ImageConfig configInstance = context.lookup(ImageConfig.class);
if (configInstance != null) {
if(Objects.nonNull(configInstance.getOCREnabled())) {
if (Objects.nonNull(configInstance.getOCREnabled())) {
this.tesseractOCREnabled = configInstance.getOCREnabled();
}
if(Objects.nonNull(configInstance.getOCRLanguages())) {
if (Objects.nonNull(configInstance.getOCRLanguages())) {
this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
}
}