mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-12 16:06:15 +00:00
Merge pull request #4758 from dannysmyda/4975-PDF-Attachment-Extractor
4975 pdf attachment extractor
This commit is contained in:
commit
0cff372c7d
@ -22,8 +22,10 @@ import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -33,13 +35,11 @@ import org.apache.commons.io.IOUtils;
|
||||
import org.apache.poi.hwpf.usermodel.Picture;
|
||||
import org.apache.poi.hslf.usermodel.HSLFPictureData;
|
||||
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
||||
import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
|
||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.model.PicturesTable;
|
||||
import org.apache.poi.sl.usermodel.PictureData.PictureType;
|
||||
import org.apache.poi.ss.usermodel.Workbook;
|
||||
import org.apache.poi.util.RecordFormatException;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.detect.Detector;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
@ -72,13 +72,13 @@ import org.xml.sax.SAXException;
|
||||
|
||||
/**
|
||||
* Extracts embedded content (e.g. images, audio, video) from Microsoft Office
|
||||
* documents (both original and OOXML forms).
|
||||
* documents (both original and OOXML forms) and PDF documents.
|
||||
*/
|
||||
class MSOfficeEmbeddedContentExtractor {
|
||||
class DocumentEmbeddedContentExtractor {
|
||||
|
||||
private final FileManager fileManager;
|
||||
private final IngestServices services;
|
||||
private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
|
||||
private static final Logger LOGGER = Logger.getLogger(DocumentEmbeddedContentExtractor.class.getName());
|
||||
private final IngestJobContext context;
|
||||
private String parentFileName;
|
||||
private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
|
||||
@ -101,7 +101,8 @@ class MSOfficeEmbeddedContentExtractor {
|
||||
PPT("application/vnd.ms-powerpoint"), //NON-NLS
|
||||
PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
|
||||
XLS("application/vnd.ms-excel"), //NON-NLS
|
||||
XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); //NON-NLS
|
||||
XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), //NON-NLS
|
||||
PDF("application/pdf"); //NON-NLS
|
||||
|
||||
private final String mimeType;
|
||||
|
||||
@ -116,7 +117,7 @@ class MSOfficeEmbeddedContentExtractor {
|
||||
}
|
||||
private SupportedExtractionFormats abstractFileExtractionFormat;
|
||||
|
||||
MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
|
||||
DocumentEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
|
||||
|
||||
this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
|
||||
this.services = IngestServices.getInstance();
|
||||
@ -190,6 +191,9 @@ class MSOfficeEmbeddedContentExtractor {
|
||||
case XLS:
|
||||
listOfExtractedImages = extractImagesFromXls(abstractFile);
|
||||
break;
|
||||
case PDF:
|
||||
listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -470,6 +474,38 @@ class MSOfficeEmbeddedContentExtractor {
|
||||
return listOfExtractedImages;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts embedded attachments from PDF files.
|
||||
*
|
||||
* @param abstractFile Input PDF file
|
||||
* @return List of extracted files to be made into derived file instances.
|
||||
*/
|
||||
private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
|
||||
PDFAttachmentExtractor pdfExtractor = new PDFAttachmentExtractor(parser);
|
||||
try {
|
||||
Path outputDirectory = Paths.get(getOutputFolderPath(parentFileName));
|
||||
//Get map of attachment name -> location disk.
|
||||
Map<String, Path> extractedAttachments = pdfExtractor.extract(
|
||||
new ReadContentInputStream(abstractFile), abstractFile.getId(),
|
||||
outputDirectory);
|
||||
|
||||
//Convert output to hook into the existing logic for creating derived files
|
||||
List<ExtractedFile> extractedFiles = new ArrayList<>();
|
||||
extractedAttachments.entrySet().forEach((pathEntry) -> {
|
||||
String fileName = pathEntry.getKey();
|
||||
Path writeLocation = pathEntry.getValue();
|
||||
extractedFiles.add(new ExtractedFile(fileName,
|
||||
getFileRelativePath(writeLocation.getFileName().toString()),
|
||||
writeLocation.toFile().length()));
|
||||
});
|
||||
|
||||
return extractedFiles;
|
||||
} catch (IOException | SAXException | TikaException ex) {
|
||||
LOGGER.log(Level.WARNING, "Error attempting to extract attachments from PDFs", ex); //NON-NLS
|
||||
}
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes image to the module output location.
|
@ -50,7 +50,7 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
|
||||
//Outer concurrent hashmap with keys of JobID, inner concurrentHashmap with keys of objectID
|
||||
private static final ConcurrentHashMap<Long, ConcurrentHashMap<Long, Archive>> mapOfDepthTrees = new ConcurrentHashMap<>();
|
||||
private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
|
||||
private MSOfficeEmbeddedContentExtractor officeExtractor;
|
||||
private DocumentEmbeddedContentExtractor documentExtractor;
|
||||
private SevenZipExtractor archiveExtractor;
|
||||
private FileTypeDetector fileTypeDetector;
|
||||
private long jobId;
|
||||
@ -115,10 +115,10 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
|
||||
}
|
||||
/*
|
||||
* Construct an embedded content extractor for processing Microsoft
|
||||
* Office documents.
|
||||
* Office documents and PDF documents.
|
||||
*/
|
||||
try {
|
||||
this.officeExtractor = new MSOfficeEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
|
||||
this.documentExtractor = new DocumentEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
|
||||
} catch (NoCurrentCaseException ex) {
|
||||
throw new IngestModuleException(Bundle.EmbeddedFileExtractorIngestModule_UnableToGetMSOfficeExtractor_errMsg(), ex);
|
||||
}
|
||||
@ -155,8 +155,8 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
|
||||
*/
|
||||
if (archiveExtractor.isSevenZipExtractionSupported(abstractFile)) {
|
||||
archiveExtractor.unpack(abstractFile, mapOfDepthTrees.get(jobId));
|
||||
} else if (officeExtractor.isContentExtractionSupported(abstractFile)) {
|
||||
officeExtractor.extractEmbeddedContent(abstractFile);
|
||||
} else if (documentExtractor.isContentExtractionSupported(abstractFile)) {
|
||||
documentExtractor.extractEmbeddedContent(abstractFile);
|
||||
}
|
||||
return ProcessResult.OK;
|
||||
}
|
||||
|
@ -0,0 +1,183 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2019 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.modules.embeddedfileextractor;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.logging.Level;
|
||||
import org.apache.commons.io.FilenameUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
import org.sleuthkit.datamodel.EncodedFileOutputStream;
|
||||
import org.sleuthkit.datamodel.TskData;
|
||||
|
||||
/**
|
||||
* Facility for extracting and storing attachments from PDF documents.
|
||||
* Implementation specifics, however, are generic enough to be used on any
|
||||
* document with embedded resources. The current name reflects the only known
|
||||
* use case for this class.
|
||||
*/
|
||||
final class PDFAttachmentExtractor {
|
||||
|
||||
private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
|
||||
private final AutoDetectParser parser;
|
||||
|
||||
public PDFAttachmentExtractor() {
|
||||
parser = new AutoDetectParser();
|
||||
}
|
||||
|
||||
public PDFAttachmentExtractor(AutoDetectParser parser) {
|
||||
this.parser = parser;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts PDF attachments from a given input and writes them to the supplied
|
||||
* output directory.
|
||||
*
|
||||
* @param input Input PDF to extract attachments from
|
||||
* @param parentID ID for unique extraction names
|
||||
* @param outputDir Directory to write attachments
|
||||
* @return Map containing file name -> location on disk
|
||||
* @throws IOException
|
||||
* @throws SAXException
|
||||
* @throws TikaException
|
||||
*/
|
||||
public Map<String, Path> extract(InputStream input, long parentID, Path outputDir) throws IOException, SAXException, TikaException {
|
||||
ExtractionPreconditions.checkArgument(Files.exists(outputDir),
|
||||
String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS
|
||||
|
||||
ParseContext parseContext = new ParseContext();
|
||||
parseContext.set(Parser.class, parser);
|
||||
|
||||
//Keep track of the attachment files as they are being extracted and written to disk.
|
||||
NewResourceWatcher watcher = new NewResourceWatcher();
|
||||
parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));
|
||||
|
||||
//Parse input with default params, except for our ParseContext
|
||||
parser.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext);
|
||||
|
||||
return watcher.getSnapshot();
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal Tika class that is invoked upon encountering an embedded
|
||||
* resource.
|
||||
*/
|
||||
static class EmbeddedAttachmentHandler implements EmbeddedDocumentExtractor {
|
||||
|
||||
private final Path outputDirectory;
|
||||
private final NewResourceWatcher watcher;
|
||||
private final Long parentID;
|
||||
private Integer attachmentCount;
|
||||
|
||||
public EmbeddedAttachmentHandler(Path outputDirectory, long parentID, NewResourceWatcher watcher) {
|
||||
this.outputDirectory = outputDirectory;
|
||||
this.watcher = watcher;
|
||||
this.parentID = parentID;
|
||||
attachmentCount = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean shouldParseEmbedded(Metadata mtdt) {
|
||||
//Grab every available attachment
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, boolean bln) throws SAXException, IOException {
|
||||
//Resource naming scheme is used internally in autopsy, therefore we can guarentee uniqueness.
|
||||
String uniqueExtractedName = parentID + "_attch_" + attachmentCount++; //NON-NLS
|
||||
|
||||
String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
|
||||
String ext = FilenameUtils.getExtension(name);
|
||||
|
||||
//Append the extension if we can.
|
||||
if(ext == null) {
|
||||
name = uniqueExtractedName;
|
||||
} else if(!ext.isEmpty()) {
|
||||
uniqueExtractedName += "." + ext;
|
||||
}
|
||||
|
||||
Path outputFile = outputDirectory.resolve(uniqueExtractedName);
|
||||
|
||||
try (EncodedFileOutputStream outputStream = new EncodedFileOutputStream(
|
||||
new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)){
|
||||
IOUtils.copy(in, outputStream);
|
||||
watcher.notify(name, outputFile);
|
||||
} catch (IOException ex) {
|
||||
logger.log(Level.WARNING, String.format("Could not extract attachment %s into directory %s", //NON-NLS
|
||||
uniqueExtractedName, outputFile), ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenient wrapper for keeping track of new resource paths and the display
|
||||
* name for each of these resources.
|
||||
*
|
||||
* It is necessary to maintain a snapshot of only our changes when the
|
||||
* output directory is shared among other processes/threads.
|
||||
*/
|
||||
static class NewResourceWatcher {
|
||||
|
||||
private final Map<String, Path> newResourcePaths;
|
||||
|
||||
public NewResourceWatcher() {
|
||||
newResourcePaths = new HashMap<>();
|
||||
}
|
||||
|
||||
public void notify(String name, Path newResource) {
|
||||
newResourcePaths.put(name, newResource);
|
||||
}
|
||||
|
||||
public Map<String, Path> getSnapshot() {
|
||||
return newResourcePaths;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Static convenience methods that ensure the PDF extractor is being invoked
|
||||
* correctly.
|
||||
*/
|
||||
static class ExtractionPreconditions {
|
||||
|
||||
public static void checkArgument(boolean expression, String msg) throws IOException {
|
||||
if (!expression) {
|
||||
throw new IOException(msg);
|
||||
}
|
||||
}
|
||||
|
||||
private ExtractionPreconditions(){
|
||||
}
|
||||
}
|
||||
}
|
@ -46,6 +46,7 @@ import org.apache.tika.Tika;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.EmptyParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.ParsingReader;
|
||||
@ -125,6 +126,16 @@ final class TikaTextExtractor implements TextExtractor {
|
||||
"application/x-z", //NON-NLS
|
||||
"application/x-compress"); //NON-NLS
|
||||
|
||||
//Tika should ignore types with embedded files that can be handled by the unpacking modules
|
||||
private static final List<String> EMBEDDED_FILE_MIME_TYPES
|
||||
= ImmutableList.of("application/msword", //NON-NLS
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", //NON-NLS
|
||||
"application/vnd.ms-powerpoint", //NON-NLS
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation", //NON-NLS
|
||||
"application/vnd.ms-excel", //NON-NLS
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", //NON-NLS
|
||||
"application/pdf"); //NON-NLS
|
||||
|
||||
private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
|
||||
private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
|
||||
|
||||
@ -184,7 +195,14 @@ final class TikaTextExtractor implements TextExtractor {
|
||||
InputStream stream = null;
|
||||
|
||||
ParseContext parseContext = new ParseContext();
|
||||
parseContext.set(Parser.class, parser);
|
||||
|
||||
//Disable appending embedded file text to output for EFE supported types
|
||||
//JIRA-4975
|
||||
if(content instanceof AbstractFile && EMBEDDED_FILE_MIME_TYPES.contains(((AbstractFile)content).getMIMEType())) {
|
||||
parseContext.set(Parser.class, new EmptyParser());
|
||||
} else {
|
||||
parseContext.set(Parser.class, parser);
|
||||
}
|
||||
|
||||
if (ocrEnabled() && content instanceof AbstractFile) {
|
||||
AbstractFile file = ((AbstractFile) content);
|
||||
@ -516,11 +534,11 @@ final class TikaTextExtractor implements TextExtractor {
|
||||
if (context != null) {
|
||||
ImageConfig configInstance = context.lookup(ImageConfig.class);
|
||||
if (configInstance != null) {
|
||||
if(Objects.nonNull(configInstance.getOCREnabled())) {
|
||||
if (Objects.nonNull(configInstance.getOCREnabled())) {
|
||||
this.tesseractOCREnabled = configInstance.getOCREnabled();
|
||||
}
|
||||
|
||||
if(Objects.nonNull(configInstance.getOCRLanguages())) {
|
||||
|
||||
if (Objects.nonNull(configInstance.getOCRLanguages())) {
|
||||
this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user