Added support for extracting PDF attachments in the EFE module and disabled embedded content extraction for Tika so that we do not duplicate Solr text for documents supported by EFE
This commit is contained in:
parent 237ea66025
commit 8a846b4937
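
At a high level, the diff below registers a custom Tika EmbeddedDocumentExtractor on a ParseContext, so every attachment Tika encounters while parsing a PDF is handed back to Autopsy and written to disk. The following standalone sketch (not part of the commit) illustrates just that Tika hook; the class name, command-line arguments, and the attachment naming scheme are illustrative assumptions.

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/** Standalone illustration of the Tika hook the new PDFAttachmentExtractor relies on. */
public class AttachmentHookSketch {

    public static void main(String[] args) throws IOException, SAXException, TikaException {
        Path pdf = Paths.get(args[0]);       // hypothetical input PDF
        Path outputDir = Paths.get(args[1]); // hypothetical output directory
        Files.createDirectories(outputDir);

        AutoDetectParser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        context.set(Parser.class, parser);

        // The hook: Tika calls this object for every embedded resource it finds.
        context.set(EmbeddedDocumentExtractor.class, new EmbeddedDocumentExtractor() {
            private int count = 0;

            @Override
            public boolean shouldParseEmbedded(Metadata metadata) {
                return true; // grab every attachment
            }

            @Override
            public void parseEmbedded(InputStream stream, ContentHandler handler,
                    Metadata metadata, boolean outputHtml) throws IOException {
                // Write each attachment out under a unique name (illustrative scheme).
                String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
                Path target = outputDir.resolve("attachment_" + (count++)
                        + (name == null ? "" : "_" + Paths.get(name).getFileName()));
                Files.copy(stream, target);
            }
        });

        try (InputStream in = Files.newInputStream(pdf)) {
            // -1 disables BodyContentHandler's default write limit.
            parser.parse(in, new BodyContentHandler(-1), new Metadata(), context);
        }
    }
}

The commit's PDFAttachmentExtractor (added further down) follows the same pattern, but writes each attachment through an XOR1-encoded stream and reports the results back to the ingest module so derived files can be created.
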
@@ -22,8 +22,10 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -33,13 +35,11 @@ import org.apache.commons.io.IOUtils;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hslf.usermodel.HSLFPictureData;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.sl.usermodel.PictureData.PictureType;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.util.RecordFormatException;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
@@ -72,13 +72,13 @@ import org.xml.sax.SAXException;

/**
 * Extracts embedded content (e.g. images, audio, video) from Microsoft Office
 * documents (both original and OOXML forms).
 * documents (both original and OOXML forms) and PDF documents.
 */
class MSOfficeEmbeddedContentExtractor {
class DocumentEmbeddedContentExtractor {

    private final FileManager fileManager;
    private final IngestServices services;
    private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
    private static final Logger LOGGER = Logger.getLogger(EmbeddedDocumentExtractor.class.getName());
    private final IngestJobContext context;
    private String parentFileName;
    private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
@@ -101,7 +101,8 @@ class MSOfficeEmbeddedContentExtractor {
        PPT("application/vnd.ms-powerpoint"), //NON-NLS
        PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
        XLS("application/vnd.ms-excel"), //NON-NLS
        XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); //NON-NLS
        XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), //NON-NLS
        PDF("application/pdf"); //NON-NLS

        private final String mimeType;

@@ -116,7 +117,7 @@ class MSOfficeEmbeddedContentExtractor {
    }
    private SupportedExtractionFormats abstractFileExtractionFormat;

    MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
    DocumentEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {

        this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
        this.services = IngestServices.getInstance();
@@ -190,6 +191,9 @@ class MSOfficeEmbeddedContentExtractor {
            case XLS:
                listOfExtractedImages = extractImagesFromXls(abstractFile);
                break;
            case PDF:
                listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
                break;
            default:
                break;
        }
@@ -470,6 +474,37 @@ class MSOfficeEmbeddedContentExtractor {
        return listOfExtractedImages;

    }

    /**
     * Extracts embedded attachments from a PDF document and writes them to
     * the module output folder.
     *
     * @param abstractFile the PDF file to process
     * @return a list of extracted files, or an empty list if extraction fails
     */
    private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
        PDFAttachmentExtractor pdfExtractor = new PDFAttachmentExtractor(parser);
        try {
            Path outputDirectory = Paths.get(getOutputFolderPath(parentFileName));
            //Get map of attachment name -> location on disk.
            Map<String, Path> extractedAttachments = pdfExtractor.extract(
                    new ReadContentInputStream(abstractFile), abstractFile.getId(),
                    outputDirectory);

            //Convert output to hook into the existing logic for creating derived files
            List<ExtractedFile> extractedFiles = new ArrayList<>();
            extractedAttachments.entrySet().forEach((pathEntry) -> {
                String fileName = pathEntry.getKey();
                Path writeLocation = pathEntry.getValue();
                extractedFiles.add(new ExtractedFile(fileName,
                        getFileRelativePath(writeLocation.getFileName().toString()),
                        writeLocation.toFile().length()));
            });

            return extractedFiles;
        } catch (IOException | SAXException | TikaException ex) {
            LOGGER.log(Level.WARNING, "Error attempting to extract attachments from PDFs", ex); //NON-NLS
        }
        return Collections.emptyList();
    }

    /**
     * Writes image to the module output location.
@@ -50,7 +50,7 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
    //Outer concurrent hashmap with keys of JobID, inner concurrentHashmap with keys of objectID
    private static final ConcurrentHashMap<Long, ConcurrentHashMap<Long, Archive>> mapOfDepthTrees = new ConcurrentHashMap<>();
    private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
    private MSOfficeEmbeddedContentExtractor officeExtractor;
    private DocumentEmbeddedContentExtractor documentExtractor;
    private SevenZipExtractor archiveExtractor;
    private FileTypeDetector fileTypeDetector;
    private long jobId;
@@ -115,10 +115,10 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
        }
        /*
         * Construct an embedded content extractor for processing Microsoft
         * Office documents.
         * Office documents and PDF documents.
         */
        try {
            this.officeExtractor = new MSOfficeEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
            this.documentExtractor = new DocumentEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
        } catch (NoCurrentCaseException ex) {
            throw new IngestModuleException(Bundle.EmbeddedFileExtractorIngestModule_UnableToGetMSOfficeExtractor_errMsg(), ex);
        }
@@ -155,8 +155,8 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
         */
        if (archiveExtractor.isSevenZipExtractionSupported(abstractFile)) {
            archiveExtractor.unpack(abstractFile, mapOfDepthTrees.get(jobId));
        } else if (officeExtractor.isContentExtractionSupported(abstractFile)) {
            officeExtractor.extractEmbeddedContent(abstractFile);
        } else if (documentExtractor.isContentExtractionSupported(abstractFile)) {
            documentExtractor.extractEmbeddedContent(abstractFile);
        }
        return ProcessResult.OK;
    }
@@ -0,0 +1,179 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.modules.embeddedfileextractor;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.EncodedFileOutputStream;
import org.sleuthkit.datamodel.TskData;

/**
 * Facility for extracting and storing attachments from PDF documents.
 * Implementation specifics, however, are generic enough to be used on any
 * document with embedded resources. The current name reflects the only known
 * use case for this class.
 */
final class PDFAttachmentExtractor {

    private static Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
    private final AutoDetectParser parser;

    public PDFAttachmentExtractor() {
        parser = new AutoDetectParser();
    }

    public PDFAttachmentExtractor(AutoDetectParser parser) {
        this.parser = parser;
    }

    /**
     * The public endpoint: extracts every attachment embedded in the given
     * document stream and writes each one to the output directory.
     *
     * @param input     stream of the document to process
     * @param parentID  object ID of the parent file, used to build unique
     *                  names for the extracted attachments
     * @param outputDir directory the attachments are written to; must exist
     * @return a map of attachment display name to its location on disk
     * @throws IOException
     * @throws SAXException
     * @throws TikaException
     */
    public Map<String, Path> extract(InputStream input, long parentID, Path outputDir) throws IOException, SAXException, TikaException {
        ExtractionPreconditions.checkArgument(Files.exists(outputDir),
                String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS

        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);

        //Keep track of the attachment files as they are being extracted and written to disk.
        NewResourceWatcher watcher = new NewResourceWatcher();
        parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));

        //Parse input with default params, except for our ParseContext
        parser.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext);

        return watcher.getSnapshot();
    }

    /**
     * Internal handler class that Tika invokes upon encountering an embedded
     * resource.
     */
    static class EmbeddedAttachmentHandler implements EmbeddedDocumentExtractor {

        private final Path outputDirectory;
        private final NewResourceWatcher watcher;
        private final Long parentID;
        private Integer attachmentCount;

        public EmbeddedAttachmentHandler(Path outputDirectory, long parentID, NewResourceWatcher watcher) {
            this.outputDirectory = outputDirectory;
            this.watcher = watcher;
            this.parentID = parentID;
            attachmentCount = 0;
        }

        @Override
        public boolean shouldParseEmbedded(Metadata mtdt) {
            //Grab every available attachment
            return true;
        }

        @Override
        public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, boolean bln) throws SAXException, IOException {
            //Resource naming scheme is used internally in Autopsy, therefore we can guarantee uniqueness.
            String uniqueExtractedName = parentID + "_attch_" + attachmentCount++; //NON-NLS

            String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
            String ext = FilenameUtils.getExtension(name);

            //Append the extension if we can.
            if (ext == null) {
                name = uniqueExtractedName;
            } else if (!ext.isEmpty()) {
                uniqueExtractedName += "." + ext;
            }

            Path outputFile = outputDirectory.resolve(uniqueExtractedName);

            try (EncodedFileOutputStream outputStream = new EncodedFileOutputStream(
                    new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)) {
                IOUtils.copy(in, outputStream);
                watcher.notify(name, outputFile);
            } catch (IOException ex) {
                logger.log(Level.WARNING, String.format("Could not extract attachment %s into directory %s", //NON-NLS
                        uniqueExtractedName, outputFile), ex);
            }
        }
    }

    /**
     * Convenient wrapper for keeping track of new resource paths and the
     * display name for each of these resources.
     *
     * It is necessary to maintain a snapshot of only our changes when the
     * output directory is shared among other processes/threads.
     */
    static class NewResourceWatcher {

        private final Map<String, Path> newResourcePaths;

        public NewResourceWatcher() {
            newResourcePaths = new HashMap<>();
        }

        public void notify(String name, Path newResource) {
            newResourcePaths.put(name, newResource);
        }

        public Map<String, Path> getSnapshot() {
            return newResourcePaths;
        }
    }

    /**
     * Static convenience methods that ensure the PDF extractor is being
     * invoked correctly.
     */
    static class ExtractionPreconditions {

        public static void checkArgument(boolean expression, String msg) throws IOException {
            if (!expression) {
                throw new IOException(msg);
            }
        }
    }
}
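
For orientation, a minimal usage sketch of the new class (not part of the commit). It assumes a caller in the same package, since the class is package-private, an existing output directory, and a sample.pdf on disk; the real module instead feeds it a ReadContentInputStream and the parent file's object ID, as shown in the DocumentEmbeddedContentExtractor hunk above.

package org.sleuthkit.autopsy.modules.embeddedfileextractor;

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map;

/** Hypothetical caller illustrating how extract() is meant to be used. */
class PDFAttachmentExtractorUsageSketch {

    public static void main(String[] args) throws Exception {
        Path outputDir = Files.createDirectories(Paths.get("attachment-output")); // assumed location
        long parentObjectId = 42L; // stand-in for the parent AbstractFile's object ID

        PDFAttachmentExtractor extractor = new PDFAttachmentExtractor();
        try (InputStream in = Files.newInputStream(Paths.get("sample.pdf"))) { // assumed input
            // Returns attachment display name -> extracted location on disk.
            Map<String, Path> attachments = extractor.extract(in, parentObjectId, outputDir);
            attachments.forEach((name, path)
                    -> System.out.println(name + " -> " + path));
        }
    }
}

Note that the extracted bytes are written through EncodedFileOutputStream with XOR1 encoding, so the files on disk are not directly viewable outside Autopsy.
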
@@ -44,6 +44,7 @@ import java.util.stream.Stream;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParsingReader;
@@ -177,7 +178,10 @@ final class TikaTextExtractor implements TextExtractor {
        InputStream stream = null;

        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);

        //Disable appending embedded file text to output
        //JIRA-4975
        parseContext.set(Parser.class, new EmptyParser());

        if (ocrEnabled() && content instanceof AbstractFile) {
            AbstractFile file = ((AbstractFile) content);
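
The hunk above suppresses attachment text during keyword indexing by overriding the Parser delegate with EmptyParser: Tika's default embedded-document handling parses attachments with whatever Parser is registered on the ParseContext, so registering a parser that emits nothing keeps attachment text out of the extracted text. Below is a minimal standalone sketch of that effect (not from the codebase; it assumes Tika on the classpath and a document path passed as the first program argument).

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;

/** Demonstrates the effect of EmptyParser on embedded-document text. */
public class EmbeddedTextSuppressionSketch {

    public static void main(String[] args) throws Exception {
        Path doc = Paths.get(args[0]);
        System.out.println("with embedded text:   " + textLength(doc, false));
        System.out.println("outer document only:  " + textLength(doc, true));
    }

    private static int textLength(Path doc, boolean suppressEmbedded) throws Exception {
        AutoDetectParser parser = new AutoDetectParser();
        ParseContext parseContext = new ParseContext();

        // Behaviour before this commit: the auto-detect parser is reused for
        // embedded documents, so attachment text ends up in the output.
        parseContext.set(Parser.class, parser);

        if (suppressEmbedded) {
            // JIRA-4975: embedded documents are handed to a parser that emits
            // nothing, so only the outer document's text is extracted.
            parseContext.set(Parser.class, new EmptyParser());
        }

        BodyContentHandler handler = new BodyContentHandler(-1);
        try (InputStream in = Files.newInputStream(doc)) {
            parser.parse(in, handler, new Metadata(), parseContext);
        }
        return handler.toString().length();
    }
}

Because the EFE module now extracts those same attachments as derived files, their text is still indexed, once, through the derived files, rather than twice.
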