Added support for extracting PDF attachments in the EFE module and disabled embedded content extraction for Tika so that we do not duplicate Solr text for documents supported by EFE

This commit is contained in:
U-BASIS\dsmyda 2019-04-30 12:14:47 -04:00
parent 237ea66025
commit 8a846b4937
4 changed files with 231 additions and 13 deletions

View File

@ -22,8 +22,10 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -33,13 +35,11 @@ import org.apache.commons.io.IOUtils;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hslf.usermodel.HSLFPictureData;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.sl.usermodel.PictureData.PictureType;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.util.RecordFormatException;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
@ -72,13 +72,13 @@ import org.xml.sax.SAXException;
/**
* Extracts embedded content (e.g. images, audio, video) from Microsoft Office
* documents (both original and OOXML forms).
* documents (both original and OOXML forms) and PDF documents.
*/
class MSOfficeEmbeddedContentExtractor {
class DocumentEmbeddedContentExtractor {
private final FileManager fileManager;
private final IngestServices services;
private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
private static final Logger LOGGER = Logger.getLogger(EmbeddedDocumentExtractor.class.getName());
private final IngestJobContext context;
private String parentFileName;
private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
@ -101,7 +101,8 @@ class MSOfficeEmbeddedContentExtractor {
PPT("application/vnd.ms-powerpoint"), //NON-NLS
PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
XLS("application/vnd.ms-excel"), //NON-NLS
XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); //NON-NLS
XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), //NON-NLS
PDF("application/pdf"); //NON-NLS
private final String mimeType;
@ -116,7 +117,7 @@ class MSOfficeEmbeddedContentExtractor {
}
private SupportedExtractionFormats abstractFileExtractionFormat;
MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
DocumentEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
this.services = IngestServices.getInstance();
@ -190,6 +191,9 @@ class MSOfficeEmbeddedContentExtractor {
case XLS:
listOfExtractedImages = extractImagesFromXls(abstractFile);
break;
case PDF:
listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
break;
default:
break;
}
@ -470,6 +474,37 @@ class MSOfficeEmbeddedContentExtractor {
return listOfExtractedImages;
}
/**
 * Extracts the attachments embedded in a PDF file and describes each one as
 * an ExtractedFile so the existing derived-file logic can consume them.
 *
 * The attachments are written by PDFAttachmentExtractor into the output
 * folder obtained for the parent file. The size recorded for each entry is
 * the length of the file as it exists on disk after extraction.
 *
 * @param abstractFile The PDF file to pull attachments from.
 *
 * @return One ExtractedFile per attachment reported by the extractor, or an
 *         empty list if extraction failed with an IOException, SAXException
 *         or TikaException (the failure is logged as a warning).
 */
private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
    PDFAttachmentExtractor attachmentExtractor = new PDFAttachmentExtractor(parser);

    //Attachment name -> location on disk.
    Map<String, Path> attachmentLocations;
    try {
        Path outputDirectory = Paths.get(getOutputFolderPath(parentFileName));
        attachmentLocations = attachmentExtractor.extract(
                new ReadContentInputStream(abstractFile), abstractFile.getId(),
                outputDirectory);
    } catch (IOException | SAXException | TikaException ex) {
        LOGGER.log(Level.WARNING, "Error attempting to extract attachments from PDFs", ex); //NON-NLS
        return Collections.emptyList();
    }

    //Convert output to hook into the existing logic for creating derived files
    List<ExtractedFile> derivedFileCandidates = new ArrayList<>();
    for (Map.Entry<String, Path> attachment : attachmentLocations.entrySet()) {
        Path locationOnDisk = attachment.getValue();
        String relativePath = getFileRelativePath(locationOnDisk.getFileName().toString());
        long sizeOnDisk = locationOnDisk.toFile().length();
        derivedFileCandidates.add(new ExtractedFile(attachment.getKey(), relativePath, sizeOnDisk));
    }
    return derivedFileCandidates;
}
/**
* Writes image to the module output location.

View File

@ -50,7 +50,7 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
//Outer concurrent hashmap with keys of JobID, inner concurrentHashmap with keys of objectID
private static final ConcurrentHashMap<Long, ConcurrentHashMap<Long, Archive>> mapOfDepthTrees = new ConcurrentHashMap<>();
private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
private MSOfficeEmbeddedContentExtractor officeExtractor;
private DocumentEmbeddedContentExtractor documentExtractor;
private SevenZipExtractor archiveExtractor;
private FileTypeDetector fileTypeDetector;
private long jobId;
@ -115,10 +115,10 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
}
/*
* Construct an embedded content extractor for processing Microsoft
* Office documents.
* Office documents and PDF documents.
*/
try {
this.officeExtractor = new MSOfficeEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
this.documentExtractor = new DocumentEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
} catch (NoCurrentCaseException ex) {
throw new IngestModuleException(Bundle.EmbeddedFileExtractorIngestModule_UnableToGetMSOfficeExtractor_errMsg(), ex);
}
@ -155,8 +155,8 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
*/
if (archiveExtractor.isSevenZipExtractionSupported(abstractFile)) {
archiveExtractor.unpack(abstractFile, mapOfDepthTrees.get(jobId));
} else if (officeExtractor.isContentExtractionSupported(abstractFile)) {
officeExtractor.extractEmbeddedContent(abstractFile);
} else if (documentExtractor.isContentExtractionSupported(abstractFile)) {
documentExtractor.extractEmbeddedContent(abstractFile);
}
return ProcessResult.OK;
}

View File

@ -0,0 +1,179 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2019 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.modules.embeddedfileextractor;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.EncodedFileOutputStream;
import org.sleuthkit.datamodel.TskData;
/**
* Facility for extracting and storing attachments from PDF documents.
* Implementation specifics, however, are generic enough to be used on any
* document with embedded resources. The current name reflects the only known
* use case for this class.
*/
final class PDFAttachmentExtractor {

    private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
    private final AutoDetectParser parser;

    /**
     * Creates an extractor backed by a default AutoDetectParser.
     */
    public PDFAttachmentExtractor() {
        parser = new AutoDetectParser();
    }

    /**
     * Creates an extractor that parses with the supplied parser.
     *
     * @param parser The parser used both for the document itself and, via
     *               the parse context, for its embedded resources.
     */
    public PDFAttachmentExtractor(AutoDetectParser parser) {
        this.parser = parser;
    }

    /**
     * Parses the input and writes every embedded resource encountered to the
     * output directory.
     *
     * @param input     Stream over the document to parse. It is not closed
     *                  by this method.
     * @param parentID  Identifier of the parent document; it is used as the
     *                  prefix of every file name written to disk.
     * @param outputDir Existing directory that receives the attachments.
     *
     * @return Map of attachment display name to the path it was written to.
     *         Attachments that could not be written are logged and omitted.
     *
     * @throws IOException   If the output directory does not exist, or if
     *                       the parser reports an IOException.
     * @throws SAXException  If thrown by the parser.
     * @throws TikaException If thrown by the parser.
     */
    public Map<String, Path> extract(InputStream input, long parentID, Path outputDir) throws IOException, SAXException, TikaException {
        ExtractionPreconditions.checkArgument(Files.exists(outputDir),
                String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS

        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);

        //Keep track of the attachment files as they are being extracted and written to disk.
        NewResourceWatcher watcher = new NewResourceWatcher();
        parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));

        //Parse input with default params, except for our ParseContext
        parser.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext);

        return watcher.getSnapshot();
    }

    /**
     * Internal Tika class that is invoked upon encountering an embedded
     * resource. Each resource is written to the output directory and reported
     * to the watcher.
     */
    static class EmbeddedAttachmentHandler implements EmbeddedDocumentExtractor {

        private final Path outputDirectory;
        private final NewResourceWatcher watcher;
        private final long parentID;
        private int attachmentCount;

        /**
         * @param outputDirectory Directory the attachments are written into.
         * @param parentID        Identifier used as the file name prefix.
         * @param watcher         Receives the name and path of every
         *                        attachment that was written successfully.
         */
        public EmbeddedAttachmentHandler(Path outputDirectory, long parentID, NewResourceWatcher watcher) {
            this.outputDirectory = outputDirectory;
            this.watcher = watcher;
            this.parentID = parentID;
            attachmentCount = 0;
        }

        @Override
        public boolean shouldParseEmbedded(Metadata mtdt) {
            //Grab every available attachment
            return true;
        }

        /**
         * Writes one embedded resource to disk, XOR1 encoded, under the name
         * "&lt;parentID&gt;_attch_&lt;n&gt;" plus the resource's extension when
         * it has one. A failure to write is logged and does not abort the
         * parse of the parent document.
         *
         * @param in   Stream over the embedded resource.
         * @param ch   Content handler supplied by Tika; not used here.
         * @param mtdt Metadata of the embedded resource; only the resource
         *             name is read.
         * @param bln  Flag supplied by Tika; not used here.
         */
        @Override
        public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, boolean bln) throws SAXException, IOException {
            //The on-disk name is built from the parent ID and a per-parent counter,
            //so it does not depend on the (untrusted) resource name.
            String uniqueExtractedName = parentID + "_attch_" + attachmentCount++; //NON-NLS

            String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
            String ext = FilenameUtils.getExtension(name);

            //Append the extension if we can.
            if (ext != null && !ext.isEmpty()) {
                uniqueExtractedName += "." + ext;
            }

            //Fall back to the generated name when the resource has no usable
            //display name, so the attachment is never reported nameless.
            if (name == null || name.isEmpty()) {
                name = uniqueExtractedName;
            }

            Path outputFile = outputDirectory.resolve(uniqueExtractedName);

            //The file stream is its own resource so that it is still closed
            //if constructing the encoding stream fails.
            try (FileOutputStream fileStream = new FileOutputStream(outputFile.toFile());
                    EncodedFileOutputStream outputStream = new EncodedFileOutputStream(
                            fileStream, TskData.EncodingType.XOR1)) {
                IOUtils.copy(in, outputStream);
                watcher.notify(name, outputFile);
            } catch (IOException ex) {
                logger.log(Level.WARNING, String.format("Could not extract attachment %s into directory %s", //NON-NLS
                        uniqueExtractedName, outputDirectory), ex);
            }
        }
    }

    /**
     * Convenient wrapper for keeping track of new resource paths and the
     * display name for each of these resources.
     *
     * It is necessary to maintain a snapshot of only our changes when the
     * output directory is shared among other processes/threads.
     *
     * NOTE(review): entries are keyed by display name, so two attachments
     * that share a name leave only the most recent path in the map; the
     * earlier file stays on disk but is not reported. Confirm whether that
     * is acceptable for callers.
     */
    static class NewResourceWatcher {

        private final Map<String, Path> newResourcePaths;

        public NewResourceWatcher() {
            newResourcePaths = new HashMap<>();
        }

        /**
         * Records that a resource with the given display name was written.
         *
         * @param name        Display name of the resource.
         * @param newResource Location the resource was written to.
         */
        public void notify(String name, Path newResource) {
            newResourcePaths.put(name, newResource);
        }

        /**
         * @return A copy of the name-to-path entries recorded so far; later
         *         notifications do not alter the returned map.
         */
        public Map<String, Path> getSnapshot() {
            return new HashMap<>(newResourcePaths);
        }
    }

    /**
     * Static convenience methods that ensure the PDF extractor is being
     * invoked correctly.
     */
    static class ExtractionPreconditions {

        private ExtractionPreconditions() {
            //Static utility holder, not meant to be instantiated.
        }

        /**
         * Throws if the given expression does not hold.
         *
         * @param expression Condition that must be true.
         * @param msg        Message of the exception thrown otherwise.
         *
         * @throws IOException If expression is false.
         */
        public static void checkArgument(boolean expression, String msg) throws IOException {
            if (!expression) {
                throw new IOException(msg);
            }
        }
    }
}

View File

@ -44,6 +44,7 @@ import java.util.stream.Stream;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParsingReader;
@ -177,7 +178,10 @@ final class TikaTextExtractor implements TextExtractor {
InputStream stream = null;
ParseContext parseContext = new ParseContext();
parseContext.set(Parser.class, parser);
//Disable appending embedded file text to output
//JIRA-4975
parseContext.set(Parser.class, new EmptyParser());
if (ocrEnabled() && content instanceof AbstractFile) {
AbstractFile file = ((AbstractFile) content);