mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-13 00:16:16 +00:00
Merge pull request #4758 from dannysmyda/4975-PDF-Attachment-Extractor
4975 pdf attachment extractor
This commit is contained in:
commit
0cff372c7d
@ -22,8 +22,10 @@ import java.io.File;
|
|||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@ -33,13 +35,11 @@ import org.apache.commons.io.IOUtils;
|
|||||||
import org.apache.poi.hwpf.usermodel.Picture;
|
import org.apache.poi.hwpf.usermodel.Picture;
|
||||||
import org.apache.poi.hslf.usermodel.HSLFPictureData;
|
import org.apache.poi.hslf.usermodel.HSLFPictureData;
|
||||||
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
||||||
import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
|
|
||||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||||
import org.apache.poi.hwpf.HWPFDocument;
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
import org.apache.poi.hwpf.model.PicturesTable;
|
import org.apache.poi.hwpf.model.PicturesTable;
|
||||||
import org.apache.poi.sl.usermodel.PictureData.PictureType;
|
import org.apache.poi.sl.usermodel.PictureData.PictureType;
|
||||||
import org.apache.poi.ss.usermodel.Workbook;
|
import org.apache.poi.ss.usermodel.Workbook;
|
||||||
import org.apache.poi.util.RecordFormatException;
|
|
||||||
import org.apache.tika.config.TikaConfig;
|
import org.apache.tika.config.TikaConfig;
|
||||||
import org.apache.tika.detect.Detector;
|
import org.apache.tika.detect.Detector;
|
||||||
import org.apache.tika.exception.TikaException;
|
import org.apache.tika.exception.TikaException;
|
||||||
@ -72,13 +72,13 @@ import org.xml.sax.SAXException;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Extracts embedded content (e.g. images, audio, video) from Microsoft Office
|
* Extracts embedded content (e.g. images, audio, video) from Microsoft Office
|
||||||
* documents (both original and OOXML forms).
|
* documents (both original and OOXML forms) and PDF documents.
|
||||||
*/
|
*/
|
||||||
class MSOfficeEmbeddedContentExtractor {
|
class DocumentEmbeddedContentExtractor {
|
||||||
|
|
||||||
private final FileManager fileManager;
|
private final FileManager fileManager;
|
||||||
private final IngestServices services;
|
private final IngestServices services;
|
||||||
private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
|
private static final Logger LOGGER = Logger.getLogger(DocumentEmbeddedContentExtractor.class.getName());
|
||||||
private final IngestJobContext context;
|
private final IngestJobContext context;
|
||||||
private String parentFileName;
|
private String parentFileName;
|
||||||
private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
|
private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
|
||||||
@ -101,7 +101,8 @@ class MSOfficeEmbeddedContentExtractor {
|
|||||||
PPT("application/vnd.ms-powerpoint"), //NON-NLS
|
PPT("application/vnd.ms-powerpoint"), //NON-NLS
|
||||||
PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
|
PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
|
||||||
XLS("application/vnd.ms-excel"), //NON-NLS
|
XLS("application/vnd.ms-excel"), //NON-NLS
|
||||||
XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); //NON-NLS
|
XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), //NON-NLS
|
||||||
|
PDF("application/pdf"); //NON-NLS
|
||||||
|
|
||||||
private final String mimeType;
|
private final String mimeType;
|
||||||
|
|
||||||
@ -116,7 +117,7 @@ class MSOfficeEmbeddedContentExtractor {
|
|||||||
}
|
}
|
||||||
private SupportedExtractionFormats abstractFileExtractionFormat;
|
private SupportedExtractionFormats abstractFileExtractionFormat;
|
||||||
|
|
||||||
MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
|
DocumentEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
|
||||||
|
|
||||||
this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
|
this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
|
||||||
this.services = IngestServices.getInstance();
|
this.services = IngestServices.getInstance();
|
||||||
@ -190,6 +191,9 @@ class MSOfficeEmbeddedContentExtractor {
|
|||||||
case XLS:
|
case XLS:
|
||||||
listOfExtractedImages = extractImagesFromXls(abstractFile);
|
listOfExtractedImages = extractImagesFromXls(abstractFile);
|
||||||
break;
|
break;
|
||||||
|
case PDF:
|
||||||
|
listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -470,6 +474,38 @@ class MSOfficeEmbeddedContentExtractor {
|
|||||||
return listOfExtractedImages;
|
return listOfExtractedImages;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts embedded attachments from PDF files.
|
||||||
|
*
|
||||||
|
* @param abstractFile Input PDF file
|
||||||
|
* @return List of extracted files to be made into derived file instances.
|
||||||
|
*/
|
||||||
|
private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
|
||||||
|
PDFAttachmentExtractor pdfExtractor = new PDFAttachmentExtractor(parser);
|
||||||
|
try {
|
||||||
|
Path outputDirectory = Paths.get(getOutputFolderPath(parentFileName));
|
||||||
|
//Get map of attachment name -> location disk.
|
||||||
|
Map<String, Path> extractedAttachments = pdfExtractor.extract(
|
||||||
|
new ReadContentInputStream(abstractFile), abstractFile.getId(),
|
||||||
|
outputDirectory);
|
||||||
|
|
||||||
|
//Convert output to hook into the existing logic for creating derived files
|
||||||
|
List<ExtractedFile> extractedFiles = new ArrayList<>();
|
||||||
|
extractedAttachments.entrySet().forEach((pathEntry) -> {
|
||||||
|
String fileName = pathEntry.getKey();
|
||||||
|
Path writeLocation = pathEntry.getValue();
|
||||||
|
extractedFiles.add(new ExtractedFile(fileName,
|
||||||
|
getFileRelativePath(writeLocation.getFileName().toString()),
|
||||||
|
writeLocation.toFile().length()));
|
||||||
|
});
|
||||||
|
|
||||||
|
return extractedFiles;
|
||||||
|
} catch (IOException | SAXException | TikaException ex) {
|
||||||
|
LOGGER.log(Level.WARNING, "Error attempting to extract attachments from PDFs", ex); //NON-NLS
|
||||||
|
}
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Writes image to the module output location.
|
* Writes image to the module output location.
|
@ -50,7 +50,7 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
|
|||||||
//Outer concurrent hashmap with keys of JobID, inner concurrentHashmap with keys of objectID
|
//Outer concurrent hashmap with keys of JobID, inner concurrentHashmap with keys of objectID
|
||||||
private static final ConcurrentHashMap<Long, ConcurrentHashMap<Long, Archive>> mapOfDepthTrees = new ConcurrentHashMap<>();
|
private static final ConcurrentHashMap<Long, ConcurrentHashMap<Long, Archive>> mapOfDepthTrees = new ConcurrentHashMap<>();
|
||||||
private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
|
private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
|
||||||
private MSOfficeEmbeddedContentExtractor officeExtractor;
|
private DocumentEmbeddedContentExtractor documentExtractor;
|
||||||
private SevenZipExtractor archiveExtractor;
|
private SevenZipExtractor archiveExtractor;
|
||||||
private FileTypeDetector fileTypeDetector;
|
private FileTypeDetector fileTypeDetector;
|
||||||
private long jobId;
|
private long jobId;
|
||||||
@ -115,10 +115,10 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
|
|||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
* Construct an embedded content extractor for processing Microsoft
|
* Construct an embedded content extractor for processing Microsoft
|
||||||
* Office documents.
|
* Office documents and PDF documents.
|
||||||
*/
|
*/
|
||||||
try {
|
try {
|
||||||
this.officeExtractor = new MSOfficeEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
|
this.documentExtractor = new DocumentEmbeddedContentExtractor(context, fileTypeDetector, moduleDirRelative, moduleDirAbsolute);
|
||||||
} catch (NoCurrentCaseException ex) {
|
} catch (NoCurrentCaseException ex) {
|
||||||
throw new IngestModuleException(Bundle.EmbeddedFileExtractorIngestModule_UnableToGetMSOfficeExtractor_errMsg(), ex);
|
throw new IngestModuleException(Bundle.EmbeddedFileExtractorIngestModule_UnableToGetMSOfficeExtractor_errMsg(), ex);
|
||||||
}
|
}
|
||||||
@ -155,8 +155,8 @@ public final class EmbeddedFileExtractorIngestModule extends FileIngestModuleAda
|
|||||||
*/
|
*/
|
||||||
if (archiveExtractor.isSevenZipExtractionSupported(abstractFile)) {
|
if (archiveExtractor.isSevenZipExtractionSupported(abstractFile)) {
|
||||||
archiveExtractor.unpack(abstractFile, mapOfDepthTrees.get(jobId));
|
archiveExtractor.unpack(abstractFile, mapOfDepthTrees.get(jobId));
|
||||||
} else if (officeExtractor.isContentExtractionSupported(abstractFile)) {
|
} else if (documentExtractor.isContentExtractionSupported(abstractFile)) {
|
||||||
officeExtractor.extractEmbeddedContent(abstractFile);
|
documentExtractor.extractEmbeddedContent(abstractFile);
|
||||||
}
|
}
|
||||||
return ProcessResult.OK;
|
return ProcessResult.OK;
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,183 @@
|
|||||||
|
/*
|
||||||
|
* Autopsy Forensic Browser
|
||||||
|
*
|
||||||
|
* Copyright 2019 Basis Technology Corp.
|
||||||
|
* Contact: carrier <at> sleuthkit <dot> org
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.sleuthkit.autopsy.modules.embeddedfileextractor;
|
||||||
|
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.logging.Level;
|
||||||
|
import org.apache.commons.io.FilenameUtils;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.tika.exception.TikaException;
|
||||||
|
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
|
||||||
|
import org.apache.tika.metadata.Metadata;
|
||||||
|
import org.apache.tika.parser.AutoDetectParser;
|
||||||
|
import org.apache.tika.parser.ParseContext;
|
||||||
|
import org.apache.tika.parser.Parser;
|
||||||
|
import org.apache.tika.sax.BodyContentHandler;
|
||||||
|
import org.xml.sax.ContentHandler;
|
||||||
|
import org.xml.sax.SAXException;
|
||||||
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
|
import org.sleuthkit.datamodel.EncodedFileOutputStream;
|
||||||
|
import org.sleuthkit.datamodel.TskData;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Facility for extracting and storing attachments from PDF documents.
|
||||||
|
* Implementation specifics, however, are generic enough to be used on any
|
||||||
|
* document with embedded resources. The current name reflects the only known
|
||||||
|
* use case for this class.
|
||||||
|
*/
|
||||||
|
final class PDFAttachmentExtractor {
|
||||||
|
|
||||||
|
private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
|
||||||
|
private final AutoDetectParser parser;
|
||||||
|
|
||||||
|
public PDFAttachmentExtractor() {
|
||||||
|
parser = new AutoDetectParser();
|
||||||
|
}
|
||||||
|
|
||||||
|
public PDFAttachmentExtractor(AutoDetectParser parser) {
|
||||||
|
this.parser = parser;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts PDF attachments from a given input and writes them to the supplied
|
||||||
|
* output directory.
|
||||||
|
*
|
||||||
|
* @param input Input PDF to extract attachments from
|
||||||
|
* @param parentID ID for unique extraction names
|
||||||
|
* @param outputDir Directory to write attachments
|
||||||
|
* @return Map containing file name -> location on disk
|
||||||
|
* @throws IOException
|
||||||
|
* @throws SAXException
|
||||||
|
* @throws TikaException
|
||||||
|
*/
|
||||||
|
public Map<String, Path> extract(InputStream input, long parentID, Path outputDir) throws IOException, SAXException, TikaException {
|
||||||
|
ExtractionPreconditions.checkArgument(Files.exists(outputDir),
|
||||||
|
String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS
|
||||||
|
|
||||||
|
ParseContext parseContext = new ParseContext();
|
||||||
|
parseContext.set(Parser.class, parser);
|
||||||
|
|
||||||
|
//Keep track of the attachment files as they are being extracted and written to disk.
|
||||||
|
NewResourceWatcher watcher = new NewResourceWatcher();
|
||||||
|
parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));
|
||||||
|
|
||||||
|
//Parse input with default params, except for our ParseContext
|
||||||
|
parser.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext);
|
||||||
|
|
||||||
|
return watcher.getSnapshot();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal Tika class that is invoked upon encountering an embedded
|
||||||
|
* resource.
|
||||||
|
*/
|
||||||
|
static class EmbeddedAttachmentHandler implements EmbeddedDocumentExtractor {
|
||||||
|
|
||||||
|
private final Path outputDirectory;
|
||||||
|
private final NewResourceWatcher watcher;
|
||||||
|
private final Long parentID;
|
||||||
|
private Integer attachmentCount;
|
||||||
|
|
||||||
|
public EmbeddedAttachmentHandler(Path outputDirectory, long parentID, NewResourceWatcher watcher) {
|
||||||
|
this.outputDirectory = outputDirectory;
|
||||||
|
this.watcher = watcher;
|
||||||
|
this.parentID = parentID;
|
||||||
|
attachmentCount = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean shouldParseEmbedded(Metadata mtdt) {
|
||||||
|
//Grab every available attachment
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, boolean bln) throws SAXException, IOException {
|
||||||
|
//Resource naming scheme is used internally in autopsy, therefore we can guarentee uniqueness.
|
||||||
|
String uniqueExtractedName = parentID + "_attch_" + attachmentCount++; //NON-NLS
|
||||||
|
|
||||||
|
String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
|
||||||
|
String ext = FilenameUtils.getExtension(name);
|
||||||
|
|
||||||
|
//Append the extension if we can.
|
||||||
|
if(ext == null) {
|
||||||
|
name = uniqueExtractedName;
|
||||||
|
} else if(!ext.isEmpty()) {
|
||||||
|
uniqueExtractedName += "." + ext;
|
||||||
|
}
|
||||||
|
|
||||||
|
Path outputFile = outputDirectory.resolve(uniqueExtractedName);
|
||||||
|
|
||||||
|
try (EncodedFileOutputStream outputStream = new EncodedFileOutputStream(
|
||||||
|
new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)){
|
||||||
|
IOUtils.copy(in, outputStream);
|
||||||
|
watcher.notify(name, outputFile);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
logger.log(Level.WARNING, String.format("Could not extract attachment %s into directory %s", //NON-NLS
|
||||||
|
uniqueExtractedName, outputFile), ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convenient wrapper for keeping track of new resource paths and the display
|
||||||
|
* name for each of these resources.
|
||||||
|
*
|
||||||
|
* It is necessary to maintain a snapshot of only our changes when the
|
||||||
|
* output directory is shared among other processes/threads.
|
||||||
|
*/
|
||||||
|
static class NewResourceWatcher {
|
||||||
|
|
||||||
|
private final Map<String, Path> newResourcePaths;
|
||||||
|
|
||||||
|
public NewResourceWatcher() {
|
||||||
|
newResourcePaths = new HashMap<>();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void notify(String name, Path newResource) {
|
||||||
|
newResourcePaths.put(name, newResource);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, Path> getSnapshot() {
|
||||||
|
return newResourcePaths;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Static convenience methods that ensure the PDF extractor is being invoked
|
||||||
|
* correctly.
|
||||||
|
*/
|
||||||
|
static class ExtractionPreconditions {
|
||||||
|
|
||||||
|
public static void checkArgument(boolean expression, String msg) throws IOException {
|
||||||
|
if (!expression) {
|
||||||
|
throw new IOException(msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private ExtractionPreconditions(){
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -46,6 +46,7 @@ import org.apache.tika.Tika;
|
|||||||
import org.apache.tika.exception.TikaException;
|
import org.apache.tika.exception.TikaException;
|
||||||
import org.apache.tika.metadata.Metadata;
|
import org.apache.tika.metadata.Metadata;
|
||||||
import org.apache.tika.parser.AutoDetectParser;
|
import org.apache.tika.parser.AutoDetectParser;
|
||||||
|
import org.apache.tika.parser.EmptyParser;
|
||||||
import org.apache.tika.parser.ParseContext;
|
import org.apache.tika.parser.ParseContext;
|
||||||
import org.apache.tika.parser.Parser;
|
import org.apache.tika.parser.Parser;
|
||||||
import org.apache.tika.parser.ParsingReader;
|
import org.apache.tika.parser.ParsingReader;
|
||||||
@ -125,6 +126,16 @@ final class TikaTextExtractor implements TextExtractor {
|
|||||||
"application/x-z", //NON-NLS
|
"application/x-z", //NON-NLS
|
||||||
"application/x-compress"); //NON-NLS
|
"application/x-compress"); //NON-NLS
|
||||||
|
|
||||||
|
//Tika should ignore types with embedded files that can be handled by the unpacking modules
|
||||||
|
private static final List<String> EMBEDDED_FILE_MIME_TYPES
|
||||||
|
= ImmutableList.of("application/msword", //NON-NLS
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", //NON-NLS
|
||||||
|
"application/vnd.ms-powerpoint", //NON-NLS
|
||||||
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation", //NON-NLS
|
||||||
|
"application/vnd.ms-excel", //NON-NLS
|
||||||
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", //NON-NLS
|
||||||
|
"application/pdf"); //NON-NLS
|
||||||
|
|
||||||
private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
|
private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
|
||||||
private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
|
private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
|
||||||
|
|
||||||
@ -184,7 +195,14 @@ final class TikaTextExtractor implements TextExtractor {
|
|||||||
InputStream stream = null;
|
InputStream stream = null;
|
||||||
|
|
||||||
ParseContext parseContext = new ParseContext();
|
ParseContext parseContext = new ParseContext();
|
||||||
parseContext.set(Parser.class, parser);
|
|
||||||
|
//Disable appending embedded file text to output for EFE supported types
|
||||||
|
//JIRA-4975
|
||||||
|
if(content instanceof AbstractFile && EMBEDDED_FILE_MIME_TYPES.contains(((AbstractFile)content).getMIMEType())) {
|
||||||
|
parseContext.set(Parser.class, new EmptyParser());
|
||||||
|
} else {
|
||||||
|
parseContext.set(Parser.class, parser);
|
||||||
|
}
|
||||||
|
|
||||||
if (ocrEnabled() && content instanceof AbstractFile) {
|
if (ocrEnabled() && content instanceof AbstractFile) {
|
||||||
AbstractFile file = ((AbstractFile) content);
|
AbstractFile file = ((AbstractFile) content);
|
||||||
@ -516,11 +534,11 @@ final class TikaTextExtractor implements TextExtractor {
|
|||||||
if (context != null) {
|
if (context != null) {
|
||||||
ImageConfig configInstance = context.lookup(ImageConfig.class);
|
ImageConfig configInstance = context.lookup(ImageConfig.class);
|
||||||
if (configInstance != null) {
|
if (configInstance != null) {
|
||||||
if(Objects.nonNull(configInstance.getOCREnabled())) {
|
if (Objects.nonNull(configInstance.getOCREnabled())) {
|
||||||
this.tesseractOCREnabled = configInstance.getOCREnabled();
|
this.tesseractOCREnabled = configInstance.getOCREnabled();
|
||||||
}
|
}
|
||||||
|
|
||||||
if(Objects.nonNull(configInstance.getOCRLanguages())) {
|
if (Objects.nonNull(configInstance.getOCRLanguages())) {
|
||||||
this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
|
this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user