Revert "Extract inline images from PDF"

This commit is contained in:
Ann Priestman 2020-10-09 15:36:54 -04:00 committed by GitHub
parent 4b3e1ca271
commit 0692373bde
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -34,7 +34,6 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -77,17 +76,9 @@ final class PDFAttachmentExtractor {
ExtractionPreconditions.checkArgument(Files.exists(outputDir),
String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS
ParseContext parseContext = new ParseContext();
parseContext.set(Parser.class, parser);
PDFParserConfig pdfConfig = new PDFParserConfig();
pdfConfig.setExtractInlineImages(true);
pdfConfig.setExtractUniqueInlineImagesOnly(true);
parseContext.set(PDFParserConfig.class, pdfConfig);
//Keep track of the attachment files as they are being extracted and written to disk.
NewResourceWatcher watcher = new NewResourceWatcher();
parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));