Revert "Optimization and Bug fix - Make Tika use AbstractFile mimetype instead of recomputing"

This commit is contained in:
Richard Cordovano 2019-02-13 15:43:04 -05:00 committed by GitHub
parent 417659bb84
commit 3c3d92f03c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -50,7 +50,6 @@ import org.apache.tika.parser.ParsingReader;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.mime.MediaType;
import org.openide.util.NbBundle;
import org.openide.modules.InstalledFileLocator;
import org.openide.util.Lookup;
@ -126,7 +125,7 @@ final class TikaTextExtractor implements TextExtractor {
private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
private final AutoDetectParser parser;
private final AutoDetectParser parser = new AutoDetectParser();
private final Content content;
private boolean tesseractOCREnabled;
@ -146,23 +145,12 @@ final class TikaTextExtractor implements TextExtractor {
public TikaTextExtractor(Content content) {
this.content = content;
parser = new AutoDetectParser();
if (content instanceof AbstractFile) {
AbstractFile file = (AbstractFile) content;
if (file.getMIMEType() != null && !file.getMIMEType().isEmpty()) {
//Force Tika to use our pre-computed mime type during detection
parser.setDetector((InputStream inStream, Metadata metaData)
-> MediaType.parse(file.getMIMEType()));
}
}
}
/**
* If Tesseract has been installed and is set to be used through
* configuration, then ocr is enabled. OCR can only currently be run on 64
* bit Windows OS.
* configuration, then ocr is enabled. OCR can only currently be run on
* 64 bit Windows OS.
*
* @return Flag indicating if OCR is set to be used.
*/