Revert "Optimization and Bug fix - Make Tika use AbstractFile mimetype instead of recomputing"

This commit is contained in:
Richard Cordovano 2019-02-13 15:43:04 -05:00 committed by GitHub
parent 417659bb84
commit 3c3d92f03c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -50,7 +50,6 @@ import org.apache.tika.parser.ParsingReader;
import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.mime.MediaType;
import org.openide.util.NbBundle; import org.openide.util.NbBundle;
import org.openide.modules.InstalledFileLocator; import org.openide.modules.InstalledFileLocator;
import org.openide.util.Lookup; import org.openide.util.Lookup;
@ -126,7 +125,7 @@ final class TikaTextExtractor implements TextExtractor {
private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory); private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
private static final String SQLITE_MIMETYPE = "application/x-sqlite3"; private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
private final AutoDetectParser parser; private final AutoDetectParser parser = new AutoDetectParser();
private final Content content; private final Content content;
private boolean tesseractOCREnabled; private boolean tesseractOCREnabled;
@ -146,23 +145,12 @@ final class TikaTextExtractor implements TextExtractor {
public TikaTextExtractor(Content content) { public TikaTextExtractor(Content content) {
this.content = content; this.content = content;
parser = new AutoDetectParser();
if (content instanceof AbstractFile) {
AbstractFile file = (AbstractFile) content;
if (file.getMIMEType() != null && !file.getMIMEType().isEmpty()) {
//Force Tika to use our pre-computed mime type during detection
parser.setDetector((InputStream inStream, Metadata metaData)
-> MediaType.parse(file.getMIMEType()));
}
}
} }
/** /**
* If Tesseract has been installed and is set to be used through * If Tesseract has been installed and is set to be used through
* configuration, then ocr is enabled. OCR can only currently be run on 64 * configuration, then ocr is enabled. OCR can only currently be run on
* bit Windows OS. * 64 bit Windows OS.
* *
* @return Flag indicating if OCR is set to be used. * @return Flag indicating if OCR is set to be used.
*/ */