Revert "Optimization and Bug fix - Make Tika use AbstractFile mimetype instead of recomputing"

This commit is contained in:
Richard Cordovano 2019-02-13 15:43:04 -05:00 committed by GitHub
parent 417659bb84
commit 3c3d92f03c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -50,7 +50,6 @@ import org.apache.tika.parser.ParsingReader;
import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.mime.MediaType;
import org.openide.util.NbBundle; import org.openide.util.NbBundle;
import org.openide.modules.InstalledFileLocator; import org.openide.modules.InstalledFileLocator;
import org.openide.util.Lookup; import org.openide.util.Lookup;
@@ -126,7 +125,7 @@ final class TikaTextExtractor implements TextExtractor {
private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory); private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
private static final String SQLITE_MIMETYPE = "application/x-sqlite3"; private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
private final AutoDetectParser parser; private final AutoDetectParser parser = new AutoDetectParser();
private final Content content; private final Content content;
private boolean tesseractOCREnabled; private boolean tesseractOCREnabled;
@@ -146,23 +145,12 @@ final class TikaTextExtractor implements TextExtractor {
public TikaTextExtractor(Content content) { public TikaTextExtractor(Content content) {
this.content = content; this.content = content;
parser = new AutoDetectParser();
if (content instanceof AbstractFile) {
AbstractFile file = (AbstractFile) content;
if (file.getMIMEType() != null && !file.getMIMEType().isEmpty()) {
//Force Tika to use our pre-computed mime type during detection
parser.setDetector((InputStream inStream, Metadata metaData)
-> MediaType.parse(file.getMIMEType()));
}
}
} }
/** /**
* If Tesseract has been installed and is set to be used through * If Tesseract has been installed and is set to be used through
* configuration, then ocr is enabled. OCR can only currently be run on 64 * configuration, then ocr is enabled. OCR can only currently be run on
* bit Windows OS. * 64 bit Windows OS.
* *
* @return Flag indicating if OCR is set to be used. * @return Flag indicating if OCR is set to be used.
*/ */
@@ -434,11 +422,11 @@ final class TikaTextExtractor implements TextExtractor {
*/ */
@Override @Override
public boolean isSupported() { public boolean isSupported() {
if (!(content instanceof AbstractFile)) { if(!(content instanceof AbstractFile)) {
return false; return false;
} }
String detectedType = ((AbstractFile) content).getMIMEType(); String detectedType = ((AbstractFile)content).getMIMEType();
if (detectedType == null if (detectedType == null
|| BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used) || BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
|| ARCHIVE_MIME_TYPES.contains(detectedType) || ARCHIVE_MIME_TYPES.contains(detectedType)
@@ -497,11 +485,11 @@ final class TikaTextExtractor implements TextExtractor {
if (context != null) { if (context != null) {
ImageConfig configInstance = context.lookup(ImageConfig.class); ImageConfig configInstance = context.lookup(ImageConfig.class);
if (configInstance != null) { if (configInstance != null) {
if (Objects.nonNull(configInstance.getOCREnabled())) { if(Objects.nonNull(configInstance.getOCREnabled())) {
this.tesseractOCREnabled = configInstance.getOCREnabled(); this.tesseractOCREnabled = configInstance.getOCREnabled();
} }
if (Objects.nonNull(configInstance.getOCRLanguages())) { if(Objects.nonNull(configInstance.getOCRLanguages())) {
this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages()); this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
} }
} }