Revert "Optimization and Bug fix - Make Tika use AbstractFile mimetype instead of recomputing"

This commit is contained in:
Richard Cordovano 2019-02-13 15:43:04 -05:00 committed by GitHub
parent 417659bb84
commit 3c3d92f03c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -50,7 +50,6 @@ import org.apache.tika.parser.ParsingReader;
import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.mime.MediaType;
import org.openide.util.NbBundle; import org.openide.util.NbBundle;
import org.openide.modules.InstalledFileLocator; import org.openide.modules.InstalledFileLocator;
import org.openide.util.Lookup; import org.openide.util.Lookup;
@ -126,7 +125,7 @@ final class TikaTextExtractor implements TextExtractor {
private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory); private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
private static final String SQLITE_MIMETYPE = "application/x-sqlite3"; private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
private final AutoDetectParser parser; private final AutoDetectParser parser = new AutoDetectParser();
private final Content content; private final Content content;
private boolean tesseractOCREnabled; private boolean tesseractOCREnabled;
@ -135,7 +134,7 @@ final class TikaTextExtractor implements TextExtractor {
private static final File TESSERACT_PATH = locateTesseractExecutable(); private static final File TESSERACT_PATH = locateTesseractExecutable();
private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks()); private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
private ProcessTerminator processTerminator; private ProcessTerminator processTerminator;
private static final List<String> TIKA_SUPPORTED_TYPES private static final List<String> TIKA_SUPPORTED_TYPES
@ -146,23 +145,12 @@ final class TikaTextExtractor implements TextExtractor {
public TikaTextExtractor(Content content) { public TikaTextExtractor(Content content) {
this.content = content; this.content = content;
parser = new AutoDetectParser();
if (content instanceof AbstractFile) {
AbstractFile file = (AbstractFile) content;
if (file.getMIMEType() != null && !file.getMIMEType().isEmpty()) {
//Force Tika to use our pre-computed mime type during detection
parser.setDetector((InputStream inStream, Metadata metaData)
-> MediaType.parse(file.getMIMEType()));
}
}
} }
/** /**
* If Tesseract has been installed and is set to be used through * If Tesseract has been installed and is set to be used through
* configuration, then ocr is enabled. OCR can only currently be run on 64 * configuration, then ocr is enabled. OCR can only currently be run on
* bit Windows OS. * 64 bit Windows OS.
* *
* @return Flag indicating if OCR is set to be used. * @return Flag indicating if OCR is set to be used.
*/ */
@ -211,7 +199,7 @@ final class TikaTextExtractor implements TextExtractor {
TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
String tesseractFolder = TESSERACT_PATH.getParent(); String tesseractFolder = TESSERACT_PATH.getParent();
ocrConfig.setTesseractPath(tesseractFolder); ocrConfig.setTesseractPath(tesseractFolder);
ocrConfig.setLanguage(languagePacks); ocrConfig.setLanguage(languagePacks);
ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath()); ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
parseContext.set(TesseractOCRConfig.class, ocrConfig); parseContext.set(TesseractOCRConfig.class, ocrConfig);
@ -281,7 +269,7 @@ final class TikaTextExtractor implements TextExtractor {
File outputFile = null; File outputFile = null;
try { try {
String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory(); String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
//Appending file id makes the name unique //Appending file id makes the name unique
String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName()); String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
inputFile = Paths.get(tempDirectory, tempFileName).toFile(); inputFile = Paths.get(tempDirectory, tempFileName).toFile();
@ -322,7 +310,7 @@ final class TikaTextExtractor implements TextExtractor {
} }
} }
} }
/** /**
* Wraps the creation of a TikaReader into a Future so that it can be * Wraps the creation of a TikaReader into a Future so that it can be
* cancelled. * cancelled.
@ -434,11 +422,11 @@ final class TikaTextExtractor implements TextExtractor {
*/ */
@Override @Override
public boolean isSupported() { public boolean isSupported() {
if (!(content instanceof AbstractFile)) { if(!(content instanceof AbstractFile)) {
return false; return false;
} }
String detectedType = ((AbstractFile) content).getMIMEType(); String detectedType = ((AbstractFile)content).getMIMEType();
if (detectedType == null if (detectedType == null
|| BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used) || BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
|| ARCHIVE_MIME_TYPES.contains(detectedType) || ARCHIVE_MIME_TYPES.contains(detectedType)
@ -447,7 +435,7 @@ final class TikaTextExtractor implements TextExtractor {
) { ) {
return false; return false;
} }
return TIKA_SUPPORTED_TYPES.contains(detectedType); return TIKA_SUPPORTED_TYPES.contains(detectedType);
} }
@ -497,11 +485,11 @@ final class TikaTextExtractor implements TextExtractor {
if (context != null) { if (context != null) {
ImageConfig configInstance = context.lookup(ImageConfig.class); ImageConfig configInstance = context.lookup(ImageConfig.class);
if (configInstance != null) { if (configInstance != null) {
if (Objects.nonNull(configInstance.getOCREnabled())) { if(Objects.nonNull(configInstance.getOCREnabled())) {
this.tesseractOCREnabled = configInstance.getOCREnabled(); this.tesseractOCREnabled = configInstance.getOCREnabled();
} }
if (Objects.nonNull(configInstance.getOCRLanguages())) { if(Objects.nonNull(configInstance.getOCRLanguages())) {
this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages()); this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
} }
} }