Revert "Optimization and Bug fix - Make Tika use AbstractFile mimetype instead of recomputing"

2025-07-17 10:17:41 +00:00 · 2019-02-13 15:43:04 -05:00 · 2019-02-13 15:43:04 -05:00 · 3c3d92f03c
commit 3c3d92f03c
parent 417659bb84
1 changed file with 14 additions and 26 deletions
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -50,7 +50,6 @@ import org.apache.tika.parser.ParsingReader;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.pdf.PDFParserConfig;
-import org.apache.tika.mime.MediaType;
 import org.openide.util.NbBundle;
 import org.openide.modules.InstalledFileLocator;
 import org.openide.util.Lookup;
@@ -126,7 +125,7 @@ final class TikaTextExtractor implements TextExtractor {
    private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
    private static final String SQLITE_MIMETYPE = "application/x-sqlite3";

-    private final AutoDetectParser parser;
+    private final AutoDetectParser parser = new AutoDetectParser();
    private final Content content;

    private boolean tesseractOCREnabled;
@@ -135,7 +134,7 @@ final class TikaTextExtractor implements TextExtractor {
    private static final File TESSERACT_PATH = locateTesseractExecutable();
    private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
    private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
-
+    
    private ProcessTerminator processTerminator;

    private static final List<String> TIKA_SUPPORTED_TYPES
@@ -146,23 +145,12 @@ final class TikaTextExtractor implements TextExtractor {

    public TikaTextExtractor(Content content) {
        this.content = content;
-
-        parser = new AutoDetectParser();
-
-        if (content instanceof AbstractFile) {
-            AbstractFile file = (AbstractFile) content;
-            if (file.getMIMEType() != null && !file.getMIMEType().isEmpty()) {
-                //Force Tika to use our pre-computed mime type during detection
-                parser.setDetector((InputStream inStream, Metadata metaData)
-                        -> MediaType.parse(file.getMIMEType()));
-            }
-        }
    }

    /**
     * If Tesseract has been installed and is set to be used through
-     * configuration, then ocr is enabled. OCR can only currently be run on 64
-     * bit Windows OS.
+     * configuration, then ocr is enabled. OCR can only currently be run on
+     * 64 bit Windows OS.
     *
     * @return Flag indicating if OCR is set to be used.
     */
@@ -211,7 +199,7 @@ final class TikaTextExtractor implements TextExtractor {
                TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
                String tesseractFolder = TESSERACT_PATH.getParent();
                ocrConfig.setTesseractPath(tesseractFolder);
-
+                
                ocrConfig.setLanguage(languagePacks);
                ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
                parseContext.set(TesseractOCRConfig.class, ocrConfig);
@@ -281,7 +269,7 @@ final class TikaTextExtractor implements TextExtractor {
        File outputFile = null;
        try {
            String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
-
+            
            //Appending file id makes the name unique
            String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
            inputFile = Paths.get(tempDirectory, tempFileName).toFile();
@@ -322,7 +310,7 @@ final class TikaTextExtractor implements TextExtractor {
            }
        }
    }
-
+    
    /**
     * Wraps the creation of a TikaReader into a Future so that it can be
     * cancelled.
@@ -434,11 +422,11 @@ final class TikaTextExtractor implements TextExtractor {
     */
    @Override
    public boolean isSupported() {
-        if (!(content instanceof AbstractFile)) {
+        if(!(content instanceof AbstractFile)) {
            return false;
        }
-
-        String detectedType = ((AbstractFile) content).getMIMEType();
+        
+        String detectedType = ((AbstractFile)content).getMIMEType();
        if (detectedType == null
                || BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
                || ARCHIVE_MIME_TYPES.contains(detectedType)
@@ -447,7 +435,7 @@ final class TikaTextExtractor implements TextExtractor {
                ) {
            return false;
        }
-
+        
        return TIKA_SUPPORTED_TYPES.contains(detectedType);
    }

@@ -497,11 +485,11 @@ final class TikaTextExtractor implements TextExtractor {
        if (context != null) {
            ImageConfig configInstance = context.lookup(ImageConfig.class);
            if (configInstance != null) {
-                if (Objects.nonNull(configInstance.getOCREnabled())) {
+                if(Objects.nonNull(configInstance.getOCREnabled())) {
                    this.tesseractOCREnabled = configInstance.getOCREnabled();
                }
-
-                if (Objects.nonNull(configInstance.getOCRLanguages())) {
+                
+                if(Objects.nonNull(configInstance.getOCRLanguages())) {
                    this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
                }
            }