Only turn off embedded extraction for known mime-types

2025-07-19 11:07:43 +00:00 · 2019-04-30 12:28:04 -04:00 · 2019-04-30 12:28:04 -04:00 · 35e3934816
commit 35e3934816
parent 8a846b4937
1 changed files with 30 additions and 16 deletions
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@ -120,6 +120,16 @@ final class TikaTextExtractor implements TextExtractor {
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS

+    //Types whose embedded files can be handled by the unpacking modules; Tika should skip embedded file text for these
+    private static final List<String> EMBEDDED_FILE_MIME_TYPES
+            = ImmutableList.of("application/msword", //NON-NLS
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document", //NON-NLS
+                    "application/vnd.ms-powerpoint", //NON-NLS
+                    "application/vnd.openxmlformats-officedocument.presentationml.presentation", //NON-NLS
+                    "application/vnd.ms-excel", //NON-NLS
+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", //NON-NLS
+                    "application/pdf"); //NON-NLS
+
    private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
    private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());

@ -152,8 +162,8 @@ final class TikaTextExtractor implements TextExtractor {

    /**
     * If Tesseract has been installed and is set to be used through
-     * configuration, then ocr is enabled. OCR can only currently be run on
-     * 64 bit Windows OS.
+     * configuration, then OCR is enabled. OCR can currently only be run on
+     * 64-bit Windows.
     *
     * @return Flag indicating if OCR is set to be used.
     */
@ -179,9 +189,13 @@ final class TikaTextExtractor implements TextExtractor {

        ParseContext parseContext = new ParseContext();

-        //Disable appending embedded file text to output 
+        //Disable appending embedded file text to output for types supported by the embedded file extractor (EFE)
        //JIRA-4975
+        if(content instanceof AbstractFile && EMBEDDED_FILE_MIME_TYPES.contains(((AbstractFile)content).getMIMEType())) {
            parseContext.set(Parser.class, new EmptyParser());
+        } else {
+            parseContext.set(Parser.class, parser);
+        }

        if (ocrEnabled() && content instanceof AbstractFile) {
            AbstractFile file = ((AbstractFile) content);
@ -430,11 +444,11 @@ final class TikaTextExtractor implements TextExtractor {
     */
    @Override
    public boolean isSupported() {
-        if(!(content instanceof AbstractFile)) {
+        if (!(content instanceof AbstractFile)) {
            return false;
        }

-        String detectedType = ((AbstractFile)content).getMIMEType();
+        String detectedType = ((AbstractFile) content).getMIMEType();
        if (detectedType == null
                || BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
                || ARCHIVE_MIME_TYPES.contains(detectedType)
@ -493,11 +507,11 @@ final class TikaTextExtractor implements TextExtractor {
        if (context != null) {
            ImageConfig configInstance = context.lookup(ImageConfig.class);
            if (configInstance != null) {
-                if(Objects.nonNull(configInstance.getOCREnabled())) {
+                if (Objects.nonNull(configInstance.getOCREnabled())) {
                    this.tesseractOCREnabled = configInstance.getOCREnabled();
                }

-                if(Objects.nonNull(configInstance.getOCRLanguages())) {
+                if (Objects.nonNull(configInstance.getOCRLanguages())) {
                    this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
                }
            }