- add more archive extensions to skip

- better code to parse file extension

- do not call string extractor if it does not support the content
2025-07-06 21:00:22 +00:00 · 2013-01-23 12:23:12 -05:00 · 2013-01-23 12:23:12 -05:00 · c10872e42e
commit c10872e42e
parent 455e4f6f89
3 changed files with 29 additions and 19 deletions
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java
@ -54,8 +54,9 @@ class AbstractFileStringExtract implements AbstractFileExtract {
    //string extractor extracts from all other than archives
    //TODO use content type detection mechanism
    static final String[] UNSUPPORTED_EXTENSIONS = {
-        //Archives (
-        "tar", "jar", "zip", "gzip", "bzip2", "gz", "tgz",};
+        //Archives 
+        //Note: archive unpacker module will process these instead
+        "tar", "jar", "zip", "7z", "gzip", "bzip", "bzip2", "gz", "tgz", "cab", "rar", "arj", "dmg", "iso"};

    //disabled prepending of BOM
    //static {
@ -185,8 +186,13 @@ class AbstractFileStringExtract implements AbstractFileExtract {
    @Override
    public boolean isSupported(AbstractFile file) {
        String fileNameLower = file.getName().toLowerCase();
+        int dotI = fileNameLower.lastIndexOf(".");
+        if (dotI == -1 || dotI == fileNameLower.length() - 1) {
+            return true; //no extension
+        }
+        final String extension = fileNameLower.substring(dotI + 1);
        for (int i = 0; i < UNSUPPORTED_EXTENSIONS.length; ++i) {
-            if (fileNameLower.endsWith(UNSUPPORTED_EXTENSIONS[i])) {
+            if (extension.equals(UNSUPPORTED_EXTENSIONS[i])) {
                return false;
            }
        }
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java
@ -71,16 +71,16 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
    static final String[] SUPPORTED_EXTENSIONS = {
        //MS Office
        "doc", "dot", "docx", "docm", "dotx", "dotm",
-        "xls", "xlw", "xlt", "xlsx",  "xlsm", "xltx", "xltm",
-        "ppt", "pps", "pot", "pptx", "pptm", "potx", "potm",  
+        "xls", "xlw", "xlt", "xlsx", "xlsm", "xltx", "xltm",
+        "ppt", "pps", "pot", "pptx", "pptm", "potx", "potm",
        //Open Office
-        "odf", "odt", "ott", "ods", "ots", "odp", "otp", 
-        "sxw", "stw", "sxc", "stc", "sxi", "sxi", 
-        "sdw", "sdc", "vor", "sgl", 
+        "odf", "odt", "ott", "ods", "ots", "odp", "otp",
+        "sxw", "stw", "sxc", "stc", "sxi", "sxi",
+        "sdw", "sdc", "vor", "sgl",
        //rich text, pdf
-        "rtf", "pdf", 
+        "rtf", "pdf",
        //html (other extractors take priority)
-        "html", "htm", "xhtml", 
+        "html", "htm", "xhtml",
        //text
        "txt", "log", "manifest",
        //images, media, other
@ -90,7 +90,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
    AbstractFileTikaTextExtract() {
        this.module = KeywordSearchIngestModule.getDefault();
        ingester = Server.getIngester();
-        
+
    }

    @Override
@ -102,15 +102,14 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
    public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
        return null;
    }
-    
-        @Override
+
+    @Override
    public Map<String, String> getOptions() {
        return null;
    }

    @Override
    public void setOptions(Map<String, String> options) {
-
    }

    @Override
@ -281,8 +280,13 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
    @Override
    public boolean isSupported(AbstractFile file) {
        String fileNameLower = file.getName().toLowerCase();
+        int dotI = fileNameLower.lastIndexOf(".");
+        if (dotI == -1 || dotI == fileNameLower.length() - 1) {
+            return false; //no extension
+        }
+        final String extension = fileNameLower.substring(dotI + 1);
        for (int i = 0; i < SUPPORTED_EXTENSIONS.length; ++i) {
-            if (fileNameLower.endsWith(SUPPORTED_EXTENSIONS[i])) {
+            if (extension.equals(SUPPORTED_EXTENSIONS[i])) {
                return true;
            }
        }
@ -318,8 +322,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
                KeywordSearch.getTikaLogger().log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
                tika = null;
                reader = null;
-            }
-             catch (Exception ex) {
+            } catch (Exception ex) {
                KeywordSearch.getTikaLogger().log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
                tika = null;
                reader = null;
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@ -696,7 +696,7 @@ public final class KeywordSearchIngestModule implements IngestModuleAbstractFile
        private boolean extractIndex(AbstractFile aFile, boolean stringsOnly) throws IngesterException {
            AbstractFileExtract fileExtract = null;

-            if (stringsOnly) {
+            if (stringsOnly && stringExtractor.isSupported(aFile)) {
                fileExtract = stringExtractor;
            } else {
                //go over available text extractors and pick the first one (most specific one)
@ -709,7 +709,8 @@ public final class KeywordSearchIngestModule implements IngestModuleAbstractFile
            }

            if (fileExtract == null) {
-                throw new IngesterException("No supported file extractor found for file: " + aFile.getId() + " " + aFile.getName());
+                logger.log(Level.INFO, "No supported file extractor found for file: " + aFile.getId() + " " + aFile.getName());
+                return false;
            }

            //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());