Merge branch 'develop' of github.com:sleuthkit/autopsy into develop

2025-07-20 03:24:55 +00:00 · 2020-07-20 13:08:57 -04:00 · 2020-07-20 13:08:57 -04:00 · d11d578c1c
commit d11d578c1c
parent 1cfa78d2fe f188575f74
1 changed files with 22 additions and 6 deletions
--- a/Core/src/org/sleuthkit/autopsy/coreutils/textutils/EncodingUtils.java
+++ b/Core/src/org/sleuthkit/autopsy/coreutils/textutils/EncodingUtils.java
@@ -87,12 +87,28 @@ public class EncodingUtils {
        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
            CharsetDetector detector = new CharsetDetector();
            detector.setText(stream);
-            CharsetMatch tikaResult = detector.detect();
-            if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
-                String tikaCharSet = tikaResult.getName();
-                //Check if the nio package has support for the charset determined by Tika.
-                if(Charset.isSupported(tikaCharSet)) {
-                    return Charset.forName(tikaCharSet);
+            
+            CharsetMatch[] tikaResults = detector.detectAll();
+            // Get all guesses by Tika. These matches are ordered
+            // by descending confidence (largest first).
+            if (tikaResults.length > 0) {
+                CharsetMatch topPick = tikaResults[0];
+                
+                if (topPick.getName().equalsIgnoreCase("IBM500") && tikaResults.length > 1) {
+                    // Legacy encoding, let's discard this one in favor
+                    // of the second pick. Tika has some problems with 
+                    // mistakenly identifying text as IBM500. See JIRA-6600 
+                    // and https://issues.apache.org/jira/browse/TIKA-2771 for 
+                    // more details.
+                    topPick = tikaResults[1];
+                }
+                
+                if (!topPick.getName().equalsIgnoreCase("IBM500") && 
+                        topPick.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE &&
+                        Charset.isSupported(topPick.getName())) {
+                    // Choose this charset since it's supported and has high 
+                    // enough confidence
+                    return Charset.forName(topPick.getName());
                }
            }
        }