diff --git a/Core/src/org/sleuthkit/autopsy/coreutils/textutils/EncodingUtils.java b/Core/src/org/sleuthkit/autopsy/coreutils/textutils/EncodingUtils.java index d0bf4a0e36..e4fc7019c9 100755 --- a/Core/src/org/sleuthkit/autopsy/coreutils/textutils/EncodingUtils.java +++ b/Core/src/org/sleuthkit/autopsy/coreutils/textutils/EncodingUtils.java @@ -87,12 +87,28 @@ public class EncodingUtils { try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) { CharsetDetector detector = new CharsetDetector(); detector.setText(stream); - CharsetMatch tikaResult = detector.detect(); - if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) { - String tikaCharSet = tikaResult.getName(); - //Check if the nio package has support for the charset determined by Tika. - if(Charset.isSupported(tikaCharSet)) { - return Charset.forName(tikaCharSet); + + CharsetMatch[] tikaResults = detector.detectAll(); + // Get all guesses by Tika. These matches are ordered + // by descending confidence (largest first). + if (tikaResults.length > 0) { + CharsetMatch topPick = tikaResults[0]; + + if (topPick.getName().equalsIgnoreCase("IBM500") && tikaResults.length > 1) { + // Legacy encoding, let's discard this one in favor + // of the second pick. Tika has some problems with + // mistakenly identifying text as IBM500. See JIRA-6600 + // and https://issues.apache.org/jira/browse/TIKA-2771 for + // more details. + topPick = tikaResults[1]; + } + + if (!topPick.getName().equalsIgnoreCase("IBM500") && + topPick.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE && + Charset.isSupported(topPick.getName())) { + // Choose this charset since it's supported and has high + // enough confidence + return Charset.forName(topPick.getName()); } } }