Merge branch 'develop' of github.com:sleuthkit/autopsy into develop

This commit is contained in:
esaunders 2020-07-20 13:08:57 -04:00
commit d11d578c1c

View File

@ -87,12 +87,28 @@ public class EncodingUtils {
try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
CharsetDetector detector = new CharsetDetector();
detector.setText(stream);
CharsetMatch tikaResult = detector.detect();
if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
String tikaCharSet = tikaResult.getName();
//Check if the nio package has support for the charset determined by Tika.
if(Charset.isSupported(tikaCharSet)) {
return Charset.forName(tikaCharSet);
CharsetMatch[] tikaResults = detector.detectAll();
// Get all guesses by Tika. These matches are ordered
// by descending confidence (largest first).
if (tikaResults.length > 0) {
CharsetMatch topPick = tikaResults[0];
if (topPick.getName().equalsIgnoreCase("IBM500") && tikaResults.length > 1) {
// Legacy encoding, let's discard this one in favor
// of the second pick. Tika has some problems with
// mistakenly identifying text as IBM500. See JIRA-6600
// and https://issues.apache.org/jira/browse/TIKA-2771 for
// more details.
topPick = tikaResults[1];
}
if (!topPick.getName().equalsIgnoreCase("IBM500") &&
topPick.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE &&
Charset.isSupported(topPick.getName())) {
// Choose this charset since it's supported and has high
// enough confidence
return Charset.forName(topPick.getName());
}
}
}