mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-20 03:24:55 +00:00
Merge branch 'develop' of github.com:sleuthkit/autopsy into develop
This commit is contained in:
commit
d11d578c1c
@ -87,12 +87,28 @@ public class EncodingUtils {
|
||||
try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
detector.setText(stream);
|
||||
CharsetMatch tikaResult = detector.detect();
|
||||
if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
|
||||
String tikaCharSet = tikaResult.getName();
|
||||
//Check if the nio package has support for the charset determined by Tika.
|
||||
if(Charset.isSupported(tikaCharSet)) {
|
||||
return Charset.forName(tikaCharSet);
|
||||
|
||||
CharsetMatch[] tikaResults = detector.detectAll();
|
||||
// Get all guesses by Tika. These matches are ordered
|
||||
// by descending confidence (largest first).
|
||||
if (tikaResults.length > 0) {
|
||||
CharsetMatch topPick = tikaResults[0];
|
||||
|
||||
if (topPick.getName().equalsIgnoreCase("IBM500") && tikaResults.length > 1) {
|
||||
// Legacy encoding, let's discard this one in favor
|
||||
// of the second pick. Tika has some problems with
|
||||
// mistakenly identifying text as IBM500. See JIRA-6600
|
||||
// and https://issues.apache.org/jira/browse/TIKA-2771 for
|
||||
// more details.
|
||||
topPick = tikaResults[1];
|
||||
}
|
||||
|
||||
if (!topPick.getName().equalsIgnoreCase("IBM500") &&
|
||||
topPick.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE &&
|
||||
Charset.isSupported(topPick.getName())) {
|
||||
// Choose this charset since it's supported and has high
|
||||
// enough confidence
|
||||
return Charset.forName(topPick.getName());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user