mirror of https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-17 18:17:43 +00:00

Merge pull request #5536 from sleuthkit/release-4.14.0
Merge release-4.14.0 branch onto develop branch

Commit 335b0791cd

@@ -254,10 +254,10 @@ public class FileTypeDetector {
         } else {
             /*
              * If the file was marked as an octet stream and the extension is .txt, try to detect a text
-             * encoding with Decodetect.
+             * encoding
              */
             if (file.getNameExtension().equals("txt")) {
-                Charset detectedCharset = TextFileExtractor.getEncoding(file);
+                Charset detectedCharset = new TextFileExtractor(file).getEncoding();
                 if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) {
                     mimeType = MimeTypes.PLAIN_TEXT;
                 }
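
To see the refinement this hunk performs in isolation: if a file typed as an octet stream has a .txt extension and a charset can be detected with enough confidence, the MIME type is upgraded to text/plain. Below is a minimal sketch of that logic using only Tika's bundled CharsetDetector; the file path is a placeholder and the threshold of 41 mirrors MIN_CHARSETDETECT_MATCH_CONFIDENCE from the later hunks. This is not Autopsy code.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;

public class OctetStreamRefinementSketch {

    public static void main(String[] args) throws IOException {
        Path file = Paths.get("sample.txt"); // placeholder input
        String mimeType = "application/octet-stream";

        if (file.getFileName().toString().endsWith(".txt")) {
            CharsetDetector detector = new CharsetDetector();
            detector.setText(Files.readAllBytes(file));
            CharsetMatch match = detector.detect();
            // Only trust the detector when its confidence clears the threshold.
            if (match != null && match.getConfidence() >= 41) {
                mimeType = "text/plain";
            }
        }
        System.out.println(mimeType);
    }
}
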
@@ -85,14 +85,14 @@ public class TextExtractorFactory {
      * @param content AbstractFile content
      * @param context Lookup containing extractor configurations
      *
-     * @return
+     * @return List of all extractors in priority order. Not all will support the passed in content. @@@ PERHAPS ONLY SUPPORTED SHOULD BE RETURNED
      */
     private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
         List<TextExtractor> fileExtractors = Arrays.asList(
                 new TextFileExtractor(content),
                 new HtmlTextExtractor(content),
                 new SqliteTextExtractor(content),
-                new TikaTextExtractor(content));
+                new TikaTextExtractor(content)); /// This should go last to ensure the more specific ones are picked first.

         fileExtractors.forEach((fileExtractor) -> {
             fileExtractor.setExtractionSettings(context);
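
A compact illustration of why TikaTextExtractor must stay last: with a priority-ordered list, the first extractor that claims support for the content wins, so a catch-all entry anywhere but the end would shadow the specific ones. The types below are hypothetical stand-ins, not the Autopsy API.

import java.util.Arrays;
import java.util.List;
import java.util.Optional;

public class PriorityPickSketch {

    interface Extractor {
        boolean isSupported(String mimeType);
        String name();
    }

    static Extractor of(String name, String... mimeTypes) {
        List<String> supported = Arrays.asList(mimeTypes);
        return new Extractor() {
            public boolean isSupported(String mimeType) {
                // "*" marks the generic fallback that accepts anything.
                return supported.contains("*") || supported.contains(mimeType);
            }
            public String name() {
                return name;
            }
        };
    }

    public static void main(String[] args) {
        // Priority order: specific extractors first, generic fallback last.
        List<Extractor> extractors = Arrays.asList(
                of("text", "text/plain"),
                of("html", "text/html"),
                of("sqlite", "application/x-sqlite3"),
                of("tika", "*"));

        String mimeType = "text/html";
        Optional<Extractor> pick = extractors.stream()
                .filter(e -> e.isSupported(mimeType))
                .findFirst();
        System.out.println(pick.map(Extractor::name).orElse("none")); // prints "html"
    }
}
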
@@ -31,17 +31,24 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.StandardCharsets;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.List;
+import java.util.logging.Level;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
+import org.sleuthkit.datamodel.TskCoreException;

 /**
- * Extract text from text files
+ * A TextExtractor that is used to extract text from a text file.
  */
 public final class TextFileExtractor implements TextExtractor {
-    public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
+
+    /*
+     * The char set returned if a text file extractor fails to detect the
+     * encoding of the file from which it is extracting text.
+     */
+    public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
         @Override
         public boolean contains(Charset cs) {
             return false;
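
UNKNOWN_CHARSET is a sentinel Charset object rather than null, so callers can compare against it without null checks. A self-contained sketch of that pattern follows; the diff only shows contains(), so the newDecoder()/newEncoder() bodies below are assumptions. Since a sentinel is never used for actual encoding or decoding, throwing from them is a safe placeholder.

import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;

public class SentinelCharsetSketch {

    static final Charset UNKNOWN = new Charset("x-unknown-sketch", null) {
        @Override
        public boolean contains(Charset cs) {
            return false; // the sentinel contains nothing
        }

        @Override
        public CharsetDecoder newDecoder() {
            // Assumption: never called in practice, so fail loudly if it is.
            throw new UnsupportedOperationException("sentinel charset");
        }

        @Override
        public CharsetEncoder newEncoder() {
            // Assumption: never called in practice, so fail loudly if it is.
            throw new UnsupportedOperationException("sentinel charset");
        }
    };

    public static void main(String[] args) {
        Charset detected = UNKNOWN; // stand-in for a failed detection
        // Callers compare against the sentinel instead of handling null.
        System.out.println(detected.equals(UNKNOWN) ? "fall back to UTF-8" : detected.name());
    }
}
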
@@ -59,33 +66,45 @@ public final class TextFileExtractor implements TextExtractor {
     };

     // This value will be used as a threshold for determining which encoding
-    // detection library to use. If Tika's own confidence is at least
-    // MIN_MATCH_CONFIDENCE, Tika's result will be used for decoding.
+    // detection library to use. If CharsetDetector's own confidence is at least
+    // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding.
     // Otherwise, Decodetect will be used.
-    static final private int MIN_TIKA_MATCH_CONFIDENCE = 35;
+    //
+    // Note: We initially used a confidence of 35, but it was causing some
+    // Chrome Cache files to get flagged as UTF-16 with confidence 40.
+    // These files had a small amount of binary data and then ASCII.
+    static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;

     // This value determines whether we will consider Decodetect's top-scoring
-    // result a legitimate match or if we will disregard its findings
-    // Possible values are 0 to 1, inclusive
+    // result a legitimate match or if we will disregard its findings.
+    //
+    // Possible values are 0 to 1, inclusive.
     static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;

+    private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
     private final AbstractFile file;

+    private Charset encoding = null;
+
+    /**
+     * Constructs a TextExtractor that is used to extract text from a text file.
+     *
+     * @param file The file.
+     */
     public TextFileExtractor(AbstractFile file) {
         this.file = file;
     }

     @Override
     public Reader getReader() {
-        Charset encoding = getEncoding(file);
-        if (encoding.equals(UNKNOWN_CHARSET)) {
-            encoding = StandardCharsets.UTF_8;
+        Charset enc = getEncoding();
+        if (enc.equals(UNKNOWN_CHARSET)) {
+            enc = StandardCharsets.UTF_8;
         }
-        return getReader(encoding);
+        return getReader(enc);
     }

-    public Reader getReader(Charset encoding) {
+    private Reader getReader(Charset encoding) {
         return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
     }
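
This hunk makes two behavioral changes: the detected charset is now cached in an instance field so detection runs at most once per file, and getReader() substitutes UTF-8 when detection fails rather than refusing to extract. A minimal sketch of both, with stand-in detection logic and an in-memory stream in place of ReadContentInputStream (hypothetical class, not the Autopsy API):

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

public class CachedEncodingSketch {

    private final byte[] bytes;
    private Charset encoding = null; // cached detection result; null means unknown
    private boolean detected = false;

    CachedEncodingSketch(byte[] bytes) {
        this.bytes = bytes;
    }

    Charset getEncoding() {
        if (!detected) {
            // Placeholder detection: real code would run CharsetDetector/Decodetect here.
            encoding = looksLikeAscii() ? StandardCharsets.US_ASCII : null;
            detected = true; // cache even a failed detection so we never re-run it
        }
        return encoding;
    }

    Reader getReader() {
        Charset enc = getEncoding();
        if (enc == null) { // stand-in for the UNKNOWN_CHARSET sentinel
            enc = StandardCharsets.UTF_8;
        }
        return new BufferedReader(new InputStreamReader(new ByteArrayInputStream(bytes), enc));
    }

    private boolean looksLikeAscii() {
        for (byte b : bytes) {
            if ((b & 0x80) != 0) {
                return false;
            }
        }
        return true;
    }

    public static void main(String[] args) throws IOException {
        CachedEncodingSketch sketch = new CachedEncodingSketch("hello".getBytes(StandardCharsets.US_ASCII));
        try (Reader r = sketch.getReader()) {
            int ch;
            while ((ch = r.read()) != -1) {
                System.out.print((char) ch);
            }
        }
        System.out.println();
    }
}
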
@@ -94,33 +113,60 @@ public final class TextFileExtractor implements TextExtractor {
         return file.getMIMEType().equals("text/plain");
     }

-    public static Charset getEncoding(Content content) {
-        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) {
-            // Tika first
+    /**
+     * Returns the encoding of the file.
+     *
+     * @return Detected encoding or UNKNOWN_CHARSET.
+     */
+    public Charset getEncoding() {
+        if (encoding != null) {
+            return encoding;
+        }
+
+        // Encoding detection is hard. We use several libraries since the data passed in is often messy.
+        // First try CharsetDetector (from Tika / ICU4J).
+        // It is a rule-based detection approach.
+        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
             CharsetDetector detector = new CharsetDetector();
             detector.setText(stream);
             CharsetMatch tikaResult = detector.detect();
-            if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) {
+            if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
                 try {
-                    return Charset.forName(tikaResult.getName());
-                } catch (UnsupportedCharsetException ignored) {
+                    encoding = Charset.forName(tikaResult.getName());
+                    return encoding;
+                } catch (UnsupportedCharsetException ex) {
+                    logger.log(Level.WARNING, String.format("Error converting CharsetDetector result for %s (objID=%d)", file.getName(), file.getId()), ex);
                 }
             }
+        } catch (IOException ex) {
+            logger.log(Level.WARNING, String.format("Error setting CharsetDetector stream for %s (objID=%d)", file.getName(), file.getId()), ex);
+        }

-            // Decodetect if Tika fails or falls below confidence threshold
+        // If that did not work, then use Decodetect, which is statistical.
+        // We needed this for some Japanese text files that were incorrectly detected by CharsetDetector (with low confidence)
+        // This will not always work with messy data that combines some binary and some ASCII.
         try {
             int maxBytes = 100000;
-            int numBytes = Math.min(stream.available(), maxBytes);
+            int numBytes = maxBytes;
+            if (file.getSize() < maxBytes) {
+                numBytes = (int) file.getSize();
+            }
+
             byte[] targetArray = new byte[numBytes];
-            stream.read(targetArray);
+            file.read(targetArray, 0, numBytes);
             List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
             if (!results.isEmpty()) {
                 DecodetectResult topResult = results.get(0);
                 if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
-                    return topResult.getEncoding();
+                    encoding = topResult.getEncoding();
+                    return encoding;
                 }
             }
-        } catch (IOException ignored) {
+        } catch (TskCoreException ex) {
+            logger.log(Level.WARNING, String.format("Error reading content from %s (objID=%d)", file.getName(), file.getId()), ex);
         }
-        return UNKNOWN_CHARSET;
+
+        encoding = UNKNOWN_CHARSET;
+        return encoding;
     }
 }
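
The rewritten getEncoding() is a two-stage pipeline: rule-based CharsetDetector first, with statistical Decodetect as the fallback when the first stage's confidence is too low. Below is a standalone sketch of the same flow over a byte array, using the thresholds from the diff. The com.ethteck.decodetect import path is an assumption inferred from the Decodetect.DECODETECT and DecodetectResult identifiers in the diff; only calls that appear in the diff are used.

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.List;
import com.ethteck.decodetect.core.Decodetect;       // assumed package
import com.ethteck.decodetect.core.DecodetectResult; // assumed package
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;

public class TwoStageDetectionSketch {

    static final int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
    static final double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;

    static Charset detect(byte[] bytes) {
        // Stage 1: rule-based detection (Tika's bundled ICU detector).
        CharsetDetector detector = new CharsetDetector();
        detector.setText(bytes);
        CharsetMatch match = detector.detect();
        if (match != null && match.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
            try {
                return Charset.forName(match.getName());
            } catch (UnsupportedCharsetException ignored) {
                // fall through to stage 2
            }
        }

        // Stage 2: statistical detection for inputs stage 1 scores poorly.
        List<DecodetectResult> results = Decodetect.DECODETECT.getResults(bytes);
        if (!results.isEmpty() && results.get(0).getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
            return results.get(0).getEncoding();
        }
        return null; // caller decides on a fallback, e.g. UTF-8
    }

    public static void main(String[] args) {
        Charset cs = detect("これはテストです。".getBytes(StandardCharsets.UTF_8));
        System.out.println(cs != null ? cs.name() : "unknown");
    }
}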