Mirror of https://github.com/overcuriousity/autopsy-flatpak.git (synced 2025-07-17 18:17:43 +00:00)

Merge pull request #5536 from sleuthkit/release-4.14.0
Merge release-4.14.0 branch onto develop branch

Commit 335b0791cd
@@ -254,10 +254,10 @@ public class FileTypeDetector {
             } else {
                 /*
                  * If the file was marked as an octet stream and the extension is .txt, try to detect a text
-                 * encoding with Decodetect.
+                 * encoding
                  */
                 if (file.getNameExtension().equals("txt")) {
-                    Charset detectedCharset = TextFileExtractor.getEncoding(file);
+                    Charset detectedCharset = new TextFileExtractor(file).getEncoding();
                     if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) {
                         mimeType = MimeTypes.PLAIN_TEXT;
                     }
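For context on the FileTypeDetector change above: when MIME sniffing reports application/octet-stream for a file with a .txt extension, the new code asks a TextFileExtractor instance for an encoding and, if one is found, upgrades the type to text/plain. Below is a minimal standalone sketch of the underlying detection call, assuming only Tika's tika-parsers on the classpath; the class name and sample bytes are invented for illustration and this is not Autopsy code.

import java.nio.charset.StandardCharsets;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;

public class OctetStreamTxtProbe {
    public static void main(String[] args) {
        // Bytes that a MIME sniffer might call application/octet-stream,
        // but that are in fact readable single-byte text.
        byte[] data = "Grüße aus Köln".getBytes(StandardCharsets.ISO_8859_1);

        CharsetDetector detector = new CharsetDetector();
        detector.setText(data);
        CharsetMatch match = detector.detect();

        // If a charset was detected, the file can be treated as text/plain
        // rather than application/octet-stream.
        if (match != null) {
            System.out.printf("Detected %s (confidence %d) -> text/plain%n",
                    match.getName(), match.getConfidence());
        }
    }
}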
@@ -85,14 +85,14 @@ public class TextExtractorFactory {
      * @param content AbstractFile content
      * @param context Lookup containing extractor configurations
      *
-     * @return
+     * @return List of all extractors in priority order. Not all will support the passed in content. @@@ PERHAPS ONLY SUPPORTED SHOULD BE RETURNED
      */
     private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
         List<TextExtractor> fileExtractors = Arrays.asList(
                 new TextFileExtractor(content),
                 new HtmlTextExtractor(content),
                 new SqliteTextExtractor(content),
-                new TikaTextExtractor(content));
+                new TikaTextExtractor(content)); /// This should go last to ensure the more specific ones are picked first.

         fileExtractors.forEach((fileExtractor) -> {
             fileExtractor.setExtractionSettings(context);
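The ordering comment added above is the whole contract of getFileExtractors: callers take the first extractor that supports the content, so the generic Tika extractor must sit last. A self-contained sketch of that first-supported-wins selection follows; the Extractor class and MIME strings are hypothetical stand-ins, not Autopsy's real TextExtractor API.

import java.util.Arrays;
import java.util.List;

public class PriorityPickDemo {
    // Hypothetical stand-in for Autopsy's TextExtractor interface.
    static class Extractor {
        final String name;
        final String supportedMime; // null means catch-all

        Extractor(String name, String supportedMime) {
            this.name = name;
            this.supportedMime = supportedMime;
        }

        boolean isSupported(String mime) {
            return supportedMime == null || supportedMime.equals(mime);
        }
    }

    public static void main(String[] args) {
        // Most specific extractors first; the generic catch-all goes last,
        // matching the ordering comment added in the patch.
        List<Extractor> extractors = Arrays.asList(
                new Extractor("TextFileExtractor", "text/plain"),
                new Extractor("HtmlTextExtractor", "text/html"),
                new Extractor("SqliteTextExtractor", "application/x-sqlite3"),
                new Extractor("TikaTextExtractor", null)); // catch-all last

        // The first supported extractor wins, so list order encodes priority.
        for (Extractor e : extractors) {
            if (e.isSupported("text/html")) {
                System.out.println("Picked " + e.name);
                break;
            }
        }
    }
}

Running this prints "Picked HtmlTextExtractor": the HTML extractor precedes the catch-all, which would otherwise match everything.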
@@ -31,17 +31,24 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.StandardCharsets;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.List;
+import java.util.logging.Level;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
+import org.sleuthkit.datamodel.TskCoreException;

 /**
- * Extract text from text files
+ * A TextExtractor that is used to extract text from a text file.
  */
 public final class TextFileExtractor implements TextExtractor {
-    public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
+
+    /*
+     * The char set returned if a text file extractor fails to detect the
+     * encoding of the file from which it is extracting text.
+     */
+    public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
         @Override
         public boolean contains(Charset cs) {
             return false;
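The UNKNOWN_CHARSET constant documented above is a sentinel: a Charset that supports nothing and exists only so callers can compare against a single well-known instance (as the FileTypeDetector hunk does with !=). A minimal sketch of the pattern, assuming nothing beyond java.nio.charset; the demo class and names are invented.

import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;

public class SentinelCharsetDemo {
    // A Charset that can neither encode nor decode anything; it is used
    // purely as an identity-comparable "detection failed" marker.
    static final Charset UNKNOWN = new Charset("x-unknown-demo", null) {
        @Override
        public boolean contains(Charset cs) {
            return false;
        }

        @Override
        public CharsetDecoder newDecoder() {
            throw new UnsupportedOperationException("sentinel only");
        }

        @Override
        public CharsetEncoder newEncoder() {
            throw new UnsupportedOperationException("sentinel only");
        }
    };

    public static void main(String[] args) {
        Charset result = UNKNOWN; // pretend detection failed
        // Reference comparison suffices because exactly one instance exists.
        System.out.println(result == UNKNOWN ? "detection failed" : result.name());
    }
}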
@@ -59,33 +66,45 @@ public final class TextFileExtractor implements TextExtractor {
     };

     // This value will be used as a threshold for determining which encoding
-    // detection library to use. If Tika's own confidence is at least
-    // MIN_MATCH_CONFIDENCE, Tika's result will be used for decoding.
+    // detection library to use. If CharsetDetector's own confidence is at least
+    // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding.
     // Otherwise, Decodetect will be used.
-    static final private int MIN_TIKA_MATCH_CONFIDENCE = 35;
+    //
+    // Note: We initially used a confidence of 35, but it was causing some
+    // Chrome Cache files to get flagged as UTF-16 with confidence 40.
+    // These files had a small amount of binary data and then ASCII.
+    static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;

     // This value determines whether we will consider Decodetect's top-scoring
-    // result a legitimate match or if we will disregard its findings
+    // result a legitimate match or if we will disregard its findings.
     //
-    // Possible values are 0 to 1, inclusive
+    // Possible values are 0 to 1, inclusive.
     static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;

+    private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
     private final AbstractFile file;

+    private Charset encoding = null;
+
+    /**
+     * Constructs a TextExtractor that is used to extract text from a text file.
+     *
+     * @param file The file.
+     */
     public TextFileExtractor(AbstractFile file) {
         this.file = file;
     }

     @Override
     public Reader getReader() {
-        Charset encoding = getEncoding(file);
-        if (encoding.equals(UNKNOWN_CHARSET)) {
-            encoding = StandardCharsets.UTF_8;
+        Charset enc = getEncoding();
+        if (enc.equals(UNKNOWN_CHARSET)) {
+            enc = StandardCharsets.UTF_8;
         }
-        return getReader(encoding);
+        return getReader(enc);
     }

-    public Reader getReader(Charset encoding) {
+    private Reader getReader(Charset encoding) {
         return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
     }

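getReader above pairs the now-cached detection result with a UTF-8 fallback, so a reader can always be produced even when detection fails. A rough standalone sketch of that detect-once, fall-back-to-UTF-8 shape follows, with a stubbed-out detector and an invented file path; the real class reads through ReadContentInputStream rather than FileInputStream.

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

public class FallbackReaderDemo {
    private final String path;
    private Charset encoding = null; // cached so detection runs at most once

    FallbackReaderDemo(String path) {
        this.path = path;
    }

    // Stand-in for the real two-stage detection; this sketch never succeeds,
    // returning null where the patch would return UNKNOWN_CHARSET.
    private Charset getEncoding() {
        return encoding; // a real implementation would detect and cache here
    }

    public Reader getReader() throws IOException {
        Charset enc = getEncoding();
        if (enc == null) {
            // Same idea as the patch: fall back to UTF-8 when detection fails.
            enc = StandardCharsets.UTF_8;
        }
        return new InputStreamReader(new BufferedInputStream(new FileInputStream(path)), enc);
    }

    public static void main(String[] args) throws IOException {
        // Invented path, purely for demonstration.
        try (Reader r = new FallbackReaderDemo("/etc/hostname").getReader()) {
            System.out.println("First char: " + (char) r.read());
        }
    }
}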
@@ -94,33 +113,60 @@ public final class TextFileExtractor implements TextExtractor {
         return file.getMIMEType().equals("text/plain");
     }

-    public static Charset getEncoding(Content content) {
-        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) {
-            // Tika first
+    /**
+     * Returns the encoding of the file.
+     *
+     * @return Detected encoding or UNKNOWN_CHARSET.
+     */
+    public Charset getEncoding() {
+        if (encoding != null) {
+            return encoding;
+        }
+
+        // Encoding detection is hard. We use several libraries since the data passed in is often messy.
+        // First try CharsetDetector (from Tika / ICU4J).
+        // It is a rule-based detection approach.
+        try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
             CharsetDetector detector = new CharsetDetector();
             detector.setText(stream);
             CharsetMatch tikaResult = detector.detect();
-            if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) {
+            if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
                 try {
-                    return Charset.forName(tikaResult.getName());
-                } catch (UnsupportedCharsetException ignored) {
+                    encoding = Charset.forName(tikaResult.getName());
+                    return encoding;
+                } catch (UnsupportedCharsetException ex) {
+                    logger.log(Level.WARNING, String.format("Error converting CharsetDetector result for %s (objID=%d)", file.getName(), file.getId()), ex);
                 }
             }
+        } catch (IOException ex) {
+            logger.log(Level.WARNING, String.format("Error setting CharsetDetector stream for %s (objID=%d)", file.getName(), file.getId()), ex);
+        }

-            // Decodetect if Tika fails or falls below confidence threshold
+        // If that did not work, then use DecoDetect, which is stastical
+        // We needed this for some Japanese text files that were incorrectly detected by CharsetDetector (with low confidence)
+        // This will not always work with messy data that combines some binary and some ASCII.
+        try {
             int maxBytes = 100000;
-            int numBytes = Math.min(stream.available(), maxBytes);
+            int numBytes = maxBytes;
+            if (file.getSize() < maxBytes) {
+                numBytes = (int) file.getSize();
+            }
+
             byte[] targetArray = new byte[numBytes];
-            stream.read(targetArray);
+            file.read(targetArray, 0, numBytes);
             List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
             if (!results.isEmpty()) {
                 DecodetectResult topResult = results.get(0);
                 if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
-                    return topResult.getEncoding();
+                    encoding = topResult.getEncoding();
+                    return encoding;
                 }
             }
-        } catch (IOException ignored) {
+        } catch (TskCoreException ex) {
+            logger.log(Level.WARNING, String.format("Error reading content from %s (objID=%d)", file.getName(), file.getId()), ex);
         }
-        return UNKNOWN_CHARSET;
+
+        encoding = UNKNOWN_CHARSET;
+        return encoding;
     }
 }
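The rewritten getEncoding above chains two detectors: rule-based CharsetDetector first, accepted only at confidence 41 or higher, then statistical Decodetect as the fallback. Below is a condensed sketch of that two-stage flow over an in-memory byte array, assuming tika-parsers plus the decodetect library on the classpath; the com.ethteck.decodetect.core package is an assumption (the diff does not show those imports), and the demo class and sample text are invented.

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.List;

import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;

// Assumed package for the Decodetect library; not shown in the diff above.
import com.ethteck.decodetect.core.Decodetect;
import com.ethteck.decodetect.core.DecodetectResult;

public class TwoStageDetectDemo {

    // Thresholds copied from the patch.
    static final int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
    static final double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;

    /** Returns the detected charset, or null when neither stage is confident. */
    static Charset detect(byte[] data) {
        // Stage 1: rule-based CharsetDetector (Tika / ICU4J).
        CharsetDetector detector = new CharsetDetector();
        detector.setText(data);
        CharsetMatch match = detector.detect();
        if (match != null && match.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
            try {
                return Charset.forName(match.getName());
            } catch (UnsupportedCharsetException ex) {
                // Name not known to this JVM; fall through to stage 2.
            }
        }

        // Stage 2: statistical Decodetect, which the patch notes was needed
        // for Japanese text that stage 1 only matched with low confidence.
        List<DecodetectResult> results = Decodetect.DECODETECT.getResults(data);
        if (!results.isEmpty()
                && results.get(0).getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
            return results.get(0).getEncoding();
        }
        return null; // the patch returns UNKNOWN_CHARSET here instead
    }

    public static void main(String[] args) {
        byte[] sample = "こんにちは、世界".getBytes(StandardCharsets.UTF_8);
        Charset cs = detect(sample);
        System.out.println(cs == null ? "unknown" : cs.name());
    }
}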