mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-12 16:06:15 +00:00
Better fast English-only string extraction using AbstractFileStringStream (does not break words)
This commit is contained in:
parent
29ae3c4601
commit
e981a8519f
@ -34,9 +34,9 @@ import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.S
|
||||
* Currently supports UTF-16 LE, UTF-16 BE and UTF-8 Latin, Cyrillic, Chinese,
|
||||
* Arabic
|
||||
*
|
||||
* TODO: - process control characters - testing: check non-printable common
|
||||
* chars sometimes extracted (font?) - handle tie better (when number of chars
|
||||
* in result is equal)
|
||||
* TODO: process control characters
|
||||
*
|
||||
* TODO: handle tie better (when number of chars in 2 results is equal)
|
||||
*/
|
||||
public class StringExtract {
|
||||
|
||||
@ -179,16 +179,6 @@ public class StringExtract {
|
||||
return new StringExtractResult();
|
||||
}
|
||||
|
||||
//special case for Latin basic, use ASCII (fast extraction)
|
||||
if (isExtractionLatinBasicOnly()) {
|
||||
String extrStr = extractASCII(buff, len, offset);
|
||||
StringExtractResult res = new StringExtractResult();
|
||||
res.numBytes = len;
|
||||
res.numChars = extrStr.length();
|
||||
res.textString = extrStr;
|
||||
return res;
|
||||
}
|
||||
|
||||
final int buffLen = buff.length;
|
||||
|
||||
int processedBytes = 0;
|
||||
|
@ -109,17 +109,29 @@ class AbstractFileStringExtract implements AbstractFileExtract {
|
||||
this.numChunks = 0; //unknown until indexing is done
|
||||
boolean success = false;
|
||||
|
||||
//construct stream that extracts text as we read it
|
||||
//final InputStream stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
|
||||
|
||||
|
||||
final boolean extractUTF8 =
|
||||
Boolean.parseBoolean(extractOptions.get(AbstractFileExtract.ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
|
||||
final boolean extractUTF16 =
|
||||
Boolean.parseBoolean(extractOptions.get(AbstractFileExtract.ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
|
||||
final InputStream stringStream = new AbstractFileStringIntStream(
|
||||
if (extractUTF8 == false && extractUTF16 == false) {
|
||||
//nothing to do
|
||||
return true;
|
||||
}
|
||||
|
||||
InputStream stringStream = null;
|
||||
//check which extract stream to use
|
||||
if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1) ) {
|
||||
//optimal for English; used when English is the only script selected
|
||||
stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
|
||||
}
|
||||
else {
|
||||
stringStream = new AbstractFileStringIntStream(
|
||||
sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
|
||||
}
|
||||
|
||||
|
||||
try {
|
||||
success = true;
|
||||
|
Loading…
x
Reference in New Issue
Block a user