mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-13 00:16:16 +00:00
Better fast English-only string extraction using AbstractFileStringStream (does not break words)
This commit is contained in:
parent
29ae3c4601
commit
e981a8519f
@ -34,9 +34,9 @@ import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.S
|
|||||||
* Currently supports UTF-16 LE, UTF-16 BE and UTF8 Latin, Cyrillic, Chinese,
|
* Currently supports UTF-16 LE, UTF-16 BE and UTF8 Latin, Cyrillic, Chinese,
|
||||||
* Arabic
|
* Arabic
|
||||||
*
|
*
|
||||||
* TODO: - process control characters - testing: check non-printable common
|
* TODO: process control characters
|
||||||
* chars sometimes extracted (font?) - handle tie better (when number of chars
|
*
|
||||||
* in result is equal)
|
* TODO: handle ties better (when the number of chars in two results is equal)
|
||||||
*/
|
*/
|
||||||
public class StringExtract {
|
public class StringExtract {
|
||||||
|
|
||||||
@ -179,16 +179,6 @@ public class StringExtract {
|
|||||||
return new StringExtractResult();
|
return new StringExtractResult();
|
||||||
}
|
}
|
||||||
|
|
||||||
//special case for Latin basic, use ASCII (fast extraction)
|
|
||||||
if (isExtractionLatinBasicOnly()) {
|
|
||||||
String extrStr = extractASCII(buff, len, offset);
|
|
||||||
StringExtractResult res = new StringExtractResult();
|
|
||||||
res.numBytes = len;
|
|
||||||
res.numChars = extrStr.length();
|
|
||||||
res.textString = extrStr;
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
final int buffLen = buff.length;
|
final int buffLen = buff.length;
|
||||||
|
|
||||||
int processedBytes = 0;
|
int processedBytes = 0;
|
||||||
|
@ -109,8 +109,6 @@ class AbstractFileStringExtract implements AbstractFileExtract {
|
|||||||
this.numChunks = 0; //unknown until indexing is done
|
this.numChunks = 0; //unknown until indexing is done
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
|
|
||||||
//construct stream that extracts text as we read it
|
|
||||||
//final InputStream stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
|
|
||||||
|
|
||||||
final boolean extractUTF8 =
|
final boolean extractUTF8 =
|
||||||
Boolean.parseBoolean(extractOptions.get(AbstractFileExtract.ExtractOptions.EXTRACT_UTF8.toString()));
|
Boolean.parseBoolean(extractOptions.get(AbstractFileExtract.ExtractOptions.EXTRACT_UTF8.toString()));
|
||||||
@ -118,8 +116,22 @@ class AbstractFileStringExtract implements AbstractFileExtract {
|
|||||||
final boolean extractUTF16 =
|
final boolean extractUTF16 =
|
||||||
Boolean.parseBoolean(extractOptions.get(AbstractFileExtract.ExtractOptions.EXTRACT_UTF16.toString()));
|
Boolean.parseBoolean(extractOptions.get(AbstractFileExtract.ExtractOptions.EXTRACT_UTF16.toString()));
|
||||||
|
|
||||||
final InputStream stringStream = new AbstractFileStringIntStream(
|
if (extractUTF8 == false && extractUTF16 == false) {
|
||||||
|
//nothing to do
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
InputStream stringStream = null;
|
||||||
|
//check which extract stream to use
|
||||||
|
if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1) ) {
|
||||||
|
//optimal for English; used only when LATIN_1 is the single selected script
|
||||||
|
stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
stringStream = new AbstractFileStringIntStream(
|
||||||
sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
|
sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
success = true;
|
success = true;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user