Improved fast English-only string extraction using AbstractFileStringStream (does not break words)

This commit is contained in:
adam-m 2012-09-13 10:57:06 -04:00
parent 29ae3c4601
commit e981a8519f
2 changed files with 19 additions and 17 deletions

View File

@ -34,9 +34,9 @@ import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.S
* Currently supports UTF-16 LE, UTF-16 BE and UTF8 Latin, Cyrillic, Chinese,
* Arabic
*
* TODO: - process control characters - testing: check non-printable common
* chars sometimes extracted (font?) - handle tie better (when number of chars
* in result is equal)
* TODO: process control characters
*
* TODO: handle ties better (when the number of chars in two results is equal)
*/
public class StringExtract {
@ -179,16 +179,6 @@ public class StringExtract {
return new StringExtractResult();
}
//special case for Latin basic, use ASCII (fast extraction)
if (isExtractionLatinBasicOnly()) {
String extrStr = extractASCII(buff, len, offset);
StringExtractResult res = new StringExtractResult();
res.numBytes = len;
res.numChars = extrStr.length();
res.textString = extrStr;
return res;
}
final int buffLen = buff.length;
int processedBytes = 0;

View File

@ -109,8 +109,6 @@ class AbstractFileStringExtract implements AbstractFileExtract {
this.numChunks = 0; //unknown until indexing is done
boolean success = false;
//construct stream that extracts text as we read it
//final InputStream stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
final boolean extractUTF8 =
Boolean.parseBoolean(extractOptions.get(AbstractFileExtract.ExtractOptions.EXTRACT_UTF8.toString()));
@ -118,8 +116,22 @@ class AbstractFileStringExtract implements AbstractFileExtract {
final boolean extractUTF16 =
Boolean.parseBoolean(extractOptions.get(AbstractFileExtract.ExtractOptions.EXTRACT_UTF16.toString()));
final InputStream stringStream = new AbstractFileStringIntStream(
if (extractUTF8 == false && extractUTF16 == false) {
//nothing to do
return true;
}
InputStream stringStream = null;
//check which extract stream to use
if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1) ) {
//optimal stream for English-only extraction
stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
}
else {
stringStream = new AbstractFileStringIntStream(
sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
}
try {
success = true;