From e981a8519fbc17bdbf26f0ec217b49c68e3a8546 Mon Sep 17 00:00:00 2001 From: adam-m Date: Thu, 13 Sep 2012 10:57:06 -0400 Subject: [PATCH] Better fast English only string extraction using AbstractFileStringStream (does not break words) --- .../autopsy/coreutils/StringExtract.java | 16 +++------------ .../AbstractFileStringExtract.java | 20 +++++++++++++++---- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/CoreUtils/src/org/sleuthkit/autopsy/coreutils/StringExtract.java b/CoreUtils/src/org/sleuthkit/autopsy/coreutils/StringExtract.java index 90287bfa5c..1063f7570d 100644 --- a/CoreUtils/src/org/sleuthkit/autopsy/coreutils/StringExtract.java +++ b/CoreUtils/src/org/sleuthkit/autopsy/coreutils/StringExtract.java @@ -34,9 +34,9 @@ import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.S * Currently supports UTF-16 LE, UTF-16 BE and UTF8 Latin, Cyrillic, Chinese, * Arabic * - * TODO: - process control characters - testing: check non-printable common - * chars sometimes extracted (font?) - handle tie better (when number of chars - * in result is equal) + * TODO: process control characters + * + * TODO: handle tie better (when number of chars in 2 results is equal) */ public class StringExtract { @@ -179,16 +179,6 @@ public class StringExtract { return new StringExtractResult(); } - //special case for Latin basic, use ASCII (fast extraction) - if (isExtractionLatinBasicOnly()) { - String extrStr = extractASCII(buff, len, offset); - StringExtractResult res = new StringExtractResult(); - res.numBytes = len; - res.numChars = extrStr.length(); - res.textString = extrStr; - return res; - } - final int buffLen = buff.length; int processedBytes = 0; diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java index 6d7e50680a..072442aac4 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java @@ -109,17 +109,29 @@ class AbstractFileStringExtract implements AbstractFileExtract { this.numChunks = 0; //unknown until indexing is done boolean success = false; - //construct stream that extracts text as we read it - //final InputStream stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET); - + final boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(AbstractFileExtract.ExtractOptions.EXTRACT_UTF8.toString())); final boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(AbstractFileExtract.ExtractOptions.EXTRACT_UTF16.toString())); - final InputStream stringStream = new AbstractFileStringIntStream( + if (extractUTF8 == false && extractUTF16 == false) { + //nothing to do + return true; + } + + InputStream stringStream = null; + //check which extract stream to use + if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1) ) { + //optimal for english, english only + stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET); + } + else { + stringStream = new AbstractFileStringIntStream( sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET); + } + try { success = true;