From ac4cc7c6241ad349e6294ee7a71cb5086b64d25e Mon Sep 17 00:00:00 2001 From: adam-m Date: Tue, 1 May 2012 13:43:38 -0400 Subject: [PATCH] better variable names, fix typo in ascii test, use UTF8 charset in stream, move string stream class --- .../autopsy/datamodel/DataConversion.java | 58 +++++++++---------- .../datamodel}/FsContentStringStream.java | 36 +++++++----- .../FsContentStringContentStream.java | 5 +- .../KeywordSearchIngestService.java | 3 +- 4 files changed, 53 insertions(+), 49 deletions(-) rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => DataModel/src/org/sleuthkit/autopsy/datamodel}/FsContentStringStream.java (89%) diff --git a/DataModel/src/org/sleuthkit/autopsy/datamodel/DataConversion.java b/DataModel/src/org/sleuthkit/autopsy/datamodel/DataConversion.java index a89330f343..5dc9635f99 100644 --- a/DataModel/src/org/sleuthkit/autopsy/datamodel/DataConversion.java +++ b/DataModel/src/org/sleuthkit/autopsy/datamodel/DataConversion.java @@ -119,54 +119,44 @@ public class DataConversion { * -- When looking for ASCII strings, they evaluate each byte and when they find four or more printable characters they get printed out with a newline in between each string. 
-- When looking for Unicode strings, they evaluate each two byte sequence and look for four or more printable characters… * - * @param args the bytes that the string read from + * @param readBuf the bytes that the string is read from * @param len length of text in the buffer to convert, starting at position 0 - * @param parameter the "length" parameter for the string + * @param minStringLen minimum length of consecutive chars to qualify as a string * + * TODO should be encoding specific and detect UTF8, UTF16LE, UTF16BE + * then process remainder of the string using detected encoding + * * @author jantonius */ - public static String getString(byte[] args, int len, int parameter) { - - /* - // these encoding might be needed for later - // Note: if not used, can be deleted - CharsetEncoder asciiEncoder = - Charset.forName("US-ASCII").newEncoder(); // or "ISO-8859-1" for ISO Latin 1 - - CharsetEncoder utf8Encoder = - Charset.forName("UTF-8").newEncoder(); - */ + public static String getString(byte[] readBuf, int len, int minStringLen) { final StringBuilder result = new StringBuilder(); StringBuilder temp = new StringBuilder(); - int counter = 0; - //char[] converted = new java.lang.System.Text.Encoding.ASCII.GetString(args).ToCharArray(); + int curLen = 0; final char NL = (char) 10; // ASCII char for new line final String NLS = Character.toString(NL); - boolean isZero = false; + boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char for (int i = 0; i < len; i++) { - char curChar = (char) args[i]; - - if (curChar == 0 && isZero == false) { - //allow to skip one 0 - isZero = true; + char curChar = (char) readBuf[i]; + if (curChar == 0 && singleConsecZero == false) { + //preserve the current sequence if max consec. 
1 zero char + singleConsecZero = true; } else { - isZero = false; + singleConsecZero = false; } //ignore non-printable ASCII chars - //use 32-126 and not TAB ( 9) - if (isUsableChar(curChar)) { + if (isPrintableAscii(curChar)) { temp.append(curChar); - ++counter; - } else if (!isZero) { - if (counter >= parameter) { + ++curLen; + } else if (!singleConsecZero) { + if (curLen >= minStringLen) { // add to the result and also add the new line at the end result.append(temp); result.append(NLS); } - // reset the temp and counter + // reset the temp and curLen temp = new StringBuilder(); - counter = 0; + curLen = 0; } } @@ -175,8 +165,14 @@ public class DataConversion { return result.toString(); } - private static boolean isUsableChar(char c) { - return c >= 32 && c <= 126 && c != 9; + /** + * Determine if char is a printable ASCII char, + * i.e. in the range 32-126 (inclusive) or a tab (9) + * @param c char to test + * @return true if it's a printable char, or false otherwise + */ + public static boolean isPrintableAscii(char c) { + return (c >= 32 && c <= 126) || c == 9; } /** diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java b/DataModel/src/org/sleuthkit/autopsy/datamodel/FsContentStringStream.java similarity index 89% rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java rename to DataModel/src/org/sleuthkit/autopsy/datamodel/FsContentStringStream.java index 984c9eacfa..753d696926 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java +++ b/DataModel/src/org/sleuthkit/autopsy/datamodel/FsContentStringStream.java @@ -18,7 +18,7 @@ */ -package org.sleuthkit.autopsy.keywordsearch; +package org.sleuthkit.autopsy.datamodel; import java.io.IOException; import java.io.InputStream; @@ -31,11 +31,20 @@ import org.sleuthkit.datamodel.TskException; /** * FsContent input string stream reader/converter + * TODO should be encoding specific and detect UTF8, UTF16LE, UTF16BE + * then 
process remainder of the string using detected encoding */ public class FsContentStringStream extends InputStream { public static enum Encoding { - ASCII, + UTF8 { + + @Override + public String toString() { + return "UTF-8"; + } + + }, }; private FsContent content; private String encoding; @@ -51,7 +60,7 @@ public class FsContentStringStream extends InputStream { private boolean isEOF = false; private boolean stringAtBoundary = false; //if temp has part of string that didn't make it in previous read() private static final byte[] oneCharBuf = new byte[1]; - private final int ASCII_CHARS_MIN = 4; //num. of chars needed to qualify as a char string + private final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string private static final String NLS = Character.toString((char)10); //new line private static final Logger logger = Logger.getLogger(FsContentStringStream.class.getName()); @@ -99,7 +108,7 @@ public class FsContentStringStream extends InputStream { //there could be more to this string in fscontent/buffer } - boolean isZero = false; + boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char int newCurLen = curStringLen + tempStringLen; while (newCurLen < len) { //need to extract more strings @@ -134,20 +143,20 @@ public class FsContentStringStream extends InputStream { } //get char from cur read buf char c = (char) curReadBuf[readBufOffset++]; - if (c == 0 && isZero == false) { - //allow to skip one zero - isZero = true; + if (c == 0 && singleConsecZero == false) { + //preserve the current sequence if max consec. 1 zero char + singleConsecZero = true; } else { - isZero = false; + singleConsecZero = false; } - if (isUsableChar(c)) { + if (DataConversion.isPrintableAscii(c)) { tempString.append(c); ++tempStringLen; //boundary case handled after the loop - } else if (! isZero) { + } else if (! 
singleConsecZero) { //break the string, clear temp - if (tempStringLen >= ASCII_CHARS_MIN) { + if (tempStringLen >= MIN_PRINTABLE_CHARS) { //append entire temp string tempString.append(NLS); ++tempStringLen; @@ -166,7 +175,7 @@ public class FsContentStringStream extends InputStream { //check if temp still has chars to qualify as a string //we might need to break up temp into 2 parts for next read() call //consume as many as possible to fill entire user buffer - if (tempStringLen >= ASCII_CHARS_MIN) { + if (tempStringLen >= MIN_PRINTABLE_CHARS) { if (newCurLen > len) { int appendChars = len - curStringLen; //save part for next user read(), need to break up temp string @@ -225,9 +234,6 @@ public class FsContentStringStream extends InputStream { return 0; } - private static boolean isUsableChar(char c) { - return c >= 32 && c <= 126 && c != 9; - } @Override diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringContentStream.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringContentStream.java index 20545f838f..6ad8e14d0f 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringContentStream.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringContentStream.java @@ -24,7 +24,8 @@ import java.io.InputStreamReader; import java.io.Reader; import java.util.logging.Logger; import org.apache.solr.common.util.ContentStream; -import org.sleuthkit.autopsy.keywordsearch.FsContentStringStream.Encoding; +import org.sleuthkit.autopsy.datamodel.FsContentStringStream; +import org.sleuthkit.autopsy.datamodel.FsContentStringStream.Encoding; import org.sleuthkit.datamodel.FsContent; /** @@ -55,7 +56,7 @@ public class FsContentStringContentStream implements ContentStream { @Override public String getContentType() { - return "text/plain; charset = " + encoding.toString(); + return "text/plain; charset=" + encoding.toString(); } @Override diff --git 
a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java index b67232d7ba..eeba30c02a 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java @@ -18,6 +18,7 @@ */ package org.sleuthkit.autopsy.keywordsearch; +import org.sleuthkit.autopsy.datamodel.FsContentStringStream; import java.awt.event.ActionEvent; import java.awt.event.ActionListener; import java.util.ArrayList; @@ -417,7 +418,7 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent private boolean extractAndIngest(FsContent f) { boolean success = false; - FsContentStringContentStream fscs = new FsContentStringContentStream(f, FsContentStringStream.Encoding.ASCII); + FsContentStringContentStream fscs = new FsContentStringContentStream(f, FsContentStringStream.Encoding.UTF8); try { ingester.ingest(fscs); success = true;