From ac4cc7c6241ad349e6294ee7a71cb5086b64d25e Mon Sep 17 00:00:00 2001
From: adam-m <amalinowski@basistech.com>
Date: Tue, 1 May 2012 13:43:38 -0400
Subject: [PATCH] better variable names, fix typo in ascii test, use UTF8
 charset in stream, move string stream class

---
 .../autopsy/datamodel/DataConversion.java     | 58 +++++++++----------
 .../datamodel}/FsContentStringStream.java     | 36 +++++++-----
 .../FsContentStringContentStream.java         |  5 +-
 .../KeywordSearchIngestService.java           |  3 +-
 4 files changed, 53 insertions(+), 49 deletions(-)
 rename {KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch => DataModel/src/org/sleuthkit/autopsy/datamodel}/FsContentStringStream.java (89%)

diff --git a/DataModel/src/org/sleuthkit/autopsy/datamodel/DataConversion.java b/DataModel/src/org/sleuthkit/autopsy/datamodel/DataConversion.java
index a89330f343..5dc9635f99 100644
--- a/DataModel/src/org/sleuthkit/autopsy/datamodel/DataConversion.java
+++ b/DataModel/src/org/sleuthkit/autopsy/datamodel/DataConversion.java
@@ -119,54 +119,44 @@ public class DataConversion {
      *  -- When looking for ASCII strings, they evaluate each byte and when they find four or more printable characters they get printed out with a newline in between each string.
      *  -- When looking for Unicode strings, they evaluate each two byte sequence and look for four or more printable characters…
      *
-     * @param args          the bytes that the string read from
+     * @param readBuf          the bytes that the strings are read from
      * @param len           length of text in the buffer to convert, starting at position 0
-     * @param parameter     the "length" parameter for the string
+     * @param minStringLen     minimum length of consecutive chars to qualify as a string
      *
+     * TODO should be encoding specific and detect UTF8, UTF16LE, UTF16BE
+     * then process remainder of the string using detected encoding  
+     * 
      * @author jantonius
      */
-    public static String getString(byte[] args, int len, int parameter) {
-
-        /*
-        // these encoding might be needed for later
-        // Note: if not used, can be deleted
-        CharsetEncoder asciiEncoder =
-        Charset.forName("US-ASCII").newEncoder(); // or "ISO-8859-1" for ISO Latin 1
-        
-        CharsetEncoder utf8Encoder =
-        Charset.forName("UTF-8").newEncoder();
-         */
+    public static String getString(byte[] readBuf, int len, int minStringLen) {
         final StringBuilder result = new StringBuilder();
         StringBuilder temp = new StringBuilder();
-        int counter = 0;
-        //char[] converted = new java.lang.System.Text.Encoding.ASCII.GetString(args).ToCharArray();
+        int curLen = 0;
 
         final char NL = (char) 10; // ASCII char for new line
         final String NLS = Character.toString(NL);
-        boolean isZero = false;
+        boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
         for (int i = 0; i < len; i++) {
-            char curChar = (char) args[i];
-
-            if (curChar == 0 && isZero == false) {
-                //allow to skip one 0
-                isZero = true;
+            char curChar = (char) readBuf[i];
+            if (curChar == 0 && singleConsecZero == false) {
+                //first zero in a row: tolerate it so the current sequence is kept (a second consecutive zero breaks it)
+                singleConsecZero = true;
             } else {
-                isZero = false;
+                singleConsecZero = false;
             }
             //ignore non-printable ASCII chars
-            //use 32-126 and not TAB ( 9)
-            if (isUsableChar(curChar)) {
+            if (isPrintableAscii(curChar)) {
                 temp.append(curChar);
-                ++counter;
-            } else if (!isZero) {
-                if (counter >= parameter) {
+                ++curLen;
+            } else if (!singleConsecZero) {
+                if (curLen >= minStringLen) {
                     // add to the result and also add the new line at the end
                     result.append(temp);
                     result.append(NLS);
                 }
-                // reset the temp and counter
+                // reset the temp and curLen
                 temp = new StringBuilder();
-                counter = 0;
+                curLen = 0;
 
             }
         }
@@ -175,8 +165,14 @@ public class DataConversion {
         return result.toString();
     }
 
-    private static boolean isUsableChar(char c) {
-        return c >= 32 && c <= 126 && c != 9;
+    /**
+     * Determine if char is a printable ASCII char:
+     * in the inclusive range 32-126, or a tab (9)
+     * @param c char to test
+     * @return true if it's a printable char, or false otherwise
+     */
+    public static boolean isPrintableAscii(char c) {
+        return (c >= 32 && c <= 126) || c == 9;
     }
 
     /**
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java b/DataModel/src/org/sleuthkit/autopsy/datamodel/FsContentStringStream.java
similarity index 89%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java
rename to DataModel/src/org/sleuthkit/autopsy/datamodel/FsContentStringStream.java
index 984c9eacfa..753d696926 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java
+++ b/DataModel/src/org/sleuthkit/autopsy/datamodel/FsContentStringStream.java
@@ -18,7 +18,7 @@
  */
 
 
-package org.sleuthkit.autopsy.keywordsearch;
+package org.sleuthkit.autopsy.datamodel;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -31,11 +31,20 @@ import org.sleuthkit.datamodel.TskException;
 
 /**
  * FsContent input string stream reader/converter
+ * TODO should be encoding specific and detect UTF8, UTF16LE, UTF16BE
+ * then process remainder of the string using detected encoding  
  */
 public class FsContentStringStream extends InputStream {
 
     public static enum Encoding {
-        ASCII,
+        UTF8 {
+
+            @Override
+            public String toString() {
+                return "UTF-8";
+            }
+            
+        },
     };
     private FsContent content;
     private String encoding;
@@ -51,7 +60,7 @@ public class FsContentStringStream extends InputStream {
     private boolean isEOF = false;
     private boolean stringAtBoundary = false; //if temp has part of string that didn't make it in previous read()
     private static final byte[] oneCharBuf = new byte[1];
-    private final int ASCII_CHARS_MIN = 4; //num. of chars needed to qualify as a char string
+    private final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
     private static final String NLS = Character.toString((char)10); //new line
     private static final Logger logger = Logger.getLogger(FsContentStringStream.class.getName());
 
@@ -99,7 +108,7 @@ public class FsContentStringStream extends InputStream {
             //there could be more to this string in fscontent/buffer
         }
 
-        boolean isZero = false;
+        boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
         int newCurLen = curStringLen + tempStringLen;
         while (newCurLen < len) {
             //need to extract more strings
@@ -134,20 +143,20 @@ public class FsContentStringStream extends InputStream {
             }
             //get char from cur read buf
             char c = (char) curReadBuf[readBufOffset++];
-            if (c == 0 && isZero == false) {
-                //allow to skip one zero
-                isZero = true;
+            if (c == 0 && singleConsecZero == false) {
+                //first zero in a row: tolerate it so the current sequence is kept (a second consecutive zero breaks it)
+                singleConsecZero = true;
             }
             else {
-                isZero = false;
+                singleConsecZero = false;
             }
-            if (isUsableChar(c)) {
+            if (DataConversion.isPrintableAscii(c)) {
                 tempString.append(c);
                 ++tempStringLen;
                 //boundary case handled after the loop
-            } else if (! isZero) {
+            } else if (! singleConsecZero) {
                 //break the string, clear temp
-                if (tempStringLen >= ASCII_CHARS_MIN) {
+                if (tempStringLen >= MIN_PRINTABLE_CHARS) {
                     //append entire temp string
                     tempString.append(NLS);
                     ++tempStringLen;
@@ -166,7 +175,7 @@ public class FsContentStringStream extends InputStream {
         //check if temp still has chars to qualify as a string
         //we might need to break up temp into 2 parts for next read() call
         //consume as many as possible to fill entire user buffer
-        if (tempStringLen >= ASCII_CHARS_MIN) {
+        if (tempStringLen >= MIN_PRINTABLE_CHARS) {
             if (newCurLen > len) {
                 int appendChars = len - curStringLen;
                 //save part for next user read(), need to break up temp string
@@ -225,9 +234,6 @@ public class FsContentStringStream extends InputStream {
         return 0;
     }
 
-    private static boolean isUsableChar(char c) {
-        return c >= 32 && c <= 126 && c != 9;
-    }
 
 
     @Override
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringContentStream.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringContentStream.java
index 20545f838f..6ad8e14d0f 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringContentStream.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringContentStream.java
@@ -24,7 +24,8 @@ import java.io.InputStreamReader;
 import java.io.Reader;
 import java.util.logging.Logger;
 import org.apache.solr.common.util.ContentStream;
-import org.sleuthkit.autopsy.keywordsearch.FsContentStringStream.Encoding;
+import org.sleuthkit.autopsy.datamodel.FsContentStringStream;
+import org.sleuthkit.autopsy.datamodel.FsContentStringStream.Encoding;
 import org.sleuthkit.datamodel.FsContent;
 
 /**
@@ -55,7 +56,7 @@ public class FsContentStringContentStream implements ContentStream {
 
     @Override
     public String getContentType() {
-        return "text/plain; charset = " + encoding.toString();
+        return "text/plain; charset=" + encoding.toString();
     }
 
     @Override
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java
index b67232d7ba..eeba30c02a 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java
@@ -18,6 +18,7 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
+import org.sleuthkit.autopsy.datamodel.FsContentStringStream;
 import java.awt.event.ActionEvent;
 import java.awt.event.ActionListener;
 import java.util.ArrayList;
@@ -417,7 +418,7 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
 
         private boolean extractAndIngest(FsContent f) {
             boolean success = false;
-            FsContentStringContentStream fscs = new FsContentStringContentStream(f, FsContentStringStream.Encoding.ASCII);
+            FsContentStringContentStream fscs = new FsContentStringContentStream(f, FsContentStringStream.Encoding.UTF8);
             try {
                 ingester.ingest(fscs);
                 success = true;