better variable names, fix typo in ascii test, use UTF8 charset in stream, move string stream class

2025-07-09 06:39:33 +00:00 · 2012-05-01 13:43:38 -04:00 · 2012-05-01 13:43:38 -04:00 · ac4cc7c624
commit ac4cc7c624
parent 95097e16d9
4 changed files with 53 additions and 49 deletions
--- a/DataModel/src/org/sleuthkit/autopsy/datamodel/DataConversion.java
+++ b/DataModel/src/org/sleuthkit/autopsy/datamodel/DataConversion.java
@ -119,54 +119,44 @@ public class DataConversion {
     *  -- When looking for ASCII strings, they evaluate each byte and when they find four or more printable characters they get printed out with a newline in between each string.
     *  -- When looking for Unicode strings, they evaluate each two byte sequence and look for four or more printable characters…
     *
-     * @param args          the bytes that the string read from
+     * @param readBuf       the bytes from which the strings are extracted
     * @param len           length of text in the buffer to convert, starting at position 0
-     * @param parameter     the "length" parameter for the string
+     * @param minStringLen     minimum length of consecutive chars to qualify as a string
+     *
+     * TODO should be encoding specific and detect UTF8, UTF16LE, UTF16BE
+     * then process remainder of the string using detected encoding  
     * 
     * @author jantonius
     */
-    public static String getString(byte[] args, int len, int parameter) {
-
-        /*
-        // these encoding might be needed for later
-        // Note: if not used, can be deleted
-        CharsetEncoder asciiEncoder =
-        Charset.forName("US-ASCII").newEncoder(); // or "ISO-8859-1" for ISO Latin 1
-        
-        CharsetEncoder utf8Encoder =
-        Charset.forName("UTF-8").newEncoder();
-         */
+    public static String getString(byte[] readBuf, int len, int minStringLen) {
        final StringBuilder result = new StringBuilder();
        StringBuilder temp = new StringBuilder();
-        int counter = 0;
-        //char[] converted = new java.lang.System.Text.Encoding.ASCII.GetString(args).ToCharArray();
+        int curLen = 0;

        final char NL = (char) 10; // ASCII char for new line
        final String NLS = Character.toString(NL);
-        boolean isZero = false;
+        boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
        for (int i = 0; i < len; i++) {
-            char curChar = (char) args[i];
-
-            if (curChar == 0 && isZero == false) {
-                //allow to skip one 0
-                isZero = true;
+            char curChar = (char) readBuf[i];
+            if (curChar == 0 && singleConsecZero == false) {
+                //tolerate a single zero byte without breaking the current sequence; a second consecutive zero ends it
+                singleConsecZero = true;
            } else {
-                isZero = false;
+                singleConsecZero = false;
            }
            //ignore non-printable ASCII chars
-            //use 32-126 and not TAB ( 9)
-            if (isUsableChar(curChar)) {
+            if (isPrintableAscii(curChar)) {
                temp.append(curChar);
-                ++counter;
-            } else if (!isZero) {
-                if (counter >= parameter) {
+                ++curLen;
+            } else if (!singleConsecZero) {
+                if (curLen >= minStringLen) {
                    // add to the result and also add the new line at the end
                    result.append(temp);
                    result.append(NLS);
                }
-                // reset the temp and counter
+                // reset the temp and curLen
                temp = new StringBuilder();
-                counter = 0;
+                curLen = 0;

            }
        }
@ -175,8 +165,14 @@ public class DataConversion {
        return result.toString();
    }

-    private static boolean isUsableChar(char c) {
-        return c >= 32 && c <= 126 && c != 9;
+    /**
+     * Determine if char is a printable ASCII char,
+     * i.e. in the range [32, 126] or a tab (9)
+     * @param c char to test
+     * @return true if it's a printable char, or false otherwise
+     */
+    public static boolean isPrintableAscii(char c) {
+        return (c >= 32 && c <= 126) || c == 9;
    }

    /**
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java
@ -18,7 +18,7 @@
 */


-package org.sleuthkit.autopsy.keywordsearch;
+package org.sleuthkit.autopsy.datamodel;

 import java.io.IOException;
 import java.io.InputStream;
@ -31,11 +31,20 @@ import org.sleuthkit.datamodel.TskException;

 /**
 * FsContent input string stream reader/converter
+ * TODO should be encoding specific and detect UTF8, UTF16LE, UTF16BE
+ * then process remainder of the string using detected encoding  
 */
 public class FsContentStringStream extends InputStream {

    public static enum Encoding {
-        ASCII,
+        UTF8 {
+
+            @Override
+            public String toString() {
+                return "UTF-8";
+            }
+            
+        },
    };
    private FsContent content;
    private String encoding;
@ -51,7 +60,7 @@ public class FsContentStringStream extends InputStream {
    private boolean isEOF = false;
    private boolean stringAtBoundary = false; //if temp has part of string that didn't make it in previous read()
    private static final byte[] oneCharBuf = new byte[1];
-    private final int ASCII_CHARS_MIN = 4; //num. of chars needed to qualify as a char string
+    private final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
    private static final String NLS = Character.toString((char)10); //new line
    private static final Logger logger = Logger.getLogger(FsContentStringStream.class.getName());

@ -99,7 +108,7 @@ public class FsContentStringStream extends InputStream {
            //there could be more to this string in fscontent/buffer
        }

-        boolean isZero = false;
+        boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
        int newCurLen = curStringLen + tempStringLen;
        while (newCurLen < len) {
            //need to extract more strings
@ -134,20 +143,20 @@ public class FsContentStringStream extends InputStream {
            }
            //get char from cur read buf
            char c = (char) curReadBuf[readBufOffset++];
-            if (c == 0 && isZero == false) {
-                //allow to skip one zero
-                isZero = true;
+            if (c == 0 && singleConsecZero == false) {
+                //tolerate a single zero byte without breaking the current sequence; a second consecutive zero ends it
+                singleConsecZero = true;
            }
            else {
-                isZero = false;
+                singleConsecZero = false;
            }
-            if (isUsableChar(c)) {
+            if (DataConversion.isPrintableAscii(c)) {
                tempString.append(c);
                ++tempStringLen;
                //boundary case handled after the loop
-            } else if (! isZero) {
+            } else if (! singleConsecZero) {
                //break the string, clear temp
-                if (tempStringLen >= ASCII_CHARS_MIN) {
+                if (tempStringLen >= MIN_PRINTABLE_CHARS) {
                    //append entire temp string
                    tempString.append(NLS);
                    ++tempStringLen;
@ -166,7 +175,7 @@ public class FsContentStringStream extends InputStream {
        //check if temp still has chars to qualify as a string
        //we might need to break up temp into 2 parts for next read() call
        //consume as many as possible to fill entire user buffer
-        if (tempStringLen >= ASCII_CHARS_MIN) {
+        if (tempStringLen >= MIN_PRINTABLE_CHARS) {
            if (newCurLen > len) {
                int appendChars = len - curStringLen;
                //save part for next user read(), need to break up temp string
@ -225,9 +234,6 @@ public class FsContentStringStream extends InputStream {
        return 0;
    }

-    private static boolean isUsableChar(char c) {
-        return c >= 32 && c <= 126 && c != 9;
-    }


    @Override
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringContentStream.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringContentStream.java
@ -24,7 +24,8 @@ import java.io.InputStreamReader;
 import java.io.Reader;
 import java.util.logging.Logger;
 import org.apache.solr.common.util.ContentStream;
-import org.sleuthkit.autopsy.keywordsearch.FsContentStringStream.Encoding;
+import org.sleuthkit.autopsy.datamodel.FsContentStringStream;
+import org.sleuthkit.autopsy.datamodel.FsContentStringStream.Encoding;
 import org.sleuthkit.datamodel.FsContent;

 /**
@ -55,7 +56,7 @@ public class FsContentStringContentStream implements ContentStream {

    @Override
    public String getContentType() {
-        return "text/plain; charset = " + encoding.toString();
+        return "text/plain; charset=" + encoding.toString();
    }

    @Override
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java
@ -18,6 +18,7 @@
 */
 package org.sleuthkit.autopsy.keywordsearch;

+import org.sleuthkit.autopsy.datamodel.FsContentStringStream;
 import java.awt.event.ActionEvent;
 import java.awt.event.ActionListener;
 import java.util.ArrayList;
@ -417,7 +418,7 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent

        private boolean extractAndIngest(FsContent f) {
            boolean success = false;
-            FsContentStringContentStream fscs = new FsContentStringContentStream(f, FsContentStringStream.Encoding.ASCII);
+            FsContentStringContentStream fscs = new FsContentStringContentStream(f, FsContentStringStream.Encoding.UTF8);
            try {
                ingester.ingest(fscs);
                success = true;