better variable names, fix typo in ascii test, use UTF8 charset in stream, move string stream class

This commit is contained in:
adam-m 2012-05-01 13:43:38 -04:00
parent 95097e16d9
commit ac4cc7c624
4 changed files with 53 additions and 49 deletions

View File

@ -119,54 +119,44 @@ public class DataConversion {
* -- When looking for ASCII strings, they evaluate each byte and when they find four or more printable characters they get printed out with a newline in between each string.
* -- When looking for Unicode strings, they evaluate each two byte sequence and look for four or more printable characters
*
* @param args the bytes that the string read from
* @param readBuf the bytes that the string is read from
* @param len length of text in the buffer to convert, starting at position 0
* @param parameter the "length" parameter for the string
* @param minStringLen minimum length of consecutive chars to qualify as a string
*
* TODO should be encoding specific and detect UTF8, UTF16LE, UTF16BE
* then process remainder of the string using detected encoding
*
* @author jantonius
*/
public static String getString(byte[] args, int len, int parameter) {
/*
// these encoding might be needed for later
// Note: if not used, can be deleted
CharsetEncoder asciiEncoder =
Charset.forName("US-ASCII").newEncoder(); // or "ISO-8859-1" for ISO Latin 1
CharsetEncoder utf8Encoder =
Charset.forName("UTF-8").newEncoder();
*/
public static String getString(byte[] readBuf, int len, int minStringLen) {
final StringBuilder result = new StringBuilder();
StringBuilder temp = new StringBuilder();
int counter = 0;
//char[] converted = new java.lang.System.Text.Encoding.ASCII.GetString(args).ToCharArray();
int curLen = 0;
final char NL = (char) 10; // ASCII char for new line
final String NLS = Character.toString(NL);
boolean isZero = false;
boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
for (int i = 0; i < len; i++) {
char curChar = (char) args[i];
if (curChar == 0 && isZero == false) {
//allow to skip one 0
isZero = true;
char curChar = (char) readBuf[i];
if (curChar == 0 && singleConsecZero == false) {
//preserve the current sequence if max consec. 1 zero char
singleConsecZero = true;
} else {
isZero = false;
singleConsecZero = false;
}
//ignore non-printable ASCII chars
//printable means 32-126 or TAB (9)
if (isUsableChar(curChar)) {
if (isPrintableAscii(curChar)) {
temp.append(curChar);
++counter;
} else if (!isZero) {
if (counter >= parameter) {
++curLen;
} else if (!singleConsecZero) {
if (curLen >= minStringLen) {
// add to the result and also add the new line at the end
result.append(temp);
result.append(NLS);
}
// reset the temp and counter
// reset the temp and curLen
temp = new StringBuilder();
counter = 0;
curLen = 0;
}
}
@ -175,8 +165,14 @@ public class DataConversion {
return result.toString();
}
private static boolean isUsableChar(char c) {
return c >= 32 && c <= 126 && c != 9;
/**
 * Tests whether a char counts as printable ASCII.
 * A char qualifies when it is a tab (code 9) or when its code lies
 * in the inclusive range 32 through 126.
 *
 * @param c char to test
 * @return true if c is a tab or falls within 32-126, false otherwise
 */
public static boolean isPrintableAscii(char c) {
    // tab sits outside the 32-126 range, so it is checked on its own
    if (c == 9) {
        return true;
    }
    return 32 <= c && c <= 126;
}
/**

View File

@ -18,7 +18,7 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
package org.sleuthkit.autopsy.datamodel;
import java.io.IOException;
import java.io.InputStream;
@ -31,11 +31,20 @@ import org.sleuthkit.datamodel.TskException;
/**
* FsContent input string stream reader/converter
* TODO should be encoding specific and detect UTF8, UTF16LE, UTF16BE
* then process remainder of the string using detected encoding
*/
public class FsContentStringStream extends InputStream {
/**
 * Encoding constants declared by this stream class.
 * toString() returns the label attached to each constant:
 * "ASCII" for ASCII and "UTF-8" for UTF8.
 */
public static enum Encoding {

    ASCII("ASCII"),
    UTF8("UTF-8");

    // text returned by toString() for this constant
    private final String label;

    private Encoding(String label) {
        this.label = label;
    }

    @Override
    public String toString() {
        return label;
    }
}
private FsContent content;
private String encoding;
@ -51,7 +60,7 @@ public class FsContentStringStream extends InputStream {
private boolean isEOF = false;
private boolean stringAtBoundary = false; //if temp has part of string that didn't make it in previous read()
private static final byte[] oneCharBuf = new byte[1];
private final int ASCII_CHARS_MIN = 4; //num. of chars needed to qualify as a char string
private final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
private static final String NLS = Character.toString((char)10); //new line
private static final Logger logger = Logger.getLogger(FsContentStringStream.class.getName());
@ -99,7 +108,7 @@ public class FsContentStringStream extends InputStream {
//there could be more to this string in fscontent/buffer
}
boolean isZero = false;
boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
int newCurLen = curStringLen + tempStringLen;
while (newCurLen < len) {
//need to extract more strings
@ -134,20 +143,20 @@ public class FsContentStringStream extends InputStream {
}
//get char from cur read buf
char c = (char) curReadBuf[readBufOffset++];
if (c == 0 && isZero == false) {
//allow to skip one zero
isZero = true;
if (c == 0 && singleConsecZero == false) {
//preserve the current sequence if max consec. 1 zero char
singleConsecZero = true;
}
else {
isZero = false;
singleConsecZero = false;
}
if (isUsableChar(c)) {
if (DataConversion.isPrintableAscii(c)) {
tempString.append(c);
++tempStringLen;
//boundary case handled after the loop
} else if (! isZero) {
} else if (! singleConsecZero) {
//break the string, clear temp
if (tempStringLen >= ASCII_CHARS_MIN) {
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
//append entire temp string
tempString.append(NLS);
++tempStringLen;
@ -166,7 +175,7 @@ public class FsContentStringStream extends InputStream {
//check if temp still has chars to qualify as a string
//we might need to break up temp into 2 parts for next read() call
//consume as many as possible to fill entire user buffer
if (tempStringLen >= ASCII_CHARS_MIN) {
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
if (newCurLen > len) {
int appendChars = len - curStringLen;
//save part for next user read(), need to break up temp string
@ -225,9 +234,6 @@ public class FsContentStringStream extends InputStream {
return 0;
}
private static boolean isUsableChar(char c) {
return c >= 32 && c <= 126 && c != 9;
}
@Override

View File

@ -24,7 +24,8 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.util.logging.Logger;
import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.autopsy.keywordsearch.FsContentStringStream.Encoding;
import org.sleuthkit.autopsy.datamodel.FsContentStringStream;
import org.sleuthkit.autopsy.datamodel.FsContentStringStream.Encoding;
import org.sleuthkit.datamodel.FsContent;
/**

View File

@ -18,6 +18,7 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import org.sleuthkit.autopsy.datamodel.FsContentStringStream;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.util.ArrayList;
@ -417,7 +418,7 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
private boolean extractAndIngest(FsContent f) {
boolean success = false;
FsContentStringContentStream fscs = new FsContentStringContentStream(f, FsContentStringStream.Encoding.ASCII);
FsContentStringContentStream fscs = new FsContentStringContentStream(f, FsContentStringStream.Encoding.UTF8);
try {
ingester.ingest(fscs);
success = true;