better variable names, fix typo in ascii test, use UTF8 charset in stream, move string stream class

This commit is contained in:
adam-m 2012-05-01 13:43:38 -04:00
parent 95097e16d9
commit ac4cc7c624
4 changed files with 53 additions and 49 deletions

View File

@ -119,54 +119,44 @@ public class DataConversion {
* -- When looking for ASCII strings, they evaluate each byte and when they find four or more printable characters they get printed out with a newline in between each string. * -- When looking for ASCII strings, they evaluate each byte and when they find four or more printable characters they get printed out with a newline in between each string.
* -- When looking for Unicode strings, they evaluate each two byte sequence and look for four or more printable characters * -- When looking for Unicode strings, they evaluate each two byte sequence and look for four or more printable characters
* *
* @param args the bytes that the string is read from * @param readBuf the bytes that the string is read from
* @param len length of text in the buffer to convert, starting at position 0 * @param len length of text in the buffer to convert, starting at position 0
* @param parameter the "length" parameter for the string * @param minStringLen minimum length of consecutive chars to qualify as a string
* *
* TODO should be encoding specific and detect UTF8, UTF16LE, UTF16BE
* then process remainder of the string using detected encoding
*
* @author jantonius * @author jantonius
*/ */
public static String getString(byte[] args, int len, int parameter) { public static String getString(byte[] readBuf, int len, int minStringLen) {
/*
// these encoding might be needed for later
// Note: if not used, can be deleted
CharsetEncoder asciiEncoder =
Charset.forName("US-ASCII").newEncoder(); // or "ISO-8859-1" for ISO Latin 1
CharsetEncoder utf8Encoder =
Charset.forName("UTF-8").newEncoder();
*/
final StringBuilder result = new StringBuilder(); final StringBuilder result = new StringBuilder();
StringBuilder temp = new StringBuilder(); StringBuilder temp = new StringBuilder();
int counter = 0; int curLen = 0;
//char[] converted = new java.lang.System.Text.Encoding.ASCII.GetString(args).ToCharArray();
final char NL = (char) 10; // ASCII char for new line final char NL = (char) 10; // ASCII char for new line
final String NLS = Character.toString(NL); final String NLS = Character.toString(NL);
boolean isZero = false; boolean singleConsecZero = false; //set after one zero char: a lone zero does not break the current sequence of chars, a second consecutive zero does
for (int i = 0; i < len; i++) { for (int i = 0; i < len; i++) {
char curChar = (char) args[i]; char curChar = (char) readBuf[i];
if (curChar == 0 && singleConsecZero == false) {
if (curChar == 0 && isZero == false) { //preserve the current sequence if max consec. 1 zero char
//allow to skip one 0 singleConsecZero = true;
isZero = true;
} else { } else {
isZero = false; singleConsecZero = false;
} }
//ignore non-printable ASCII chars //ignore non-printable ASCII chars
//use 32-126 and not TAB ( 9) if (isPrintableAscii(curChar)) {
if (isUsableChar(curChar)) {
temp.append(curChar); temp.append(curChar);
++counter; ++curLen;
} else if (!isZero) { } else if (!singleConsecZero) {
if (counter >= parameter) { if (curLen >= minStringLen) {
// add to the result and also add the new line at the end // add to the result and also add the new line at the end
result.append(temp); result.append(temp);
result.append(NLS); result.append(NLS);
} }
// reset the temp and counter // reset the temp and curLen
temp = new StringBuilder(); temp = new StringBuilder();
counter = 0; curLen = 0;
} }
} }
@ -175,8 +165,14 @@ public class DataConversion {
return result.toString(); return result.toString();
} }
private static boolean isUsableChar(char c) { /**
return c >= 32 && c <= 126 && c != 9; * Determine if char is a printable ASCII char
* in the range 32..126 (inclusive), or a tab (9)
* @param c char to test
* @return true if it's a printable char, or false otherwise
*/
public static boolean isPrintableAscii(char c) {
return (c >= 32 && c <= 126) || c == 9;
} }
/** /**

View File

@ -18,7 +18,7 @@
*/ */
package org.sleuthkit.autopsy.keywordsearch; package org.sleuthkit.autopsy.datamodel;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
@ -31,11 +31,20 @@ import org.sleuthkit.datamodel.TskException;
/** /**
* FsContent input string stream reader/converter * FsContent input string stream reader/converter
* TODO should be encoding specific and detect UTF8, UTF16LE, UTF16BE
* then process remainder of the string using detected encoding
*/ */
public class FsContentStringStream extends InputStream { public class FsContentStringStream extends InputStream {
public static enum Encoding { public static enum Encoding {
ASCII, UTF8 {
@Override
public String toString() {
return "UTF-8";
}
},
}; };
private FsContent content; private FsContent content;
private String encoding; private String encoding;
@ -51,7 +60,7 @@ public class FsContentStringStream extends InputStream {
private boolean isEOF = false; private boolean isEOF = false;
private boolean stringAtBoundary = false; //if temp has part of string that didn't make it in previous read() private boolean stringAtBoundary = false; //if temp has part of string that didn't make it in previous read()
private static final byte[] oneCharBuf = new byte[1]; private static final byte[] oneCharBuf = new byte[1];
private final int ASCII_CHARS_MIN = 4; //num. of chars needed to qualify as a char string private final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
private static final String NLS = Character.toString((char)10); //new line private static final String NLS = Character.toString((char)10); //new line
private static final Logger logger = Logger.getLogger(FsContentStringStream.class.getName()); private static final Logger logger = Logger.getLogger(FsContentStringStream.class.getName());
@ -99,7 +108,7 @@ public class FsContentStringStream extends InputStream {
//there could be more to this string in fscontent/buffer //there could be more to this string in fscontent/buffer
} }
boolean isZero = false; boolean singleConsecZero = false; //set after one zero char: a lone zero does not break the current sequence of chars, a second consecutive zero does
int newCurLen = curStringLen + tempStringLen; int newCurLen = curStringLen + tempStringLen;
while (newCurLen < len) { while (newCurLen < len) {
//need to extract more strings //need to extract more strings
@ -134,20 +143,20 @@ public class FsContentStringStream extends InputStream {
} }
//get char from cur read buf //get char from cur read buf
char c = (char) curReadBuf[readBufOffset++]; char c = (char) curReadBuf[readBufOffset++];
if (c == 0 && isZero == false) { if (c == 0 && singleConsecZero == false) {
//allow to skip one zero //preserve the current sequence if max consec. 1 zero char
isZero = true; singleConsecZero = true;
} }
else { else {
isZero = false; singleConsecZero = false;
} }
if (isUsableChar(c)) { if (DataConversion.isPrintableAscii(c)) {
tempString.append(c); tempString.append(c);
++tempStringLen; ++tempStringLen;
//boundary case handled after the loop //boundary case handled after the loop
} else if (! isZero) { } else if (! singleConsecZero) {
//break the string, clear temp //break the string, clear temp
if (tempStringLen >= ASCII_CHARS_MIN) { if (tempStringLen >= MIN_PRINTABLE_CHARS) {
//append entire temp string //append entire temp string
tempString.append(NLS); tempString.append(NLS);
++tempStringLen; ++tempStringLen;
@ -166,7 +175,7 @@ public class FsContentStringStream extends InputStream {
//check if temp still has chars to qualify as a string //check if temp still has chars to qualify as a string
//we might need to break up temp into 2 parts for next read() call //we might need to break up temp into 2 parts for next read() call
//consume as many as possible to fill entire user buffer //consume as many as possible to fill entire user buffer
if (tempStringLen >= ASCII_CHARS_MIN) { if (tempStringLen >= MIN_PRINTABLE_CHARS) {
if (newCurLen > len) { if (newCurLen > len) {
int appendChars = len - curStringLen; int appendChars = len - curStringLen;
//save part for next user read(), need to break up temp string //save part for next user read(), need to break up temp string
@ -225,9 +234,6 @@ public class FsContentStringStream extends InputStream {
return 0; return 0;
} }
private static boolean isUsableChar(char c) {
return c >= 32 && c <= 126 && c != 9;
}
@Override @Override

View File

@ -24,7 +24,8 @@ import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.util.logging.Logger; import java.util.logging.Logger;
import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.autopsy.keywordsearch.FsContentStringStream.Encoding; import org.sleuthkit.autopsy.datamodel.FsContentStringStream;
import org.sleuthkit.autopsy.datamodel.FsContentStringStream.Encoding;
import org.sleuthkit.datamodel.FsContent; import org.sleuthkit.datamodel.FsContent;
/** /**
@ -55,7 +56,7 @@ public class FsContentStringContentStream implements ContentStream {
@Override @Override
public String getContentType() { public String getContentType() {
return "text/plain; charset = " + encoding.toString(); return "text/plain; charset=" + encoding.toString();
} }
@Override @Override

View File

@ -18,6 +18,7 @@
*/ */
package org.sleuthkit.autopsy.keywordsearch; package org.sleuthkit.autopsy.keywordsearch;
import org.sleuthkit.autopsy.datamodel.FsContentStringStream;
import java.awt.event.ActionEvent; import java.awt.event.ActionEvent;
import java.awt.event.ActionListener; import java.awt.event.ActionListener;
import java.util.ArrayList; import java.util.ArrayList;
@ -417,7 +418,7 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
private boolean extractAndIngest(FsContent f) { private boolean extractAndIngest(FsContent f) {
boolean success = false; boolean success = false;
FsContentStringContentStream fscs = new FsContentStringContentStream(f, FsContentStringStream.Encoding.ASCII); FsContentStringContentStream fscs = new FsContentStringContentStream(f, FsContentStringStream.Encoding.UTF8);
try { try {
ingester.ingest(fscs); ingester.ingest(fscs);
success = true; success = true;