- Process files with mail extensions normally with Tika first

- FsContentStringStream tweak
This commit is contained in:
adam-m 2012-05-01 23:08:35 -04:00
parent b7806a42ea
commit 2dcc8121a0
2 changed files with 26 additions and 11 deletions

View File

@ -55,7 +55,9 @@ public class FsContentStringStream extends InputStream {
private StringBuilder tempString = new StringBuilder();
private int tempStringLen = 0;
private boolean isEOF = false;
private boolean stringAtBoundary = false; //if temp has part of string that didn't make it in previous read()
private boolean stringAtTempBoundary = false; //if temp has part of string that didn't make it in previous read()
private boolean stringAtBufBoundary = false; //if continue string from prev read
private boolean inString = false; //if current temp has min chars required
private static final byte[] oneCharBuf = new byte[1];
private final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
private static final String NLS = Character.toString((char) 10); //new line
@ -72,8 +74,6 @@ public class FsContentStringStream extends InputStream {
//logger.log(Level.INFO, "FILE: " + content.getParentPath() + "/" + content.getName());
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
if (b == null) {
@ -93,7 +93,7 @@ public class FsContentStringStream extends InputStream {
return -1;
}
if (stringAtBoundary) {
if (stringAtTempBoundary) {
//append entire temp string residual from previous read()
//because qualified string was broken down into 2 parts
curString.append(tempString);
@ -103,7 +103,7 @@ public class FsContentStringStream extends InputStream {
tempString = new StringBuilder();
tempStringLen = 0;
stringAtBoundary = false;
stringAtTempBoundary = false;
//there could be more to this string in fscontent/buffer
}
@ -154,16 +154,23 @@ public class FsContentStringStream extends InputStream {
if (DataConversion.isPrintableAscii(c)) {
tempString.append(c);
++tempStringLen;
//boundary case handled after the loop
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
inString = true;
}
//boundary case when temp has still chars - handled after the loop
} else if (!singleConsecZero) {
//break the string, clear temp
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
//append entire temp string
if (tempStringLen >= MIN_PRINTABLE_CHARS
|| stringAtBufBoundary) {
//append entire temp string with new line
tempString.append(NLS);
++tempStringLen;
curString.append(tempString);
curStringLen += tempStringLen;
stringAtBufBoundary = false;
}
//reset temp
tempString = new StringBuilder();
@ -173,6 +180,13 @@ public class FsContentStringStream extends InputStream {
newCurLen = curStringLen + tempStringLen;
}
//check if still in string state, so that next chars in read buf bypass min chars check
//and qualify as a string even if fewer than the min chars required
if (inString) {
inString = false; //reset
stringAtBufBoundary = true; //will bypass the check
}
//check if temp still has chars to qualify as a string
//we might need to break up temp into 2 parts for next read() call
//consume as many as possible to fill entire user buffer
@ -190,7 +204,7 @@ public class FsContentStringStream extends InputStream {
tempString = new StringBuilder(newTemp);
tempStringLen = newTemp.length();
stringAtBoundary = true;
stringAtTempBoundary = true;
} else {
//append entire temp
@ -213,7 +227,7 @@ public class FsContentStringStream extends InputStream {
return copied;
}
//append temp buffer to cur string buffer and reset temp, if enough chars
//does not append new line
private void appendResetTemp() {
@ -233,6 +247,7 @@ public class FsContentStringStream extends InputStream {
//logger.log(Level.INFO, curStringS);
byte[] stringBytes = curStringS.getBytes(encoding);
System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
//logger.log(Level.INFO, curStringS);
//copied all string, reset
curString = new StringBuilder();
int ret = curStringLen;

View File

@ -55,7 +55,7 @@ class Ingester {
static final String[] ingestibleExtensions = {"tar", "jar", "zip", "gzip", "bzip2",
"gz", "tgz", "odf", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt", "log", "manifest",
"bmp", "gif", "png", "jpeg", "tiff", "mp3", "aiff", "au", "midi", "wav",
"pst", "xml", "class", "dwg"};
"pst", "xml", "class", "dwg", "eml", "emlx", "mbox", "mht"};
Ingester() {
}