Merge remote-tracking branch 'upstream/develop' into mt-3rdparty

commit 6b648510cd
Author: Samuel H. Kenyon
Date:   2014-04-16 14:21:12 -04:00
9 changed files with 60 additions and 65 deletions

IngestManager.java

@@ -122,7 +122,7 @@ public class IngestManager {
     *
     * @return True if any ingest jobs are in progress, false otherwise
     */
-    public boolean isIngestRunning() {
+    public synchronized boolean isIngestRunning() {
        return (ingestJobs.isEmpty() == false);
    }
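Note: the only functional change in IngestManager is marking isIngestRunning() synchronized, so the emptiness check on ingestJobs runs under the same monitor as the methods that mutate that collection. A minimal sketch of the pattern, using a hypothetical class rather than the Autopsy code:

    import java.util.HashMap;
    import java.util.Map;

    // Hypothetical stand-in for IngestManager: reads of a plain HashMap must
    // hold the same lock as the writes, or they can observe inconsistent state.
    public class JobTracker {
        private final Map<Long, String> jobs = new HashMap<>();

        public synchronized void startJob(long id, String name) {
            jobs.put(id, name);
        }

        public synchronized void finishJob(long id) {
            jobs.remove(id);
        }

        // Without 'synchronized' this could race with startJob/finishJob.
        public synchronized boolean isAnyJobRunning() {
            return !jobs.isEmpty();
        }

        public static void main(String[] args) {
            JobTracker t = new JobTracker();
            t.startJob(1L, "ingest");
            System.out.println(t.isAnyJobRunning()); // true
            t.finishJob(1L);
            System.out.println(t.isAnyJobRunning()); // false
        }
    }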

AbstractFileHtmlExtract.java

@@ -40,17 +40,18 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 class AbstractFileHtmlExtract implements AbstractFileExtract {
     private static final Logger logger = Logger.getLogger(AbstractFileHtmlExtract.class.getName());
-    private static Ingester ingester;
     static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET;
     static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
     private static final int SINGLE_READ_CHARS = 1024;
     private static final int EXTRA_CHARS = 128; //for whitespace
-    private static final char[] TEXT_CHUNK_BUF = new char[MAX_EXTR_TEXT_CHARS];
     private static final int MAX_SIZE = 50000000;
+    //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
+    private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
     private KeywordSearchIngestModule module;
+    private Ingester ingester;
     private AbstractFile sourceFile;
     private int numChunks = 0;
-    //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
     static final List<String> WEB_MIME_TYPES = Arrays.asList(
             "application/javascript",
             "application/xhtml+xml",
@@ -98,7 +99,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
     @Override
     public boolean index(AbstractFile sourceFile) throws IngesterException {
         this.sourceFile = sourceFile;
-        this.numChunks = 0; //unknown until indexing is done
+        numChunks = 0; //unknown until indexing is done
         boolean success = false;
         Reader reader = null;
@@ -122,12 +123,12 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
             long totalRead = 0;
             boolean eof = false;
             //we read max 1024 chars at time, this seems to max what this Reader would return
-            while (!eof && (readSize = reader.read(TEXT_CHUNK_BUF, 0, SINGLE_READ_CHARS)) != -1) {
+            while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
                 totalRead += readSize;
                 //consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
                 while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
-                        && (readSize = reader.read(TEXT_CHUNK_BUF, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
+                        && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
                     totalRead += readSize;
                 }
                 if (readSize == -1) {
@@ -136,8 +137,8 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
                 } else {
                     //try to read until whitespace to not break words
                     while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
-                            && !Character.isWhitespace(TEXT_CHUNK_BUF[(int) totalRead - 1])
-                            && (readSize = reader.read(TEXT_CHUNK_BUF, (int) totalRead, 1)) != -1) {
+                            && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
+                            && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
                         totalRead += readSize;
                     }
                     if (readSize == -1) {
@@ -156,9 +157,9 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
                 //inject BOM here (saves byte buffer realloc later), will be converted to specific encoding BOM
                 //sb.append(UTF16BOM); disabled BOM, not needing as bypassing Tika
                 if (totalRead < MAX_EXTR_TEXT_CHARS) {
-                    sb.append(TEXT_CHUNK_BUF, 0, (int) totalRead);
+                    sb.append(textChunkBuf, 0, (int) totalRead);
                 } else {
-                    sb.append(TEXT_CHUNK_BUF);
+                    sb.append(textChunkBuf);
                 }
                 //reset for next chunk
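Note: the theme of this file's changes is thread confinement. The static TEXT_CHUNK_BUF, shared by every extractor instance, becomes the per-instance textChunkBuf (and the static ingester becomes an instance field), so extractors running on different ingest threads stop writing into the same array. A minimal sketch of the refactor, with hypothetical names:

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;

    // Hypothetical extractor illustrating the static-to-instance buffer move.
    class ChunkedExtractor {
        private static final int MAX_CHARS = 8 * 1024;
        // before the fix this was effectively:
        //   private static final char[] TEXT_CHUNK_BUF = new char[MAX_CHARS];
        // shared by all instances, hence safe only under a single thread
        private final char[] textChunkBuf = new char[MAX_CHARS];

        int fillChunk(Reader reader) throws IOException {
            int total = 0;
            int read;
            while (total < MAX_CHARS
                    && (read = reader.read(textChunkBuf, total, MAX_CHARS - total)) != -1) {
                total += read;
            }
            return total; // chars now sit in this instance's private buffer
        }

        public static void main(String[] args) throws IOException {
            // two instances can now run on two threads without clobbering each other
            System.out.println(new ChunkedExtractor().fillChunk(new StringReader("hello world")));
        }
    }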

AbstractFileStringExtract.java

@@ -37,33 +37,31 @@ import org.sleuthkit.datamodel.AbstractFile;
 */
 class AbstractFileStringExtract implements AbstractFileExtract {
-    private KeywordSearchIngestModule module;
-    private Ingester ingester;
-    private int numChunks;
+    private static Ingester ingester;
     private static final Logger logger = Logger.getLogger(AbstractFileStringExtract.class.getName());
-    static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
-    private AbstractFile sourceFile;
-    //single static buffer for all extractions. Safe, indexing can only happen in one thread
-    private static final byte[] STRING_CHUNK_BUF = new byte[(int) MAX_STRING_CHUNK_SIZE];
+    private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
     //private static final int BOM_LEN = 3;
     private static final int BOM_LEN = 0; //disabled prepending of BOM
     private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
     private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
+    private final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
+    private KeywordSearchIngestModule module;
+    private AbstractFile sourceFile;
+    private int numChunks = 0;
     private final List<SCRIPT> extractScripts = new ArrayList<SCRIPT>();
     private Map<String, String> extractOptions = new HashMap<String, String>();
     //disabled prepending of BOM
     //static {
     //prepend UTF-8 BOM to start of the buffer
-    //STRING_CHUNK_BUF[0] = (byte) 0xEF;
-    //STRING_CHUNK_BUF[1] = (byte) 0xBB;
-    //STRING_CHUNK_BUF[2] = (byte) 0xBF;
+    //stringChunkBuf[0] = (byte) 0xEF;
+    //stringChunkBuf[1] = (byte) 0xBB;
+    //stringChunkBuf[2] = (byte) 0xBF;
     //}
     public AbstractFileStringExtract(KeywordSearchIngestModule module) {
         this.module = module;
-        this.ingester = Server.getIngester();
-        this.extractScripts.add(DEFAULT_SCRIPT);
+        ingester = Server.getIngester();
+        extractScripts.add(DEFAULT_SCRIPT);
     }
     @Override
@@ -132,14 +130,14 @@ class AbstractFileStringExtract implements AbstractFileExtract {
             //break input stream into chunks
             long readSize = 0;
-            while ((readSize = stringStream.read(STRING_CHUNK_BUF, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
+            while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
                 //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
-                //debug.write(STRING_CHUNK_BUF, 0, (int)readSize);
+                //debug.write(stringChunkBuf, 0, (int)readSize);
                 AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
                 try {
-                    chunk.index(ingester, STRING_CHUNK_BUF, readSize + BOM_LEN, INDEX_CHARSET);
+                    chunk.index(ingester, stringChunkBuf, readSize + BOM_LEN, INDEX_CHARSET);
                     ++this.numChunks;
                 } catch (IngesterException ingEx) {
                     success = false;
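Note: same confinement fix here (STRING_CHUNK_BUF to the per-instance stringChunkBuf), while ingester goes the other way, to a static field shared by all string extractors. The chunking loop itself is unchanged: it reserves BOM_LEN bytes at the front of the buffer (currently 0, since BOM prepending is disabled) and hands each filled chunk to the ingester. A rough sketch of that loop with stand-in types:

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    // Hypothetical chunker mirroring the loop above; indexChunk() stands in
    // for AbstractFileChunk.index(...).
    class StringChunker {
        private static final int BOM_LEN = 0;    // BOM prefix disabled, as above
        private static final int CHUNK_SIZE = 4096;
        private final byte[] stringChunkBuf = new byte[CHUNK_SIZE]; // per-instance
        private int numChunks = 0;

        void chunkAll(InputStream stringStream) throws IOException {
            long readSize;
            // leave room for an optional BOM prefix at the start of the buffer
            while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, CHUNK_SIZE - BOM_LEN)) != -1) {
                indexChunk(stringChunkBuf, readSize + BOM_LEN, ++numChunks);
            }
        }

        private void indexChunk(byte[] buf, long len, int seq) {
            System.out.println("chunk " + seq + ": " + len + " bytes");
        }

        public static void main(String[] args) throws IOException {
            new StringChunker().chunkAll(new ByteArrayInputStream(new byte[10_000]));
        }
    }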

AbstractFileStringIntStream.java

@@ -34,15 +34,14 @@ import org.sleuthkit.datamodel.TskCoreException;
 * object, extract international strings from the file and read output as a
 * stream of UTF-8 strings as encoded bytes.
 *
- * Currently not-thread safe (reusing static buffers for efficiency)
 */
 class AbstractFileStringIntStream extends InputStream {
+    private static final Logger logger = Logger.getLogger(AbstractFileStringIntStream.class.getName());
+    private static final int FILE_BUF_SIZE = 1024 * 1024;
     private AbstractFile content;
     private final byte[] oneCharBuf = new byte[1];
     private final StringExtract stringExtractor;
-    private static final int FILE_BUF_SIZE = 1024 * 1024;
-    private static final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
+    private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
+    //NOTE: need to run all stream extraction in same thread
     private long fileReadOffset = 0L;
     private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user
     private int convertBuffOffset = 0; //offset to start returning data to user on next read()
@@ -51,7 +50,7 @@ import org.sleuthkit.datamodel.TskCoreException;
     private boolean extractUTF8;
     private boolean extractUTF16;
     private Charset outCharset;
-    private static final Logger logger = Logger.getLogger(AbstractFileStringIntStream.class.getName());
     private StringExtractResult lastExtractResult;
     /**

AbstractFileStringStream.java

@@ -42,9 +42,11 @@ import org.sleuthkit.datamodel.TskException;
     private AbstractFile content;
     private Charset outputCharset;
     //internal data
-    private long contentOffset = 0; //offset in fscontent read into curReadBuf
+    private static final Logger logger = Logger.getLogger(AbstractFileStringStream.class.getName());
+    private static final String NLS = Character.toString((char) 10); //new line
     private static final int READ_BUF_SIZE = 256;
-    private static final byte[] curReadBuf = new byte[READ_BUF_SIZE];
+    private long contentOffset = 0; //offset in fscontent read into curReadBuf
+    private final byte[] curReadBuf = new byte[READ_BUF_SIZE];
     private int bytesInReadBuf = 0;
     private int readBufOffset = 0; //offset in read buf processed
     private StringBuilder curString = new StringBuilder();
@@ -55,10 +57,8 @@ import org.sleuthkit.datamodel.TskException;
     private boolean stringAtTempBoundary = false; //if temp has part of string that didn't make it in previous read()
     private boolean stringAtBufBoundary = false; //if read buffer has string being processed, continue as string from prev read() in next read()
     private boolean inString = false; //if current temp has min chars required
-    private static final byte[] oneCharBuf = new byte[1];
+    private final byte[] oneCharBuf = new byte[1];
     private final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
-    private static final String NLS = Character.toString((char) 10); //new line
-    private static final Logger logger = Logger.getLogger(AbstractFileStringStream.class.getName());
    /**
     * Construct new string stream from FsContent
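Note: both stream classes get the same treatment as the extractors. Scratch state such as fileReadBuff, curReadBuf, and oneCharBuf moves from static to instance fields (while true constants like NLS and the loggers are hoisted to the top as static finals), so each stream's buffers belong to whoever owns that stream. A self-contained sketch of the idea (hypothetical class, not the Autopsy streams):

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    // An InputStream whose scratch buffer is static can serve only one thread
    // at a time; an instance buffer confines the state to each stream object.
    class OneByteAdapterStream extends InputStream {
        // was effectively: private static final byte[] oneCharBuf = new byte[1];
        private final byte[] oneCharBuf = new byte[1];
        private final InputStream in;

        OneByteAdapterStream(InputStream in) {
            this.in = in;
        }

        @Override
        public int read() throws IOException {
            // single-byte read built on the array read, the same trick the
            // patched streams use
            int n = read(oneCharBuf, 0, 1);
            return (n == -1) ? -1 : (oneCharBuf[0] & 0xFF);
        }

        @Override
        public int read(byte[] b, int off, int len) throws IOException {
            return in.read(b, off, len);
        }

        public static void main(String[] args) throws IOException {
            InputStream s = new OneByteAdapterStream(new ByteArrayInputStream("hi".getBytes()));
            System.out.println((char) s.read()); // h
            System.out.println((char) s.read()); // i
        }
    }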

AbstractFileTikaTextExtract.java

@@ -57,17 +57,16 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 class AbstractFileTikaTextExtract implements AbstractFileExtract {
     private static final Logger logger = Logger.getLogger(AbstractFileTikaTextExtract.class.getName());
+    private static Ingester ingester;
     private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
-    static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
+    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
     private static final int SINGLE_READ_CHARS = 1024;
     private static final int EXTRA_CHARS = 128; //for whitespace
-    private static final char[] TEXT_CHUNK_BUF = new char[MAX_EXTR_TEXT_CHARS];
-    //private Tika tika;
+    //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
+    private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
     private KeywordSearchIngestModule module;
-    private static Ingester ingester;
     private AbstractFile sourceFile; //currently processed file
     private int numChunks = 0;
-    //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
     private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();
@@ -80,7 +79,6 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract {
             TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype());
         }
         logger.log(Level.INFO, "Tika supported media types: {0}", TIKA_SUPPORTED_TYPES);
     }
@@ -115,7 +113,7 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract {
     @Override
     public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
         this.sourceFile = sourceFile;
-        this.numChunks = 0; //unknown until indexing is done
+        numChunks = 0; //unknown until indexing is done
         boolean success = false;
         Reader reader = null;
@@ -156,12 +154,12 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract {
             long totalRead = 0;
             boolean eof = false;
             //we read max 1024 chars at time, this seems to max what this Reader would return
-            while (!eof && (readSize = reader.read(TEXT_CHUNK_BUF, 0, SINGLE_READ_CHARS)) != -1) {
+            while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
                 totalRead += readSize;
                 //consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
                 while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
-                        && (readSize = reader.read(TEXT_CHUNK_BUF, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
+                        && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
                     totalRead += readSize;
                 }
                 if (readSize == -1) {
@@ -170,8 +168,8 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract {
                 } else {
                     //try to read char-by-char until whitespace to not break words
                     while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
-                            && !Character.isWhitespace(TEXT_CHUNK_BUF[(int) totalRead - 1])
-                            && (readSize = reader.read(TEXT_CHUNK_BUF, (int) totalRead, 1)) != -1) {
+                            && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
+                            && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
                         totalRead += readSize;
                     }
                     if (readSize == -1) {
@@ -189,9 +187,9 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract {
                 //inject BOM here (saves byte buffer realloc later), will be converted to specific encoding BOM
                 //sb.append(UTF16BOM); disabled prepending of BOM
                 if (totalRead < MAX_EXTR_TEXT_CHARS) {
-                    sb.append(TEXT_CHUNK_BUF, 0, (int) totalRead);
+                    sb.append(textChunkBuf, 0, (int) totalRead);
                 } else {
-                    sb.append(TEXT_CHUNK_BUF);
+                    sb.append(textChunkBuf);
                 }
                 //reset for next chunk
@@ -211,6 +209,7 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract {
                 extracted = sb.toString();
                 //converts BOM automatically to charSet encoding
                 byte[] encodedBytes = extracted.getBytes(OUTPUT_CHARSET);
                 AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
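Note: the word-boundary chunking loop renamed in this file and in AbstractFileHtmlExtract.java is worth reading on its own: bulk-read in SINGLE_READ_CHARS slices while EXTRA_CHARS of headroom remain, then read one char at a time until whitespace so no word straddles two indexed chunks. A self-contained sketch with assumed, smaller constants:

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;

    // Hypothetical distillation of the loop above.
    class WordBoundaryChunker {
        private static final int MAX_CHARS = 4096;
        private static final int SINGLE_READ = 1024;
        private static final int EXTRA = 128; // headroom to finish the last word
        private final char[] buf = new char[MAX_CHARS];

        int readChunk(Reader r) throws IOException {
            int total = 0;
            int n;
            // bulk reads while EXTRA headroom remains in the chunk
            while (total < MAX_CHARS - SINGLE_READ - EXTRA
                    && (n = r.read(buf, total, SINGLE_READ)) != -1) {
                total += n;
            }
            // finish the current word: one char at a time until whitespace
            while (total > 0 && total < MAX_CHARS - 1
                    && !Character.isWhitespace(buf[total - 1])
                    && (n = r.read(buf, total, 1)) != -1) {
                total += n;
            }
            return total;
        }

        public static void main(String[] args) throws IOException {
            WordBoundaryChunker c = new WordBoundaryChunker();
            System.out.println(c.readChunk(new StringReader("some sample text to chunk")));
        }
    }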

Ingester.java

@@ -395,7 +395,7 @@ class Ingester {
         try {
             solrServer.closeCore();
         } catch (KeywordSearchModuleException ex) {
-            logger.log(Level.WARNING, "Cannot close core while restating", ex);
+            logger.log(Level.WARNING, "Cannot close core", ex);
         }
         solrServer.stop();
@@ -403,7 +403,7 @@ class Ingester {
         try {
             solrServer.start();
         } catch (KeywordSearchModuleException ex) {
-            logger.log(Level.WARNING, "Cannot start while restating", ex);
+            logger.log(Level.WARNING, "Cannot start", ex);
         } catch (SolrServerNoPortException ex) {
             logger.log(Level.WARNING, "Cannot start server with this port", ex);
         }
@@ -411,7 +411,7 @@ class Ingester {
         try {
             solrServer.openCore();
         } catch (KeywordSearchModuleException ex) {
-            logger.log(Level.WARNING, "Cannot open core while restating", ex);
+            logger.log(Level.WARNING, "Cannot open core", ex);
         }
     }

KeywordSearchPanel.java

@@ -57,7 +57,7 @@ class KeywordSearchPanel extends javax.swing.JPanel {
     /**
     * @return the default instance KeywordSearchPanel
     */
-    public static KeywordSearchPanel getDefault() {
+    public synchronized static KeywordSearchPanel getDefault() {
        if (instance == null) {
            instance = new KeywordSearchPanel();
        }
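Note: synchronizing getDefault() closes the classic lazy-initialization race: two threads can both see instance == null in the unsynchronized version and each construct its own "singleton". Sketch of the fixed accessor (hypothetical class, not KeywordSearchPanel itself):

    // Hypothetical stand-in for the panel's lazy singleton.
    class LazySingleton {
        private static LazySingleton instance = null;

        private LazySingleton() {
        }

        // 'synchronized' makes the null-check and the construction atomic
        // with respect to other callers.
        public static synchronized LazySingleton getDefault() {
            if (instance == null) {
                instance = new LazySingleton();
            }
            return instance;
        }

        public static void main(String[] args) {
            System.out.println(LazySingleton.getDefault() == LazySingleton.getDefault()); // true
        }
    }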

SearchRunner.java

@@ -57,7 +57,7 @@ public final class SearchRunner {
     private static final Logger logger = Logger.getLogger(SearchRunner.class.getName());
     private static SearchRunner instance = null;
     private IngestServices services = IngestServices.getInstance();
-    private Ingester ingester = null; //guarded by "ingester"
+    private Ingester ingester = null;
     private volatile boolean updateTimerRunning = false;
     private Timer updateTimer;
     private Map<Long, SearchJobInfo> jobs = new HashMap<>(); //guarded by "this"
@@ -173,9 +173,7 @@ public final class SearchRunner {
     * Commits index and notifies listeners of index update
     */
    private void commit() {
-        synchronized(ingester) {
-            ingester.commit();
-        }
+        ingester.commit();

        // Signal a potential change in number of text_ingested files
        try {
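Note: the caller-side synchronized(ingester) block around commit() is removed, along with the field's guarded-by comment. This hunk alone does not show where that guarantee went; one plausible reading, an assumption rather than anything confirmed by this diff, is that any needed locking now lives inside the ingester itself, so every caller gets it without remembering to synchronize:

    // Assumed shape only: a commit that carries its own lock, freeing callers
    // from external synchronization.
    class SelfLockingIngester {
        private final Object commitLock = new Object();

        void commit() {
            synchronized (commitLock) {
                // flush pending documents to the index (stand-in body)
            }
        }
    }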