From c11d571c3f17161e8cbff9a8e396e89c04cac311 Mon Sep 17 00:00:00 2001 From: "Samuel H. Kenyon" Date: Thu, 10 Apr 2014 13:56:49 -0400 Subject: [PATCH 1/6] removed lock around call to ingester.commit() --- .../org/sleuthkit/autopsy/keywordsearch/SearchRunner.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SearchRunner.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SearchRunner.java index 813c0fd8dd..7d10a7cf43 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SearchRunner.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SearchRunner.java @@ -57,7 +57,7 @@ public final class SearchRunner { private static final Logger logger = Logger.getLogger(SearchRunner.class.getName()); private static SearchRunner instance = null; private IngestServices services = IngestServices.getInstance(); - private Ingester ingester = null; //guarded by "ingester" + private Ingester ingester = null; private volatile boolean updateTimerRunning = false; private Timer updateTimer; private Map jobs = new HashMap<>(); //guarded by "this" @@ -173,9 +173,7 @@ public final class SearchRunner { * Commits index and notifies listeners of index update */ private void commit() { - synchronized(ingester) { - ingester.commit(); - } + ingester.commit(); // Signal a potential change in number of text_ingested files try { From cb3055015f0c57549551e80231ccb6f91f6abb0c Mon Sep 17 00:00:00 2001 From: "Samuel H. Kenyon" Date: Thu, 10 Apr 2014 17:03:41 -0400 Subject: [PATCH 2/6] log message changes --- .../src/org/sleuthkit/autopsy/keywordsearch/Ingester.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java index bdb304dabf..b772c9e5bf 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java @@ -395,7 +395,7 @@ class Ingester { try { solrServer.closeCore(); } catch (KeywordSearchModuleException ex) { - logger.log(Level.WARNING, "Cannot close core while restating", ex); + logger.log(Level.WARNING, "Cannot close core", ex); } solrServer.stop(); @@ -403,7 +403,7 @@ class Ingester { try { solrServer.start(); } catch (KeywordSearchModuleException ex) { - logger.log(Level.WARNING, "Cannot start while restating", ex); + logger.log(Level.WARNING, "Cannot start", ex); } catch (SolrServerNoPortException ex) { logger.log(Level.WARNING, "Cannot start server with this port", ex); } @@ -411,7 +411,7 @@ class Ingester { try { solrServer.openCore(); } catch (KeywordSearchModuleException ex) { - logger.log(Level.WARNING, "Cannot open core while restating", ex); + logger.log(Level.WARNING, "Cannot open core", ex); } } From 4ce2cf5fe7c01b330ec9bbf42060352f78cb7e04 Mon Sep 17 00:00:00 2001 From: "Samuel H. Kenyon" Date: Tue, 15 Apr 2014 14:02:36 -0400 Subject: [PATCH 3/6] updated AbstractFile text extractor classes to be thread safe --- .../AbstractFileHtmlExtract.java | 25 +++++++------- .../AbstractFileStringExtract.java | 34 +++++++++---------- .../AbstractFileTikaTextExtract.java | 29 ++++++++-------- 3 files changed, 43 insertions(+), 45 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java index 7af85f7e3f..38a5a51e71 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java @@ -40,17 +40,18 @@ import org.sleuthkit.datamodel.ReadContentInputStream; class AbstractFileHtmlExtract implements AbstractFileExtract { private static final Logger logger = Logger.getLogger(AbstractFileHtmlExtract.class.getName()); + private static Ingester ingester; static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET; static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; private static final int SINGLE_READ_CHARS = 1024; - private static final int EXTRA_CHARS = 128; //for whitespace - private static final char[] TEXT_CHUNK_BUF = new char[MAX_EXTR_TEXT_CHARS]; + private static final int EXTRA_CHARS = 128; //for whitespace private static final int MAX_SIZE = 50000000; - private KeywordSearchIngestModule module; - private Ingester ingester; + //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM + private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS]; + private KeywordSearchIngestModule module; private AbstractFile sourceFile; private int numChunks = 0; - //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM + static final List WEB_MIME_TYPES = Arrays.asList( "application/javascript", "application/xhtml+xml", @@ -98,7 +99,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream; @Override public boolean index(AbstractFile sourceFile) throws IngesterException { this.sourceFile = sourceFile; - this.numChunks = 0; //unknown until indexing is done + numChunks = 0; //unknown until indexing is done boolean success = false; Reader reader = null; @@ -122,12 +123,12 @@ import org.sleuthkit.datamodel.ReadContentInputStream; long totalRead = 0; boolean eof = false; //we read max 1024 chars at time, this seems to max what this Reader would return - while (!eof && (readSize = reader.read(TEXT_CHUNK_BUF, 0, SINGLE_READ_CHARS)) != -1) { + while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) { totalRead += readSize; //consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word) while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS) - && (readSize = reader.read(TEXT_CHUNK_BUF, (int) totalRead, SINGLE_READ_CHARS)) != -1) { + && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) { totalRead += readSize; } if (readSize == -1) { @@ -136,8 +137,8 @@ import org.sleuthkit.datamodel.ReadContentInputStream; } else { //try to read until whitespace to not break words while ((totalRead < MAX_EXTR_TEXT_CHARS - 1) - && !Character.isWhitespace(TEXT_CHUNK_BUF[(int) totalRead - 1]) - && (readSize = reader.read(TEXT_CHUNK_BUF, (int) totalRead, 1)) != -1) { + && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1]) + && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) { totalRead += readSize; } if (readSize == -1) { @@ -156,9 +157,9 @@ import org.sleuthkit.datamodel.ReadContentInputStream; //inject BOM here (saves byte buffer realloc later), will be converted to specific encoding BOM //sb.append(UTF16BOM); disabled BOM, not needing as bypassing Tika if (totalRead < MAX_EXTR_TEXT_CHARS) { - sb.append(TEXT_CHUNK_BUF, 0, (int) totalRead); + sb.append(textChunkBuf, 0, (int) totalRead); } else { - sb.append(TEXT_CHUNK_BUF); + sb.append(textChunkBuf); } //reset for next chunk diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java index 6f96ff2495..17b419ffab 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java @@ -36,34 +36,32 @@ import org.sleuthkit.datamodel.AbstractFile; * the original source file) up to 1MB then and indexes chunks as text with Solr */ class AbstractFileStringExtract implements AbstractFileExtract { - - private KeywordSearchIngestModule module; - private Ingester ingester; - private int numChunks; + + private static Ingester ingester; private static final Logger logger = Logger.getLogger(AbstractFileStringExtract.class.getName()); - static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L; - private AbstractFile sourceFile; - //single static buffer for all extractions. Safe, indexing can only happen in one thread - private static final byte[] STRING_CHUNK_BUF = new byte[(int) MAX_STRING_CHUNK_SIZE]; + private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L; //private static final int BOM_LEN = 3; private static final int BOM_LEN = 0; //disabled prepending of BOM private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET; private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2; + private final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE]; + private KeywordSearchIngestModule module; + private AbstractFile sourceFile; + private int numChunks = 0; private final List