From ba518de7c87d55009f3ef24b74ee3b4f7e4d677c Mon Sep 17 00:00:00 2001
From: adam-m
Date: Wed, 18 Jul 2012 15:06:53 -0400
Subject: [PATCH] Add local Tika extract timeout mechanism, similar to that used for Solr indexing

---
 .../AbstractFileTikaTextExtract.java          | 70 ++++++++++++++++++-
 .../autopsy/keywordsearch/Ingester.java       |  2 +-
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java
index 4ad9d8b581..5d9e6b0700 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java
@@ -25,6 +25,11 @@ import java.nio.charset.Charset;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 import org.sleuthkit.autopsy.ingest.IngestServiceAbstractFile;
@@ -33,13 +38,15 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 import org.apache.tika.Tika;
 import org.apache.tika.metadata.Metadata;
 import org.sleuthkit.autopsy.keywordsearch.ByteContentStream.Encoding;
+import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 
 /**
  * Extractor of text from TIKA supported AbstractFile content. Extracted text is
  * divided into chunks and indexed with Solr.
+ * Protects against Tika parser hangs (for unexpected/corrupt content) using a timeout mechanism.
+ * If Tika extraction succeeds, chunks are indexed with Solr.
  *
- * This is especially useful for large content of supported type that is to be
- * divided into text chunks and indexed as such.
+ * This Tika extraction/chunking utility is useful for large files of a content type supported by Tika parsers.
  *
  */
 public class AbstractFileTikaTextExtract implements AbstractFileExtract {
@@ -57,6 +64,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
     private AbstractFile sourceFile;
     private int numChunks = 0;
     private static final String UTF16BOM = "\uFEFF";
+    private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor(); //NOTE(review): no shutdown() is visible in this patch; confirm the executor is released elsewhere
 
     AbstractFileTikaTextExtract(AbstractFile sourceFile) {
         this.sourceFile = sourceFile;
@@ -81,10 +89,29 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
 
         boolean success = false;
         Reader reader = null;
+
         final InputStream stream = new ReadContentInputStream(sourceFile);
         try {
             Metadata meta = new Metadata();
-            reader = tika.parse(stream, meta);
+            ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
+            final Future<?> future = tikaParseExecutor.submit(parseTask);
+            try {
+                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            } catch (TimeoutException te) {
+                //attempt to interrupt the stuck parse so it does not keep occupying the executor thread
+                future.cancel(true);
+                final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
+                logger.log(Level.WARNING, msg);
+                throw new IngesterException(msg);
+            }
+
+            reader = parseTask.getReader();
+            if (reader == null) {
+                //likely due to exception in parse()
+                logger.log(Level.WARNING, "No reader available from Tika parse");
+                return false;
+            }
+
             success = true;
             long readSize;
             long totalRead = 0;
@@ -189,4 +216,41 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
 
         return success;
     }
+
+    /**
+     * Runnable and timeable task that calls Tika to parse the content using
+     * streaming. run() stores the resulting reader, or null if the parse threw
+     * an IOException; getReader() returns whatever run() stored.
+     */
+    private static class ParseRequestTask implements Runnable {
+
+        //in
+        private final Tika tika;
+        private final InputStream stream;
+        private final Metadata meta;
+        private final AbstractFile sourceFile;
+        //out
+        private Reader reader;
+
+        ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
+            this.tika = tika;
+            this.stream = stream;
+            this.meta = meta;
+            this.sourceFile = sourceFile;
+        }
+
+        @Override
+        public void run() {
+            try {
+                reader = tika.parse(stream, meta);
+            } catch (IOException ex) {
+                logger.log(Level.WARNING, "Unable to Tika parse the content " + sourceFile.getId() + ": " + sourceFile.getName(), ex);
+                reader = null;
+            }
+        }
+
+        public Reader getReader() {
+            return reader;
+        }
+    }
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index a1481e27f7..ec8b96dd1b 100755
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -272,7 +272,7 @@ public class Ingester {
      * @param size size of the content
      * @return time in seconds to use a timeout
      */
-    private static int getTimeout(long size) {
+    static int getTimeout(long size) {
         if (size < 1024 * 1024L) //1MB
         {
             return 60;