Add local Tika extract timeout mechanism, similar to that used for Solr indexing

This commit is contained in:
adam-m 2012-07-18 15:06:53 -04:00
parent 1fad291255
commit ba518de7c8
2 changed files with 64 additions and 4 deletions

View File

@ -25,6 +25,11 @@ import java.nio.charset.Charset;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.logging.Logger; import java.util.logging.Logger;
import org.sleuthkit.autopsy.ingest.IngestServiceAbstractFile; import org.sleuthkit.autopsy.ingest.IngestServiceAbstractFile;
@ -33,13 +38,15 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
import org.apache.tika.Tika; import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.sleuthkit.autopsy.keywordsearch.ByteContentStream.Encoding; import org.sleuthkit.autopsy.keywordsearch.ByteContentStream.Encoding;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
/** /**
* Extractor of text from TIKA supported AbstractFile content. Extracted text is * Extractor of text from TIKA supported AbstractFile content. Extracted text is
* divided into chunks and indexed with Solr. * divided into chunks and indexed with Solr.
* Protects against Tika parser hangs (for unexpected/corrupt content) using a timeout mechanism.
* If Tika extraction succeeds, chunks are indexed with Solr.
* *
* This is especially useful for large content of supported type that is to be * This Tika extraction/chunking utility is useful for large files of Tika parsers-supported content type.
* divided into text chunks and indexed as such.
* *
*/ */
public class AbstractFileTikaTextExtract implements AbstractFileExtract { public class AbstractFileTikaTextExtract implements AbstractFileExtract {
@ -57,6 +64,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
private AbstractFile sourceFile; private AbstractFile sourceFile;
private int numChunks = 0; private int numChunks = 0;
private static final String UTF16BOM = "\uFEFF"; private static final String UTF16BOM = "\uFEFF";
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
AbstractFileTikaTextExtract(AbstractFile sourceFile) { AbstractFileTikaTextExtract(AbstractFile sourceFile) {
this.sourceFile = sourceFile; this.sourceFile = sourceFile;
@ -81,10 +89,27 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
boolean success = false; boolean success = false;
Reader reader = null; Reader reader = null;
final InputStream stream = new ReadContentInputStream(sourceFile); final InputStream stream = new ReadContentInputStream(sourceFile);
try { try {
Metadata meta = new Metadata(); Metadata meta = new Metadata();
reader = tika.parse(stream, meta); ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
final Future<?> future = tikaParseExecutor.submit(parseTask);
try {
future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
} catch (TimeoutException te) {
final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
logger.log(Level.WARNING, msg);
throw new IngesterException(msg);
}
reader = parseTask.getReader();
if (reader == null) {
//likely due to exception in parse()
logger.log(Level.WARNING, "No reader available from Tika parse");
return false;
}
success = true; success = true;
long readSize; long readSize;
long totalRead = 0; long totalRead = 0;
@ -189,4 +214,39 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
return success; return success;
} }
/**
* Runnable and timeable task that calls tika to parse the content using streaming
*/
private static class ParseRequestTask implements Runnable {
//in
private Tika tika;
private InputStream stream;
private Metadata meta;
private AbstractFile sourceFile;
//out
private Reader reader;
ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
this.tika = tika;
this.stream = stream;
this.meta = meta;
this.sourceFile = sourceFile;
}
@Override
public void run() {
try {
reader = tika.parse(stream, meta);
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
reader = null;
}
}
public Reader getReader() {
return reader;
}
}
} }

View File

@ -272,7 +272,7 @@ public class Ingester {
* @param size size of the content * @param size size of the content
* @return time in seconds to use a timeout * @return time in seconds to use a timeout
*/ */
private static int getTimeout(long size) { static int getTimeout(long size) {
if (size < 1024 * 1024L) //1MB if (size < 1024 * 1024L) //1MB
{ {
return 60; return 60;