Add a Tika parse timeout to protect against bugs where Tika spins indefinitely

Uses a single-threaded thread pool to minimize the impact on performance and memory.
This commit is contained in:
adam-m 2012-08-02 11:59:02 -04:00
parent a57d441a36
commit 29893c5dae

View File

@ -27,6 +27,9 @@ import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.logging.Logger; import java.util.logging.Logger;
import org.sleuthkit.autopsy.ingest.IngestServiceAbstractFile; import org.sleuthkit.autopsy.ingest.IngestServiceAbstractFile;
@ -35,6 +38,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
import org.apache.tika.Tika; import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
/** /**
* Extractor of text from TIKA supported AbstractFile content. Extracted text is * Extractor of text from TIKA supported AbstractFile content. Extracted text is
@ -107,35 +111,34 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
final InputStream stream = new ReadContentInputStream(sourceFile); final InputStream stream = new ReadContentInputStream(sourceFile);
try { try {
Metadata meta = new Metadata(); Metadata meta = new Metadata();
/* Tika parse request with timeout -- disabled for now //Tika parse request with timeout
ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile); final Tika tika = new Tika(); //new tika instance for every file, to workaround tika memory issues
final Future<?> future = tikaParseExecutor.submit(parseTask); ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
try { final Future<?> future = tikaParseExecutor.submit(parseTask);
future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
} catch (TimeoutException te) {
final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
logger.log(Level.WARNING, msg);
throw new IngesterException(msg);
}
catch (Exception ex) {
final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
logger.log(Level.WARNING, msg, ex);
throw new IngesterException(msg);
}
reader = parseTask.getReader();
*/
try { try {
//Use new Tika instance for every file future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
//it does seem to protect against memory errors in Tika } catch (TimeoutException te) {
//in contrast when reusing the same instance for many files final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
Tika tika = new Tika(); logger.log(Level.WARNING, msg);
reader = tika.parse(stream, meta); throw new IngesterException(msg);
} catch (IOException ex) { } catch (Exception ex) {
logger.log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex); final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
reader = null; logger.log(Level.WARNING, msg, ex);
throw new IngesterException(msg);
} }
reader = parseTask.getReader();
/*
try {
//new tika instance for every file, to workaround tika memory issues
Tika tika = new Tika();
reader = tika.parse(stream, meta);
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
reader = null;
}*/
if (reader == null) { if (reader == null) {
//likely due to exception in parse() //likely due to exception in parse()
logger.log(Level.WARNING, "No reader available from Tika parse"); logger.log(Level.WARNING, "No reader available from Tika parse");