mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-17 18:17:43 +00:00
Add Tika timeout to protect against Tika spinning bugs
Uses a single-thread thread pool to minimize impact on performance/memory
This commit is contained in:
parent
a57d441a36
commit
29893c5dae
@ -27,6 +27,9 @@ import java.util.Collections;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.Future;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.TimeoutException;
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
import java.util.logging.Logger;
|
import java.util.logging.Logger;
|
||||||
import org.sleuthkit.autopsy.ingest.IngestServiceAbstractFile;
|
import org.sleuthkit.autopsy.ingest.IngestServiceAbstractFile;
|
||||||
@ -35,6 +38,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
|
|||||||
import org.apache.tika.Tika;
|
import org.apache.tika.Tika;
|
||||||
import org.apache.tika.metadata.Metadata;
|
import org.apache.tika.metadata.Metadata;
|
||||||
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
||||||
|
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extractor of text from TIKA supported AbstractFile content. Extracted text is
|
* Extractor of text from TIKA supported AbstractFile content. Extracted text is
|
||||||
@ -107,35 +111,34 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
|||||||
final InputStream stream = new ReadContentInputStream(sourceFile);
|
final InputStream stream = new ReadContentInputStream(sourceFile);
|
||||||
try {
|
try {
|
||||||
Metadata meta = new Metadata();
|
Metadata meta = new Metadata();
|
||||||
/* Tika parse request with timeout -- disabled for now
|
//Tika parse request with timeout
|
||||||
ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
|
final Tika tika = new Tika(); //new tika instance for every file, to workaround tika memory issues
|
||||||
final Future<?> future = tikaParseExecutor.submit(parseTask);
|
ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
|
||||||
try {
|
final Future<?> future = tikaParseExecutor.submit(parseTask);
|
||||||
future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
|
|
||||||
} catch (TimeoutException te) {
|
|
||||||
final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
|
|
||||||
logger.log(Level.WARNING, msg);
|
|
||||||
throw new IngesterException(msg);
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
|
|
||||||
logger.log(Level.WARNING, msg, ex);
|
|
||||||
throw new IngesterException(msg);
|
|
||||||
}
|
|
||||||
|
|
||||||
reader = parseTask.getReader();
|
|
||||||
*/
|
|
||||||
try {
|
try {
|
||||||
//Use new Tika instance for every file
|
future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
|
||||||
//it does seem to protect against memory errors in Tika
|
} catch (TimeoutException te) {
|
||||||
//in contrast when reusing the same instance for many files
|
final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
|
||||||
Tika tika = new Tika();
|
logger.log(Level.WARNING, msg);
|
||||||
reader = tika.parse(stream, meta);
|
throw new IngesterException(msg);
|
||||||
} catch (IOException ex) {
|
} catch (Exception ex) {
|
||||||
logger.log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
|
final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
|
||||||
reader = null;
|
logger.log(Level.WARNING, msg, ex);
|
||||||
|
throw new IngesterException(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
reader = parseTask.getReader();
|
||||||
|
|
||||||
|
/*
|
||||||
|
try {
|
||||||
|
//new tika instance for every file, to workaround tika memory issues
|
||||||
|
Tika tika = new Tika();
|
||||||
|
reader = tika.parse(stream, meta);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
logger.log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
|
||||||
|
reader = null;
|
||||||
|
}*/
|
||||||
|
|
||||||
if (reader == null) {
|
if (reader == null) {
|
||||||
//likely due to exception in parse()
|
//likely due to exception in parse()
|
||||||
logger.log(Level.WARNING, "No reader available from Tika parse");
|
logger.log(Level.WARNING, "No reader available from Tika parse");
|
||||||
|
Loading…
x
Reference in New Issue
Block a user