mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-12 16:06:15 +00:00
Add local Tika extract timeout mechanism, similar to that used for Solr indexing
This commit is contained in:
parent
1fad291255
commit
ba518de7c8
@ -25,6 +25,11 @@ import java.nio.charset.Charset;
|
|||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.Future;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.TimeoutException;
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
import java.util.logging.Logger;
|
import java.util.logging.Logger;
|
||||||
import org.sleuthkit.autopsy.ingest.IngestServiceAbstractFile;
|
import org.sleuthkit.autopsy.ingest.IngestServiceAbstractFile;
|
||||||
@ -33,13 +38,15 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
|
|||||||
import org.apache.tika.Tika;
|
import org.apache.tika.Tika;
|
||||||
import org.apache.tika.metadata.Metadata;
|
import org.apache.tika.metadata.Metadata;
|
||||||
import org.sleuthkit.autopsy.keywordsearch.ByteContentStream.Encoding;
|
import org.sleuthkit.autopsy.keywordsearch.ByteContentStream.Encoding;
|
||||||
|
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extractor of text from TIKA supported AbstractFile content. Extracted text is
|
* Extractor of text from TIKA supported AbstractFile content. Extracted text is
|
||||||
* divided into chunks and indexed with Solr.
|
* divided into chunks and indexed with Solr.
|
||||||
|
* Protects against Tika parser hangs (for unexpected/corrupt content) using a timeout mechanism.
|
||||||
|
* If Tika extraction succeeds, chunks are indexed with Solr.
|
||||||
*
|
*
|
||||||
* This is especially useful for large content of supported type that is to be
|
* This Tika extraction/chunking utility is useful for large files whose content type is supported by a Tika parser.
|
||||||
* divided into text chunks and indexed as such.
|
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
||||||
@ -57,6 +64,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
|||||||
private AbstractFile sourceFile;
|
private AbstractFile sourceFile;
|
||||||
private int numChunks = 0;
|
private int numChunks = 0;
|
||||||
private static final String UTF16BOM = "\uFEFF";
|
private static final String UTF16BOM = "\uFEFF";
|
||||||
|
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
|
||||||
|
|
||||||
AbstractFileTikaTextExtract(AbstractFile sourceFile) {
|
AbstractFileTikaTextExtract(AbstractFile sourceFile) {
|
||||||
this.sourceFile = sourceFile;
|
this.sourceFile = sourceFile;
|
||||||
@ -81,10 +89,27 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
|||||||
boolean success = false;
|
boolean success = false;
|
||||||
Reader reader = null;
|
Reader reader = null;
|
||||||
|
|
||||||
|
|
||||||
final InputStream stream = new ReadContentInputStream(sourceFile);
|
final InputStream stream = new ReadContentInputStream(sourceFile);
|
||||||
try {
|
try {
|
||||||
Metadata meta = new Metadata();
|
Metadata meta = new Metadata();
|
||||||
reader = tika.parse(stream, meta);
|
ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
|
||||||
|
final Future<?> future = tikaParseExecutor.submit(parseTask);
|
||||||
|
try {
|
||||||
|
future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
|
||||||
|
} catch (TimeoutException te) {
|
||||||
|
final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
|
||||||
|
logger.log(Level.WARNING, msg);
|
||||||
|
throw new IngesterException(msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
reader = parseTask.getReader();
|
||||||
|
if (reader == null) {
|
||||||
|
//likely due to exception in parse()
|
||||||
|
logger.log(Level.WARNING, "No reader available from Tika parse");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
success = true;
|
success = true;
|
||||||
long readSize;
|
long readSize;
|
||||||
long totalRead = 0;
|
long totalRead = 0;
|
||||||
@ -189,4 +214,39 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
|||||||
|
|
||||||
return success;
|
return success;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runnable and timeable task that calls tika to parse the content using streaming
|
||||||
|
*/
|
||||||
|
private static class ParseRequestTask implements Runnable {
|
||||||
|
|
||||||
|
//in
|
||||||
|
private Tika tika;
|
||||||
|
private InputStream stream;
|
||||||
|
private Metadata meta;
|
||||||
|
private AbstractFile sourceFile;
|
||||||
|
//out
|
||||||
|
private Reader reader;
|
||||||
|
|
||||||
|
ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
|
||||||
|
this.tika = tika;
|
||||||
|
this.stream = stream;
|
||||||
|
this.meta = meta;
|
||||||
|
this.sourceFile = sourceFile;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
try {
|
||||||
|
reader = tika.parse(stream, meta);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
logger.log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
|
||||||
|
reader = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Reader getReader() {
|
||||||
|
return reader;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -272,7 +272,7 @@ public class Ingester {
|
|||||||
* @param size size of the content
|
* @param size size of the content
|
||||||
* @return time in seconds to use as a timeout
|
* @return time in seconds to use as a timeout
|
||||||
*/
|
*/
|
||||||
private static int getTimeout(long size) {
|
static int getTimeout(long size) {
|
||||||
if (size < 1024 * 1024L) //1MB
|
if (size < 1024 * 1024L) //1MB
|
||||||
{
|
{
|
||||||
return 60;
|
return 60;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user