Worked towards cancellation policy.

Oliver Spohngellert 2016-05-17 13:47:16 -04:00
parent 2754c50395
commit 738c446d1c
4 changed files with 28 additions and 25 deletions
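
The changes below drop each extractor's back-reference to KeywordSearchIngestModule and instead pass the IngestJobContext into index(), so an extractor can poll for cancellation between chunks. A minimal sketch of that pattern, assuming only the index() signature and the context.fileIngestIsCancelled() check visible in the hunks below (the loop and helper are illustrative, not the module's real code):

import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.datamodel.AbstractFile;

class CancellationAwareExtractorSketch {

    // New shape of TextExtractor.index(...): the ingest job context comes in as a
    // parameter so long extraction loops can stop when the user cancels the job.
    public boolean index(AbstractFile sourceFile, IngestJobContext context) {
        while (hasMoreChunks(sourceFile)) {
            if (context.fileIngestIsCancelled()) {
                // Bail out between chunks; false signals that indexing did not complete.
                return false;
            }
            // ... read the next chunk, extract text, and hand it to the ingester ...
        }
        return true;
    }

    // Hypothetical helper standing in for the real read-loop condition.
    private boolean hasMoreChunks(AbstractFile sourceFile) {
        return false;
    }
}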

File: HtmlTextExtractor.java

@@ -28,6 +28,7 @@ import java.util.Map;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
+import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
@@ -48,7 +49,6 @@ class HtmlTextExtractor implements TextExtractor {
private static final int MAX_SIZE = 50000000;
//private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
-private KeywordSearchIngestModule module;
private AbstractFile sourceFile;
private int numChunks = 0;
@@ -63,8 +63,7 @@ class HtmlTextExtractor implements TextExtractor {
//"application/xml-dtd",
);
-HtmlTextExtractor(KeywordSearchIngestModule module) {
-this.module = module;
+HtmlTextExtractor() {
ingester = Server.getIngester();
}
@@ -98,7 +97,7 @@ class HtmlTextExtractor implements TextExtractor {
}
@Override
-public boolean index(AbstractFile sourceFile) throws IngesterException {
+public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
this.sourceFile = sourceFile;
numChunks = 0; //unknown until indexing is done

File: KeywordSearchIngestModule.java

@@ -206,14 +206,14 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
}
//initialize extractors
-stringExtractor = new StringsTextExtractor(this);
+stringExtractor = new StringsTextExtractor();
stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
textExtractors = new ArrayList<>();
//order matters, more specific extractors first
-textExtractors.add(new HtmlTextExtractor(this));
-textExtractors.add(new TikaTextExtractor(this));
+textExtractors.add(new HtmlTextExtractor());
+textExtractors.add(new TikaTextExtractor());
indexer = new Indexer();
initialized = true;
@@ -417,7 +417,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
//logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
//divide into chunks and index
-return fileExtract.index(aFile);
+return fileExtract.index(aFile, context);
}
/**
@@ -496,9 +496,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
return;
}
-String detectedFormat;
+String fileType;
try {
-detectedFormat = fileTypeDetector.getFileType(aFile);
+fileType = fileTypeDetector.getFileType(aFile);
} catch (TskCoreException ex) {
logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex); //NON-NLS
return;
@@ -506,7 +506,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
// we skip archive formats that are opened by the archive module.
// @@@ We could have a check here to see if the archive module was enabled though...
-if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
+if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
try {
ingester.ingest(aFile, false); //meta-data only
putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
@@ -518,11 +518,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
}
boolean wasTextAdded = false;
-if (isTextExtractSupported(aFile, detectedFormat)) {
+if (isTextExtractSupported(aFile, fileType)) {
//extract text with one of the extractors, divide into chunks and index with Solr
try {
//logger.log(Level.INFO, "indexing: " + aFile.getName());
-if (!extractTextAndIndex(aFile, detectedFormat)) {
+if (!extractTextAndIndex(aFile, fileType)) {
logger.log(Level.WARNING, "Failed to extract text and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
} else {

File: StringsTextExtractor.java

@@ -45,7 +45,6 @@ class StringsTextExtractor implements TextExtractor {
private static final int BOM_LEN = 0; //disabled prepending of BOM
private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
-private KeywordSearchIngestModule module;
private AbstractFile sourceFile;
private int numChunks = 0;
private final List<SCRIPT> extractScripts = new ArrayList<>();
@@ -58,8 +57,7 @@ class StringsTextExtractor implements TextExtractor {
//stringChunkBuf[1] = (byte) 0xBB;
//stringChunkBuf[2] = (byte) 0xBF;
//}
-public StringsTextExtractor(KeywordSearchIngestModule module) {
-this.module = module;
+public StringsTextExtractor() {
ingester = Server.getIngester();
extractScripts.add(DEFAULT_SCRIPT);
}
@@ -130,6 +128,14 @@ class StringsTextExtractor implements TextExtractor {
final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
long readSize;
while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
+if (context.fileIngestIsCancelled()) {
+try {
+stringStream.close();
+} catch (IOException ex) {
+logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex); //NON-NLS
+}
+return false;
+}
//FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
//debug.write(stringChunkBuf, 0, (int)readSize);

File: TikaTextExtractor.java

@@ -34,17 +34,17 @@ import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;
-import org.openide.util.NbBundle;
-import org.sleuthkit.autopsy.coreutils.Logger;
-import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.ReadContentInputStream;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
+import org.openide.util.NbBundle;
+import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
+import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
+import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.ReadContentInputStream;
/**
* Extractor of text from TIKA supported AbstractFile content. Extracted text is
@@ -65,14 +65,12 @@ class TikaTextExtractor implements TextExtractor {
private static final int SINGLE_READ_CHARS = 1024;
private static final int EXTRA_CHARS = 128; //for whitespace
private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
-private final KeywordSearchIngestModule module;
private AbstractFile sourceFile; //currently processed file
private int numChunks = 0;
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();
-TikaTextExtractor(KeywordSearchIngestModule module) {
-this.module = module;
+TikaTextExtractor() {
ingester = Server.getIngester();
Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
@@ -112,7 +110,7 @@ class TikaTextExtractor implements TextExtractor {
}
@Override
-public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
+public boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
this.sourceFile = sourceFile;
numChunks = 0; //unknown until indexing is done