Worked towards cancellation policy.

Oliver Spohngellert 2016-05-17 13:47:16 -04:00
parent 2754c50395
commit 738c446d1c
4 changed files with 28 additions and 25 deletions

HtmlTextExtractor.java

@@ -28,6 +28,7 @@ import java.util.Map;
 import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
+import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
@@ -48,7 +49,6 @@ class HtmlTextExtractor implements TextExtractor {
     private static final int MAX_SIZE = 50000000;
     //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
     private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
-    private KeywordSearchIngestModule module;
     private AbstractFile sourceFile;
     private int numChunks = 0;
@@ -63,8 +63,7 @@ class HtmlTextExtractor implements TextExtractor {
         //"application/xml-dtd",
     );

-    HtmlTextExtractor(KeywordSearchIngestModule module) {
-        this.module = module;
+    HtmlTextExtractor() {
         ingester = Server.getIngester();
     }
@@ -98,7 +97,7 @@ class HtmlTextExtractor implements TextExtractor {
     }

     @Override
-    public boolean index(AbstractFile sourceFile) throws IngesterException {
+    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
         this.sourceFile = sourceFile;
         numChunks = 0; //unknown until indexing is done
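The signature change above is the heart of this commit: each extractor's index() now receives the IngestJobContext of the running ingest job, so extraction can be abandoned when the job is cancelled (the TextExtractor interface itself is not among the four changed files, so it presumably gained the matching parameter in a related commit). A minimal sketch of the polling pattern this enables; hasMoreChunks() and indexNextChunk() are hypothetical stand-ins for the extractor's chunking logic, while fileIngestIsCancelled() is the real IngestJobContext method used later in this diff:

    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
        this.sourceFile = sourceFile;
        numChunks = 0;
        while (hasMoreChunks()) {                  // hypothetical helper
            if (context.fileIngestIsCancelled()) { // poll between chunks
                return false;                      // stop indexing this file early
            }
            indexNextChunk();                      // hypothetical helper
        }
        return true;
    }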

KeywordSearchIngestModule.java

@@ -206,14 +206,14 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         }

         //initialize extractors
-        stringExtractor = new StringsTextExtractor(this);
+        stringExtractor = new StringsTextExtractor();
         stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
         stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());

         textExtractors = new ArrayList<>();
         //order matters, more specific extractors first
-        textExtractors.add(new HtmlTextExtractor(this));
-        textExtractors.add(new TikaTextExtractor(this));
+        textExtractors.add(new HtmlTextExtractor());
+        textExtractors.add(new TikaTextExtractor());

         indexer = new Indexer();
         initialized = true;
@@ -417,7 +417,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
        //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());

        //divide into chunks and index
-        return fileExtract.index(aFile);
+        return fileExtract.index(aFile, context);
    }

    /**
@@ -496,9 +496,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             return;
         }
-        String detectedFormat;
+        String fileType;
         try {
-            detectedFormat = fileTypeDetector.getFileType(aFile);
+            fileType = fileTypeDetector.getFileType(aFile);
         } catch (TskCoreException ex) {
             logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex); //NON-NLS
             return;
@@ -506,7 +506,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         // we skip archive formats that are opened by the archive module.
         // @@@ We could have a check here to see if the archive module was enabled though...
-        if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
+        if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
             try {
                 ingester.ingest(aFile, false); //meta-data only
                 putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
@@ -518,11 +518,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         }

         boolean wasTextAdded = false;
-        if (isTextExtractSupported(aFile, detectedFormat)) {
+        if (isTextExtractSupported(aFile, fileType)) {
             //extract text with one of the extractors, divide into chunks and index with Solr
             try {
                 //logger.log(Level.INFO, "indexing: " + aFile.getName());
-                if (!extractTextAndIndex(aFile, detectedFormat)) {
+                if (!extractTextAndIndex(aFile, fileType)) {
                     logger.log(Level.WARNING, "Failed to extract text and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
                     putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                 } else {
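For fileExtract.index(aFile, context) above to compile, the module must hold the job's IngestJobContext. A plausible sketch, assuming the standard FileIngestModule lifecycle in which startUp(IngestJobContext) runs once per job; the field name context is taken from the call site above, not shown in this diff:

    private IngestJobContext context;

    @Override
    public void startUp(IngestJobContext context) throws IngestModuleException {
        this.context = context;
        // ... existing initialization, including the no-arg extractor
        // constructors introduced in this commit
    }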

StringsTextExtractor.java

@@ -45,7 +45,6 @@ class StringsTextExtractor implements TextExtractor {
     private static final int BOM_LEN = 0; //disabled prepending of BOM
     private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
     private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
-    private KeywordSearchIngestModule module;
     private AbstractFile sourceFile;
     private int numChunks = 0;
     private final List<SCRIPT> extractScripts = new ArrayList<>();
@@ -58,8 +57,7 @@ class StringsTextExtractor implements TextExtractor {
         //stringChunkBuf[1] = (byte) 0xBB;
         //stringChunkBuf[2] = (byte) 0xBF;
         //}

-    public StringsTextExtractor(KeywordSearchIngestModule module) {
-        this.module = module;
+    public StringsTextExtractor() {
         ingester = Server.getIngester();
         extractScripts.add(DEFAULT_SCRIPT);
     }
@@ -130,6 +128,14 @@ class StringsTextExtractor implements TextExtractor {
         final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
         long readSize;
         while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
+            if (context.fileIngestIsCancelled()) {
+                try {
+                    stringStream.close();
+                } catch (IOException ex) {
+                    logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex); //NON-NLS
+                }
+                return false;
+            }
             //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
             //debug.write(stringChunkBuf, 0, (int)readSize);
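The added cancellation block closes stringStream by hand because the stream is opened earlier in the method, outside any try-with-resources. If the method were restructured, the same check could shed its nested try/catch. A sketch only, not what this commit does; getInputStream(...) is a hypothetical stand-in for however the stream is actually obtained:

    try (InputStream stringStream = getInputStream(sourceFile)) { // hypothetical factory
        long readSize;
        while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
            if (context.fileIngestIsCancelled()) {
                return false; // stream is closed automatically on return
            }
            // ... convert and ingest the chunk as before
        }
    }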

TikaTextExtractor.java

@@ -34,17 +34,17 @@ import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.logging.Level;
-import org.openide.util.NbBundle;
-import org.sleuthkit.autopsy.coreutils.Logger;
-import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.ReadContentInputStream;
 import org.apache.tika.Tika;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
+import org.openide.util.NbBundle;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
+import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
+import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.ReadContentInputStream;

 /**
  * Extractor of text from TIKA supported AbstractFile content. Extracted text is
@@ -65,14 +65,12 @@ class TikaTextExtractor implements TextExtractor {
     private static final int SINGLE_READ_CHARS = 1024;
     private static final int EXTRA_CHARS = 128; //for whitespace
     private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
-    private final KeywordSearchIngestModule module;
     private AbstractFile sourceFile; //currently processed file
     private int numChunks = 0;
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
     private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();

-    TikaTextExtractor(KeywordSearchIngestModule module) {
-        this.module = module;
+    TikaTextExtractor() {
         ingester = Server.getIngester();
         Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
@@ -112,7 +110,7 @@ class TikaTextExtractor implements TextExtractor {
     }

     @Override
-    public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
+    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
         this.sourceFile = sourceFile;
         numChunks = 0; //unknown until indexing is done
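TikaTextExtractor's index() gains the same parameter. This class already runs the parse on tikaParseExecutor with a timeout (note the Future, TimeUnit, and TimeoutException imports kept above), which suggests one further use for the context: interrupting a long-running parse once the job is cancelled. A hypothetical sketch; the variable names (tika, stream, meta, reader) and the one-second polling interval are assumptions, not code from this commit:

    Future<Reader> future = tikaParseExecutor.submit(() -> tika.parse(stream, meta));
    Reader reader = null;
    while (reader == null) {
        if (context.fileIngestIsCancelled()) {
            future.cancel(true); // interrupt the parse thread and give up on this file
            return false;
        }
        try {
            reader = future.get(1, TimeUnit.SECONDS); // short timeout so cancellation is re-checked
        } catch (TimeoutException te) {
            // parse still running; loop and poll the context again
        } catch (InterruptedException | ExecutionException ex) {
            logger.log(Level.WARNING, "Tika parse failed for file: " + sourceFile.getName(), ex); //NON-NLS
            return false;
        }
    }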