Worked towards cancellation policy.

Oliver Spohngellert 2016-05-17 13:47:16 -04:00
parent 2754c50395
commit 738c446d1c
4 changed files with 28 additions and 25 deletions

HtmlTextExtractor.java

@@ -28,6 +28,7 @@ import java.util.Map;
 import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
+import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
@@ -48,7 +49,6 @@ class HtmlTextExtractor implements TextExtractor {
     private static final int MAX_SIZE = 50000000;
     //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
     private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
-    private KeywordSearchIngestModule module;
     private AbstractFile sourceFile;
     private int numChunks = 0;
@@ -63,8 +63,7 @@ class HtmlTextExtractor implements TextExtractor {
         //"application/xml-dtd",
     );

-    HtmlTextExtractor(KeywordSearchIngestModule module) {
-        this.module = module;
+    HtmlTextExtractor() {
         ingester = Server.getIngester();
     }
@@ -98,7 +97,7 @@ class HtmlTextExtractor implements TextExtractor {
     }

     @Override
-    public boolean index(AbstractFile sourceFile) throws IngesterException {
+    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
         this.sourceFile = sourceFile;
         numChunks = 0; //unknown until indexing is done
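The signature change above is the heart of this commit: each extractor's index() now receives the IngestJobContext of the running ingest job, so extraction can be abandoned when the job is cancelled (the TextExtractor interface itself is not among the four changed files, so it presumably gained the matching parameter in a related commit). A minimal sketch of the polling pattern this enables; hasMoreChunks() and indexNextChunk() are hypothetical stand-ins for the extractor's chunking logic, while fileIngestIsCancelled() is the real IngestJobContext method used later in this diff:

    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
        this.sourceFile = sourceFile;
        numChunks = 0;
        while (hasMoreChunks()) {                  // hypothetical helper
            if (context.fileIngestIsCancelled()) { // poll between chunks
                return false;                      // stop indexing this file early
            }
            indexNextChunk();                      // hypothetical helper
        }
        return true;
    }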

KeywordSearchIngestModule.java

@@ -206,14 +206,14 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         }

         //initialize extractors
-        stringExtractor = new StringsTextExtractor(this);
+        stringExtractor = new StringsTextExtractor();
         stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
         stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());

         textExtractors = new ArrayList<>();
         //order matters, more specific extractors first
-        textExtractors.add(new HtmlTextExtractor(this));
-        textExtractors.add(new TikaTextExtractor(this));
+        textExtractors.add(new HtmlTextExtractor());
+        textExtractors.add(new TikaTextExtractor());

         indexer = new Indexer();
         initialized = true;
@@ -417,7 +417,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
        //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());

        //divide into chunks and index
-        return fileExtract.index(aFile);
+        return fileExtract.index(aFile, context);
    }

    /**
@@ -496,9 +496,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             return;
         }
-        String detectedFormat;
+        String fileType;
         try {
-            detectedFormat = fileTypeDetector.getFileType(aFile);
+            fileType = fileTypeDetector.getFileType(aFile);
         } catch (TskCoreException ex) {
             logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex); //NON-NLS
             return;
@@ -506,7 +506,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         // we skip archive formats that are opened by the archive module.
         // @@@ We could have a check here to see if the archive module was enabled though...
-        if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
+        if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
             try {
                 ingester.ingest(aFile, false); //meta-data only
                 putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
@@ -518,11 +518,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         }

         boolean wasTextAdded = false;
-        if (isTextExtractSupported(aFile, detectedFormat)) {
+        if (isTextExtractSupported(aFile, fileType)) {
             //extract text with one of the extractors, divide into chunks and index with Solr
             try {
                 //logger.log(Level.INFO, "indexing: " + aFile.getName());
-                if (!extractTextAndIndex(aFile, detectedFormat)) {
+                if (!extractTextAndIndex(aFile, fileType)) {
                     logger.log(Level.WARNING, "Failed to extract text and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
                     putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                 } else {
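For fileExtract.index(aFile, context) above to compile, the module must hold the job's IngestJobContext. A plausible sketch, assuming the standard FileIngestModule lifecycle in which startUp(IngestJobContext) runs once per job; the field name context is taken from the call site above, not shown in this diff:

    private IngestJobContext context;

    @Override
    public void startUp(IngestJobContext context) throws IngestModuleException {
        this.context = context;
        // ... existing initialization, including the no-arg extractor
        // constructors introduced in this commit
    }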

StringsTextExtractor.java

@@ -45,7 +45,6 @@ class StringsTextExtractor implements TextExtractor {
     private static final int BOM_LEN = 0; //disabled prepending of BOM
     private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
     private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
-    private KeywordSearchIngestModule module;
     private AbstractFile sourceFile;
     private int numChunks = 0;
     private final List<SCRIPT> extractScripts = new ArrayList<>();
@@ -58,8 +57,7 @@ class StringsTextExtractor implements TextExtractor {
         //stringChunkBuf[1] = (byte) 0xBB;
         //stringChunkBuf[2] = (byte) 0xBF;
         //}

-    public StringsTextExtractor(KeywordSearchIngestModule module) {
-        this.module = module;
+    public StringsTextExtractor() {
         ingester = Server.getIngester();
         extractScripts.add(DEFAULT_SCRIPT);
     }
@@ -130,6 +128,14 @@ class StringsTextExtractor implements TextExtractor {
         final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
         long readSize;
         while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
+            if (context.fileIngestIsCancelled()) {
+                try {
+                    stringStream.close();
+                } catch (IOException ex) {
+                    logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex); //NON-NLS
+                }
+                return false;
+            }
             //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
             //debug.write(stringChunkBuf, 0, (int)readSize);
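The added cancellation block closes stringStream by hand because the stream is opened earlier in the method, outside any try-with-resources. If the method were restructured, the same check could shed its nested try/catch. A sketch only, not what this commit does; getInputStream(...) is a hypothetical stand-in for however the stream is actually obtained:

    try (InputStream stringStream = getInputStream(sourceFile)) { // hypothetical factory
        long readSize;
        while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
            if (context.fileIngestIsCancelled()) {
                return false; // stream is closed automatically on return
            }
            // ... convert and ingest the chunk as before
        }
    }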

TikaTextExtractor.java

@@ -34,17 +34,17 @@ import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.logging.Level;
-import org.openide.util.NbBundle;
-import org.sleuthkit.autopsy.coreutils.Logger;
-import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.ReadContentInputStream;
 import org.apache.tika.Tika;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
+import org.openide.util.NbBundle;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
+import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
+import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.ReadContentInputStream;

 /**
  * Extractor of text from TIKA supported AbstractFile content. Extracted text is
@@ -65,14 +65,12 @@ class TikaTextExtractor implements TextExtractor {
     private static final int SINGLE_READ_CHARS = 1024;
     private static final int EXTRA_CHARS = 128; //for whitespace
     private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
-    private final KeywordSearchIngestModule module;
     private AbstractFile sourceFile; //currently processed file
     private int numChunks = 0;
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
     private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();

-    TikaTextExtractor(KeywordSearchIngestModule module) {
-        this.module = module;
+    TikaTextExtractor() {
         ingester = Server.getIngester();
         Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
@@ -112,7 +110,7 @@ class TikaTextExtractor implements TextExtractor {
     }

     @Override
-    public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
+    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
         this.sourceFile = sourceFile;
         numChunks = 0; //unknown until indexing is done
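TikaTextExtractor's index() gains the same parameter. This class already runs the parse on tikaParseExecutor with a timeout (note the Future, TimeUnit, and TimeoutException imports kept above), which suggests one further use for the context: interrupting a long-running parse once the job is cancelled. A hypothetical sketch; the variable names (tika, stream, meta, reader) and the one-second polling interval are assumptions, not code from this commit:

    Future<Reader> future = tikaParseExecutor.submit(() -> tika.parse(stream, meta));
    Reader reader = null;
    while (reader == null) {
        if (context.fileIngestIsCancelled()) {
            future.cancel(true); // interrupt the parse thread and give up on this file
            return false;
        }
        try {
            reader = future.get(1, TimeUnit.SECONDS); // short timeout so cancellation is re-checked
        } catch (TimeoutException te) {
            // parse still running; loop and poll the context again
        } catch (InterruptedException | ExecutionException ex) {
            logger.log(Level.WARNING, "Tika parse failed for file: " + sourceFile.getName(), ex); //NON-NLS
            return false;
        }
    }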