diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java index 4e03038b39..0367397137 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java @@ -28,6 +28,7 @@ import java.util.Map; import java.util.logging.Level; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; +import org.sleuthkit.autopsy.ingest.IngestJobContext; import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.ReadContentInputStream; @@ -48,7 +49,6 @@ class HtmlTextExtractor implements TextExtractor { private static final int MAX_SIZE = 50000000; //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS]; - private KeywordSearchIngestModule module; private AbstractFile sourceFile; private int numChunks = 0; @@ -63,8 +63,7 @@ class HtmlTextExtractor implements TextExtractor { //"application/xml-dtd", ); - HtmlTextExtractor(KeywordSearchIngestModule module) { - this.module = module; + HtmlTextExtractor() { ingester = Server.getIngester(); } @@ -98,7 +97,7 @@ class HtmlTextExtractor implements TextExtractor { } @Override - public boolean index(AbstractFile sourceFile) throws IngesterException { + public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException { this.sourceFile = sourceFile; numChunks = 0; //unknown until indexing is done diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java index 10f43b4534..5795368e3c 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java @@ -206,14 +206,14 @@ public final class KeywordSearchIngestModule implements FileIngestModule { } //initialize extractors - stringExtractor = new StringsTextExtractor(this); + stringExtractor = new StringsTextExtractor(); stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts()); stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions()); textExtractors = new ArrayList<>(); //order matters, more specific extractors first - textExtractors.add(new HtmlTextExtractor(this)); - textExtractors.add(new TikaTextExtractor(this)); + textExtractors.add(new HtmlTextExtractor()); + textExtractors.add(new TikaTextExtractor()); indexer = new Indexer(); initialized = true; @@ -417,7 +417,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule { //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName()); //divide into chunks and index - return fileExtract.index(aFile); + return fileExtract.index(aFile, context); } /** @@ -496,9 +496,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule { return; } - String detectedFormat; + String fileType; try { - detectedFormat = fileTypeDetector.getFileType(aFile); + fileType = fileTypeDetector.getFileType(aFile); } catch (TskCoreException ex) { logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex); //NON-NLS return; @@ -506,7 +506,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule { // we skip archive formats that are opened by the archive module. // @@@ We could have a check here to see if the archive module was enabled though... - if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) { + if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) { try { ingester.ingest(aFile, false); //meta-data only putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED); @@ -518,11 +518,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule { } boolean wasTextAdded = false; - if (isTextExtractSupported(aFile, detectedFormat)) { + if (isTextExtractSupported(aFile, fileType)) { //extract text with one of the extractors, divide into chunks and index with Solr try { //logger.log(Level.INFO, "indexing: " + aFile.getName()); - if (!extractTextAndIndex(aFile, detectedFormat)) { + if (!extractTextAndIndex(aFile, fileType)) { logger.log(Level.WARNING, "Failed to extract text and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT); } else { diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java index 9b0fd107d0..ce4eeff10f 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java @@ -45,7 +45,6 @@ class StringsTextExtractor implements TextExtractor { private static final int BOM_LEN = 0; //disabled prepending of BOM private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET; private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2; - private KeywordSearchIngestModule module; private AbstractFile sourceFile; private int numChunks = 0; private final List