mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-17 02:07:42 +00:00
Worked towards cancellation policy.
This commit is contained in:
parent
2754c50395
commit
738c446d1c
@ -28,6 +28,7 @@ import java.util.Map;
|
||||
import java.util.logging.Level;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
||||
import org.sleuthkit.autopsy.ingest.IngestJobContext;
|
||||
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.ReadContentInputStream;
|
||||
@ -48,7 +49,6 @@ class HtmlTextExtractor implements TextExtractor {
|
||||
private static final int MAX_SIZE = 50000000;
|
||||
//private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
|
||||
private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
|
||||
private KeywordSearchIngestModule module;
|
||||
private AbstractFile sourceFile;
|
||||
private int numChunks = 0;
|
||||
|
||||
@ -63,8 +63,7 @@ class HtmlTextExtractor implements TextExtractor {
|
||||
//"application/xml-dtd",
|
||||
);
|
||||
|
||||
HtmlTextExtractor(KeywordSearchIngestModule module) {
|
||||
this.module = module;
|
||||
HtmlTextExtractor() {
|
||||
ingester = Server.getIngester();
|
||||
}
|
||||
|
||||
@ -98,7 +97,7 @@ class HtmlTextExtractor implements TextExtractor {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean index(AbstractFile sourceFile) throws IngesterException {
|
||||
public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
|
||||
this.sourceFile = sourceFile;
|
||||
numChunks = 0; //unknown until indexing is done
|
||||
|
||||
|
@ -206,14 +206,14 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
}
|
||||
|
||||
//initialize extractors
|
||||
stringExtractor = new StringsTextExtractor(this);
|
||||
stringExtractor = new StringsTextExtractor();
|
||||
stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
|
||||
stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
|
||||
|
||||
textExtractors = new ArrayList<>();
|
||||
//order matters, more specific extractors first
|
||||
textExtractors.add(new HtmlTextExtractor(this));
|
||||
textExtractors.add(new TikaTextExtractor(this));
|
||||
textExtractors.add(new HtmlTextExtractor());
|
||||
textExtractors.add(new TikaTextExtractor());
|
||||
|
||||
indexer = new Indexer();
|
||||
initialized = true;
|
||||
@ -417,7 +417,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
|
||||
//logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
|
||||
//divide into chunks and index
|
||||
return fileExtract.index(aFile);
|
||||
return fileExtract.index(aFile, context);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -496,9 +496,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
return;
|
||||
}
|
||||
|
||||
String detectedFormat;
|
||||
String fileType;
|
||||
try {
|
||||
detectedFormat = fileTypeDetector.getFileType(aFile);
|
||||
fileType = fileTypeDetector.getFileType(aFile);
|
||||
} catch (TskCoreException ex) {
|
||||
logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex); //NON-NLS
|
||||
return;
|
||||
@ -506,7 +506,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
|
||||
// we skip archive formats that are opened by the archive module.
|
||||
// @@@ We could have a check here to see if the archive module was enabled though...
|
||||
if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
|
||||
if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
|
||||
try {
|
||||
ingester.ingest(aFile, false); //meta-data only
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
|
||||
@ -518,11 +518,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
}
|
||||
|
||||
boolean wasTextAdded = false;
|
||||
if (isTextExtractSupported(aFile, detectedFormat)) {
|
||||
if (isTextExtractSupported(aFile, fileType)) {
|
||||
//extract text with one of the extractors, divide into chunks and index with Solr
|
||||
try {
|
||||
//logger.log(Level.INFO, "indexing: " + aFile.getName());
|
||||
if (!extractTextAndIndex(aFile, detectedFormat)) {
|
||||
if (!extractTextAndIndex(aFile, fileType)) {
|
||||
logger.log(Level.WARNING, "Failed to extract text and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
|
||||
} else {
|
||||
|
@ -45,7 +45,6 @@ class StringsTextExtractor implements TextExtractor {
|
||||
private static final int BOM_LEN = 0; //disabled prepending of BOM
|
||||
private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
|
||||
private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
|
||||
private KeywordSearchIngestModule module;
|
||||
private AbstractFile sourceFile;
|
||||
private int numChunks = 0;
|
||||
private final List<SCRIPT> extractScripts = new ArrayList<>();
|
||||
@ -58,8 +57,7 @@ class StringsTextExtractor implements TextExtractor {
|
||||
//stringChunkBuf[1] = (byte) 0xBB;
|
||||
//stringChunkBuf[2] = (byte) 0xBF;
|
||||
//}
|
||||
public StringsTextExtractor(KeywordSearchIngestModule module) {
|
||||
this.module = module;
|
||||
public StringsTextExtractor() {
|
||||
ingester = Server.getIngester();
|
||||
extractScripts.add(DEFAULT_SCRIPT);
|
||||
}
|
||||
@ -130,6 +128,14 @@ class StringsTextExtractor implements TextExtractor {
|
||||
final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
|
||||
long readSize;
|
||||
while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
|
||||
if (context.fileIngestIsCancelled()) {
|
||||
try {
|
||||
stringStream.close();
|
||||
} catch (IOException ex) {
|
||||
logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex); //NON-NLS
|
||||
}
|
||||
return false;
|
||||
}
|
||||
//FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
|
||||
//debug.write(stringChunkBuf, 0, (int)readSize);
|
||||
|
||||
|
@ -34,17 +34,17 @@ import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import java.util.logging.Level;
|
||||
|
||||
import org.openide.util.NbBundle;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.ReadContentInputStream;
|
||||
import org.apache.tika.Tika;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.openide.util.NbBundle;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
||||
import org.sleuthkit.autopsy.ingest.IngestJobContext;
|
||||
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.ReadContentInputStream;
|
||||
|
||||
/**
|
||||
* Extractor of text from TIKA supported AbstractFile content. Extracted text is
|
||||
@ -65,14 +65,12 @@ class TikaTextExtractor implements TextExtractor {
|
||||
private static final int SINGLE_READ_CHARS = 1024;
|
||||
private static final int EXTRA_CHARS = 128; //for whitespace
|
||||
private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
|
||||
private final KeywordSearchIngestModule module;
|
||||
private AbstractFile sourceFile; //currently processed file
|
||||
private int numChunks = 0;
|
||||
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
|
||||
private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();
|
||||
|
||||
TikaTextExtractor(KeywordSearchIngestModule module) {
|
||||
this.module = module;
|
||||
TikaTextExtractor() {
|
||||
ingester = Server.getIngester();
|
||||
|
||||
Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
|
||||
@ -112,7 +110,7 @@ class TikaTextExtractor implements TextExtractor {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
|
||||
public boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
|
||||
this.sourceFile = sourceFile;
|
||||
numChunks = 0; //unknown until indexing is done
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user