Mirror of https://github.com/overcuriousity/autopsy-flatpak.git (synced 2025-07-17 02:07:42 +00:00)
commit 738c446d1c
parent 2754c50395

    Worked towards cancellation policy.

HtmlTextExtractor.java
@@ -28,6 +28,7 @@ import java.util.Map;
 import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
+import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;

@@ -48,7 +49,6 @@ class HtmlTextExtractor implements TextExtractor {
     private static final int MAX_SIZE = 50000000;
     //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
     private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
-    private KeywordSearchIngestModule module;
     private AbstractFile sourceFile;
     private int numChunks = 0;
 
@@ -63,8 +63,7 @@ class HtmlTextExtractor implements TextExtractor {
         //"application/xml-dtd",
     );
 
-    HtmlTextExtractor(KeywordSearchIngestModule module) {
-        this.module = module;
+    HtmlTextExtractor() {
         ingester = Server.getIngester();
     }
 
@@ -98,7 +97,7 @@ class HtmlTextExtractor implements TextExtractor {
     }
 
     @Override
-    public boolean index(AbstractFile sourceFile) throws IngesterException {
+    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
         this.sourceFile = sourceFile;
         numChunks = 0; //unknown until indexing is done
 
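This signature change is the core of the commit: index() now receives the job's IngestJobContext, so an extractor can poll for cancellation while it works instead of reaching back into a stored KeywordSearchIngestModule reference. A minimal, self-contained sketch of the pattern follows; ChunkedExtractor and the BooleanSupplier poller are illustrative stand-ins for the Autopsy types, not actual API:

    import java.io.IOException;
    import java.io.InputStream;
    import java.util.function.BooleanSupplier;

    class ChunkedExtractor {
        private static final int CHUNK_SIZE = 32 * 1024;

        // Plays the role of the new index(AbstractFile, IngestJobContext):
        // the poller stands in for context.fileIngestIsCancelled().
        boolean index(InputStream content, BooleanSupplier cancelled) throws IOException {
            byte[] buf = new byte[CHUNK_SIZE];
            int read;
            while ((read = content.read(buf)) != -1) {
                if (cancelled.getAsBoolean()) {
                    return false; // cancelled mid-file; caller decides cleanup
                }
                indexChunk(buf, read);
            }
            return true; // whole stream indexed
        }

        private void indexChunk(byte[] buf, int len) {
            // hand the chunk to the indexer (Solr, in the real module)
        }
    }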

KeywordSearchIngestModule.java
@@ -206,14 +206,14 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         }
 
         //initialize extractors
-        stringExtractor = new StringsTextExtractor(this);
+        stringExtractor = new StringsTextExtractor();
         stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
         stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
 
         textExtractors = new ArrayList<>();
         //order matters, more specific extractors first
-        textExtractors.add(new HtmlTextExtractor(this));
-        textExtractors.add(new TikaTextExtractor(this));
+        textExtractors.add(new HtmlTextExtractor());
+        textExtractors.add(new TikaTextExtractor());
 
         indexer = new Indexer();
         initialized = true;
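Worth noting: this flips the dependency direction. The extractors previously took the module in their constructors only to consult it about job state, coupling every extractor to the module's lifecycle; now they are constructed with no job state at all and receive the context per index() call.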

@@ -417,7 +417,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
 
         //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
         //divide into chunks and index
-        return fileExtract.index(aFile);
+        return fileExtract.index(aFile, context);
     }
 
     /**
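For orientation, the changed call sits inside the module's extractor dispatch: extractors are tried in registration order ("more specific extractors first", per the comment in the -206 hunk above) and the first match indexes the file, now with the context threaded through. The sketch below shows that dispatch shape under assumed names; Extractor, supports(), and the string-typed file handle are illustrative, not the Autopsy interfaces:

    import java.util.List;
    import java.util.function.BooleanSupplier;

    // Illustrative dispatch only; Extractor and supports() are assumed names.
    interface Extractor {
        boolean supports(String mimeType);
        boolean index(String fileId, BooleanSupplier cancelled) throws Exception;
    }

    class ExtractorDispatch {
        private final List<Extractor> textExtractors; // ordered: most specific first

        ExtractorDispatch(List<Extractor> textExtractors) {
            this.textExtractors = textExtractors;
        }

        boolean extractTextAndIndex(String fileId, String mimeType, BooleanSupplier cancelled) throws Exception {
            for (Extractor extractor : textExtractors) {
                if (extractor.supports(mimeType)) {
                    // the job context (here: the cancellation poller) is threaded through
                    return extractor.index(fileId, cancelled);
                }
            }
            return false; // no extractor claimed the file
        }
    }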

@@ -496,9 +496,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
             return;
         }
 
-        String detectedFormat;
+        String fileType;
         try {
-            detectedFormat = fileTypeDetector.getFileType(aFile);
+            fileType = fileTypeDetector.getFileType(aFile);
         } catch (TskCoreException ex) {
             logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex); //NON-NLS
             return;

@@ -506,7 +506,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
 
         // we skip archive formats that are opened by the archive module.
         // @@@ We could have a check here to see if the archive module was enabled though...
-        if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
+        if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
             try {
                 ingester.ingest(aFile, false); //meta-data only
                 putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);

@@ -518,11 +518,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         }
 
         boolean wasTextAdded = false;
-        if (isTextExtractSupported(aFile, detectedFormat)) {
+        if (isTextExtractSupported(aFile, fileType)) {
             //extract text with one of the extractors, divide into chunks and index with Solr
             try {
                 //logger.log(Level.INFO, "indexing: " + aFile.getName());
-                if (!extractTextAndIndex(aFile, detectedFormat)) {
+                if (!extractTextAndIndex(aFile, fileType)) {
                     logger.log(Level.WARNING, "Failed to extract text and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
                     putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                 } else {

StringsTextExtractor.java
@@ -45,7 +45,6 @@ class StringsTextExtractor implements TextExtractor {
     private static final int BOM_LEN = 0; //disabled prepending of BOM
     private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
     private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
-    private KeywordSearchIngestModule module;
     private AbstractFile sourceFile;
     private int numChunks = 0;
     private final List<SCRIPT> extractScripts = new ArrayList<>();

@@ -58,8 +57,7 @@ class StringsTextExtractor implements TextExtractor {
     //stringChunkBuf[1] = (byte) 0xBB;
     //stringChunkBuf[2] = (byte) 0xBF;
     //}
-    public StringsTextExtractor(KeywordSearchIngestModule module) {
-        this.module = module;
+    public StringsTextExtractor() {
         ingester = Server.getIngester();
         extractScripts.add(DEFAULT_SCRIPT);
     }

@@ -130,6 +128,14 @@ class StringsTextExtractor implements TextExtractor {
         final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
         long readSize;
         while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
+            if (context.fileIngestIsCancelled()) {
+                try {
+                    stringStream.close();
+                } catch (IOException ex) {
+                    logger.log(Level.WARNING, "Error closing input stream, file: " + sourceFile.getName(), ex); //NON-NLS
+                }
+                return false;
+            }
             //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
             //debug.write(stringChunkBuf, 0, (int)readSize);
 
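Two observations on the added block: the check runs once per chunk, so cancellation latency is bounded by MAX_STRING_CHUNK_SIZE, and the stream is closed explicitly before the early return so a cancelled job does not leak a file handle. A hedged alternative sketch using try-with-resources, which closes the stream on every exit path and removes the manual close/catch (assuming a plain InputStream; names are illustrative):

    import java.io.IOException;
    import java.io.InputStream;
    import java.util.function.BooleanSupplier;

    class StringsChunker {
        private static final int CHUNK_SIZE = 64 * 1024; // stand-in for MAX_STRING_CHUNK_SIZE

        boolean chunkAndIndex(InputStream stringStream, BooleanSupplier cancelled) throws IOException {
            // try-with-resources closes the stream on every exit path,
            // including the early return taken on cancellation.
            try (InputStream in = stringStream) {
                byte[] buf = new byte[CHUNK_SIZE];
                int read;
                while ((read = in.read(buf, 0, CHUNK_SIZE)) != -1) {
                    if (cancelled.getAsBoolean()) {
                        return false;
                    }
                    // index the chunk here
                }
            }
            return true;
        }
    }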

TikaTextExtractor.java
@@ -34,17 +34,17 @@ import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.logging.Level;
 
-import org.openide.util.NbBundle;
-import org.sleuthkit.autopsy.coreutils.Logger;
-import org.sleuthkit.datamodel.AbstractFile;
-import org.sleuthkit.datamodel.ReadContentInputStream;
 import org.apache.tika.Tika;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
+import org.openide.util.NbBundle;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
+import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
+import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.ReadContentInputStream;
 
 /**
  * Extractor of text from TIKA supported AbstractFile content. Extracted text is

@@ -65,14 +65,12 @@ class TikaTextExtractor implements TextExtractor {
     private static final int SINGLE_READ_CHARS = 1024;
     private static final int EXTRA_CHARS = 128; //for whitespace
     private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
-    private final KeywordSearchIngestModule module;
     private AbstractFile sourceFile; //currently processed file
     private int numChunks = 0;
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
     private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();
 
-    TikaTextExtractor(KeywordSearchIngestModule module) {
-        this.module = module;
+    TikaTextExtractor() {
         ingester = Server.getIngester();
 
         Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());

@@ -112,7 +110,7 @@ class TikaTextExtractor implements TextExtractor {
     }
 
     @Override
-    public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
+    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
         this.sourceFile = sourceFile;
         numChunks = 0; //unknown until indexing is done
 
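TikaTextExtractor runs its parse on the single-threaded tikaParseExecutor with a timeout (see the Future/TimeUnit/TimeoutException imports retained above), which gives a context-aware index() a second lever: besides polling between reads, it can cancel the parse Future outright. A standard-library sketch of that shape, not the module's actual code:

    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.TimeoutException;
    import java.util.function.BooleanSupplier;

    class CancellableParse {
        private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();

        // Submit the parse, then alternate between a short timed wait and a
        // cancellation poll; returns null if the job was cancelled.
        String parse(Callable<String> parseTask, BooleanSupplier cancelled) throws Exception {
            Future<String> future = tikaParseExecutor.submit(parseTask);
            while (true) {
                if (cancelled.getAsBoolean()) {
                    future.cancel(true); // interrupt the parser thread
                    return null;
                }
                try {
                    return future.get(500, TimeUnit.MILLISECONDS);
                } catch (TimeoutException stillRunning) {
                    // parse not finished; loop and re-check cancellation
                }
            }
        }
    }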