This commit is contained in:
Ethan Roseman 2019-08-14 17:32:35 -04:00
parent f8545851e4
commit 5d46793d45
2 changed files with 16 additions and 12 deletions

View File

@ -628,6 +628,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
boolean wasTextAdded = false;
Charset decodetectCharset = null;
//extract text with one of the extractors, divide into chunks and index with Solr
try {
//logger.log(Level.INFO, "indexing: " + aFile.getName());
@ -638,13 +639,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
extractStringsAndIndex(aFile);
return;
}
if (fileType.equals(MimeTypes.PLAIN_TEXT)) {
Charset detectedCharset = TextExtractor.getDecodetectCharset(aFile);
if (detectedCharset != null) {
indexTextFile(aFile);
decodetectCharset = TextExtractor.getDecodetectCharset(aFile);
if (fileType.equals(MimeTypes.PLAIN_TEXT) && decodetectCharset != null) {
indexTextFile(aFile, decodetectCharset);
return;
}
}
if (!extractTextAndIndex(aFile)) {
// Text extractor not found for file. Extract string only.
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
@ -666,7 +665,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
//Carved Files should be the only type of unallocated files capable of a txt extension and
//should be ignored by the TextFileExtractor because they may contain more than one text encoding
wasTextAdded = indexTextFile(aFile);
wasTextAdded = indexTextFile(aFile, decodetectCharset);
}
// if it wasn't supported or had an error, default to strings
@ -675,9 +674,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
}
}
private boolean indexTextFile(AbstractFile aFile) {
private boolean indexTextFile(AbstractFile aFile, Charset detectedCharset) {
try {
TextFileExtractor textFileExtractor = new TextFileExtractor();
TextFileExtractor textFileExtractor = new TextFileExtractor(detectedCharset);
Reader textReader = textFileExtractor.getReader(aFile);
if (textReader == null) {
logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());

View File

@ -42,12 +42,17 @@ final class TextFileExtractor {
//files while hopefully working on all files with a valid text encoding
static final private int MIN_MATCH_CONFIDENCE = 20;
private final Charset detectedCharset;
/**
 * Creates an extractor bound to a charset supplied by the caller.
 *
 * The charset is only stored here; getReader checks it for null and, for
 * plain-text sources, uses it to construct the InputStreamReader.
 *
 * @param detectedCharset charset to decode the file with; getReader guards
 *                        against null before using it
 */
TextFileExtractor(Charset detectedCharset) {
this.detectedCharset = detectedCharset;
}
public Reader getReader(AbstractFile source) throws TextFileExtractorException {
String mimeType = source.getMIMEType();
if (mimeType.equals(MimeTypes.PLAIN_TEXT)) {
Charset decodetectCharset = TextExtractor.getDecodetectCharset(source);
if (decodetectCharset != null) {
return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(source)), decodetectCharset);
if (detectedCharset != null) {
return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(source)), detectedCharset);
}
}