mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-06 21:00:22 +00:00
Merge pull request #4631 from wschaeferB/4865-FixNpeInTextFileExtractor
4865 fix npe in text file extractor
This commit is contained in:
commit
50595543dc
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2011-2018 Basis Technology Corp.
|
||||
* Copyright 2011-2019 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -25,7 +25,6 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.logging.Level;
|
||||
import org.openide.util.Exceptions;
|
||||
import org.openide.util.Lookup;
|
||||
import org.openide.util.NbBundle;
|
||||
import org.openide.util.NbBundle.Messages;
|
||||
@ -71,9 +70,11 @@ import org.sleuthkit.datamodel.TskData.FileKnown;
|
||||
"CannotRunFileTypeDetection=Unable to run file type detection."
|
||||
})
|
||||
public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
|
||||
/** generally text extractors should ignore archives and let unpacking
|
||||
* modules take care of them */
|
||||
|
||||
/**
|
||||
* generally text extractors should ignore archives and let unpacking
|
||||
* modules take care of them
|
||||
*/
|
||||
private static final List<String> ARCHIVE_MIME_TYPES
|
||||
= ImmutableList.of(
|
||||
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
|
||||
@ -108,7 +109,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
"application/x-lzop", //NON-NLS
|
||||
"application/x-z", //NON-NLS
|
||||
"application/x-compress"); //NON-NLS
|
||||
|
||||
|
||||
/**
|
||||
* Options for this extractor
|
||||
*/
|
||||
@ -117,7 +118,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
EXTRACT_UTF8, ///< extract UTF8 text, true/false
|
||||
};
|
||||
|
||||
|
||||
enum UpdateFrequency {
|
||||
|
||||
FAST(20),
|
||||
@ -290,15 +290,15 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
StringsConfig stringsConfig = new StringsConfig();
|
||||
Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
|
||||
stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
|
||||
stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
|
||||
stringsConfig.setLanguageScripts(KeywordSearchSettings.getStringExtractScripts());
|
||||
|
||||
|
||||
stringsExtractionContext = Lookups.fixed(stringsConfig);
|
||||
|
||||
|
||||
indexer = new Indexer();
|
||||
initialized = true;
|
||||
}
|
||||
@ -482,12 +482,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
|
||||
ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
|
||||
Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
|
||||
|
||||
|
||||
try {
|
||||
TextExtractor extractor = TextExtractorFactory.getExtractor(aFile,extractionContext);
|
||||
TextExtractor extractor = TextExtractorFactory.getExtractor(aFile, extractionContext);
|
||||
Reader extractedTextReader = extractor.getReader();
|
||||
//divide into chunks and index
|
||||
return Ingester.getDefault().indexText(extractedTextReader,aFile.getId(),aFile.getName(), aFile, context);
|
||||
return Ingester.getDefault().indexText(extractedTextReader, aFile.getId(), aFile.getName(), aFile, context);
|
||||
} catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
|
||||
//No text extractor found... run the default instead
|
||||
return false;
|
||||
@ -509,7 +509,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
}
|
||||
TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(aFile, stringsExtractionContext);
|
||||
Reader extractedTextReader = stringsExtractor.getReader();
|
||||
if (Ingester.getDefault().indexText(extractedTextReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
|
||||
if (Ingester.getDefault().indexText(extractedTextReader, aFile.getId(), aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
|
||||
return true;
|
||||
} else {
|
||||
@ -619,12 +619,16 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
try {
|
||||
TextFileExtractor textFileExtractor = new TextFileExtractor();
|
||||
Reader textReader = textFileExtractor.getReader(aFile);
|
||||
if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
|
||||
if (textReader == null) {
|
||||
logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
|
||||
} else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
|
||||
wasTextAdded = true;
|
||||
}
|
||||
} catch (IngesterException | TextFileExtractorException ex) {
|
||||
} catch (IngesterException ex) {
|
||||
logger.log(Level.WARNING, "Unable to index as unicode", ex);
|
||||
} catch (TextFileExtractorException ex) {
|
||||
logger.log(Level.INFO, "Could not extract text with TextFileExtractor", ex);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2018 Basis Technology Corp.
|
||||
* Copyright 2018-2019 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -48,7 +48,9 @@ final class TextFileExtractor {
|
||||
throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
|
||||
}
|
||||
CharsetMatch match = detector.detect();
|
||||
if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
|
||||
if (match == null) {
|
||||
throw new TextFileExtractorException("Unable to detect any matches using TextFileExtractor");
|
||||
} else if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
|
||||
throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user