Only turn off embedded extraction for known mime-types

This commit is contained in:
U-BASIS\dsmyda 2019-04-30 12:28:04 -04:00
parent 8a846b4937
commit 35e3934816

View File

@ -120,6 +120,16 @@ final class TikaTextExtractor implements TextExtractor {
"application/x-z", //NON-NLS "application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS "application/x-compress"); //NON-NLS
//MIME types of container documents (Office and PDF formats) whose embedded
//files can be handled by the unpacking modules. Tika should ignore the
//embedded files of these types: when the content being parsed has one of
//these MIME types, an EmptyParser is registered in the parse context so that
//embedded file text is not appended to the extracted output. For every other
//type the regular parser is registered instead.
private static final List<String> EMBEDDED_FILE_MIME_TYPES
= ImmutableList.of("application/msword", //NON-NLS
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", //NON-NLS
"application/vnd.ms-powerpoint", //NON-NLS
"application/vnd.openxmlformats-officedocument.presentationml.presentation", //NON-NLS
"application/vnd.ms-excel", //NON-NLS
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", //NON-NLS
"application/pdf"); //NON-NLS
private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName()); private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
@ -137,7 +147,7 @@ final class TikaTextExtractor implements TextExtractor {
private static final File TESSERACT_PATH = locateTesseractExecutable(); private static final File TESSERACT_PATH = locateTesseractExecutable();
private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks()); private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
private ProcessTerminator processTerminator; private ProcessTerminator processTerminator;
private static final List<String> TIKA_SUPPORTED_TYPES private static final List<String> TIKA_SUPPORTED_TYPES
@ -152,8 +162,8 @@ final class TikaTextExtractor implements TextExtractor {
/** /**
* If Tesseract has been installed and is set to be used through * If Tesseract has been installed and is set to be used through
* configuration, then ocr is enabled. OCR can only currently be run on * configuration, then ocr is enabled. OCR can only currently be run on 64
* 64 bit Windows OS. * bit Windows OS.
* *
* @return Flag indicating if OCR is set to be used. * @return Flag indicating if OCR is set to be used.
*/ */
@ -178,10 +188,14 @@ final class TikaTextExtractor implements TextExtractor {
InputStream stream = null; InputStream stream = null;
ParseContext parseContext = new ParseContext(); ParseContext parseContext = new ParseContext();
//Disable appending embedded file text to output //Disable appending embedded file text to output for EFE supported types
//JIRA-4975 //JIRA-4975
parseContext.set(Parser.class, new EmptyParser()); if(content instanceof AbstractFile && EMBEDDED_FILE_MIME_TYPES.contains(((AbstractFile)content).getMIMEType())) {
parseContext.set(Parser.class, new EmptyParser());
} else {
parseContext.set(Parser.class, parser);
}
if (ocrEnabled() && content instanceof AbstractFile) { if (ocrEnabled() && content instanceof AbstractFile) {
AbstractFile file = ((AbstractFile) content); AbstractFile file = ((AbstractFile) content);
@ -205,7 +219,7 @@ final class TikaTextExtractor implements TextExtractor {
TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
String tesseractFolder = TESSERACT_PATH.getParent(); String tesseractFolder = TESSERACT_PATH.getParent();
ocrConfig.setTesseractPath(tesseractFolder); ocrConfig.setTesseractPath(tesseractFolder);
ocrConfig.setLanguage(languagePacks); ocrConfig.setLanguage(languagePacks);
ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath()); ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
parseContext.set(TesseractOCRConfig.class, ocrConfig); parseContext.set(TesseractOCRConfig.class, ocrConfig);
@ -277,7 +291,7 @@ final class TikaTextExtractor implements TextExtractor {
File outputFile = null; File outputFile = null;
try { try {
String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory(); String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
//Appending file id makes the name unique //Appending file id makes the name unique
String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName()); String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
inputFile = Paths.get(tempDirectory, tempFileName).toFile(); inputFile = Paths.get(tempDirectory, tempFileName).toFile();
@ -318,7 +332,7 @@ final class TikaTextExtractor implements TextExtractor {
} }
} }
} }
/** /**
* Wraps the creation of a TikaReader into a Future so that it can be * Wraps the creation of a TikaReader into a Future so that it can be
* cancelled. * cancelled.
@ -430,11 +444,11 @@ final class TikaTextExtractor implements TextExtractor {
*/ */
@Override @Override
public boolean isSupported() { public boolean isSupported() {
if(!(content instanceof AbstractFile)) { if (!(content instanceof AbstractFile)) {
return false; return false;
} }
String detectedType = ((AbstractFile)content).getMIMEType(); String detectedType = ((AbstractFile) content).getMIMEType();
if (detectedType == null if (detectedType == null
|| BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used) || BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
|| ARCHIVE_MIME_TYPES.contains(detectedType) || ARCHIVE_MIME_TYPES.contains(detectedType)
@ -443,7 +457,7 @@ final class TikaTextExtractor implements TextExtractor {
) { ) {
return false; return false;
} }
return TIKA_SUPPORTED_TYPES.contains(detectedType); return TIKA_SUPPORTED_TYPES.contains(detectedType);
} }
@ -493,11 +507,11 @@ final class TikaTextExtractor implements TextExtractor {
if (context != null) { if (context != null) {
ImageConfig configInstance = context.lookup(ImageConfig.class); ImageConfig configInstance = context.lookup(ImageConfig.class);
if (configInstance != null) { if (configInstance != null) {
if(Objects.nonNull(configInstance.getOCREnabled())) { if (Objects.nonNull(configInstance.getOCREnabled())) {
this.tesseractOCREnabled = configInstance.getOCREnabled(); this.tesseractOCREnabled = configInstance.getOCREnabled();
} }
if(Objects.nonNull(configInstance.getOCRLanguages())) { if (Objects.nonNull(configInstance.getOCRLanguages())) {
this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages()); this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
} }
} }