diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java index 0d7d4b9a0f..74080e2fc4 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java @@ -49,7 +49,6 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParsingReader; import org.apache.tika.parser.microsoft.OfficeParserConfig; -import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.pdf.PDFParserConfig; import org.openide.util.NbBundle; import org.openide.modules.InstalledFileLocator; @@ -77,6 +76,7 @@ import java.util.ArrayList; import java.util.Set; import org.apache.tika.config.TikaConfig; import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY; import org.sleuthkit.autopsy.coreutils.ExecUtil.HybridTerminator; import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector; @@ -283,7 +283,12 @@ final class TikaTextExtractor implements TextExtractor { if (isOcrSupported()) { // Configure OCR for Tika if it chooses to run OCR // during extraction - TesseractOCRConfig ocrConfig = getTesseractConfig(); + TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); + String tesseractFolder = TESSERACT_PATH.getParent(); + // coming from https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=109454096#TikaOCR-OverridingDefaultConfiguration + ocrConfig.getOtherTesseractConfig().put("tessdataPath", PlatformUtil.getOcrLanguagePacksPath()); + ocrConfig.getOtherTesseractConfig().put("tesseractPath", tesseractFolder); + ocrConfig.setLanguage(languagePacks); parseContext.set(TesseractOCRConfig.class, ocrConfig); // Configure how Tika handles OCRing PDFs @@ -344,16 +349,6 @@ final class TikaTextExtractor implements TextExtractor { future.cancel(true); } } - - private TesseractOCRConfig getTesseractConfig() { - // GVDTODO look at this: https://tika.apache.org/3.0.0/configuring.html - - // String tesseractFolder = TESSERACT_PATH.getParent(); - // ocrConfig.setTesseractPath(tesseractFolder); - // ocrConfig.setLanguage(languagePacks); - // ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath()); - return null; - } /** * Run OCR and return the file stream produced by Tesseract.