This commit is contained in:
Greg DiCristofaro 2025-01-08 15:28:32 -05:00
parent 143b20794d
commit cfd3c4e28c
No known key found for this signature in database

View File

@ -49,7 +49,6 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParsingReader;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.openide.util.NbBundle;
import org.openide.modules.InstalledFileLocator;
@ -77,6 +76,7 @@ import java.util.ArrayList;
import java.util.Set;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY;
import org.sleuthkit.autopsy.coreutils.ExecUtil.HybridTerminator;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
@ -283,7 +283,12 @@ final class TikaTextExtractor implements TextExtractor {
if (isOcrSupported()) {
// Configure OCR for Tika if it chooses to run OCR
// during extraction
TesseractOCRConfig ocrConfig = getTesseractConfig();
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
String tesseractFolder = TESSERACT_PATH.getParent();
// The "tessdataPath" and "tesseractPath" keys below are taken from https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=109454096#TikaOCR-OverridingDefaultConfiguration
ocrConfig.getOtherTesseractConfig().put("tessdataPath", PlatformUtil.getOcrLanguagePacksPath());
ocrConfig.getOtherTesseractConfig().put("tesseractPath", tesseractFolder);
ocrConfig.setLanguage(languagePacks);
parseContext.set(TesseractOCRConfig.class, ocrConfig);
// Configure how Tika handles OCRing PDFs
@ -345,16 +350,6 @@ final class TikaTextExtractor implements TextExtractor {
}
}
/**
 * Placeholder for building the Tesseract OCR configuration.
 *
 * The configuration logic is not implemented: the body holds only
 * commented-out setup calls and unconditionally returns null.
 *
 * @return always null
 */
private TesseractOCRConfig getTesseractConfig() {
// GVDTODO: decide how Tesseract should be configured; see https://tika.apache.org/3.0.0/configuring.html
// Disabled setup calls, kept as a record of the settings that were meant to be applied
// (Tesseract folder, OCR languages, tessdata path):
// String tesseractFolder = TESSERACT_PATH.getParent();
// ocrConfig.setTesseractPath(tesseractFolder);
// ocrConfig.setLanguage(languagePacks);
// ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
// NOTE(review): no config object is ever created here, so any caller that
// dereferences the result will hit a NullPointerException — confirm callers.
return null;
}
/**
* Run OCR and return the file stream produced by Tesseract.
*