mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-06 21:00:22 +00:00
update
This commit is contained in:
parent
143b20794d
commit
cfd3c4e28c
@ -49,7 +49,6 @@ import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.ParsingReader;
|
||||
import org.apache.tika.parser.microsoft.OfficeParserConfig;
|
||||
import org.apache.tika.parser.ocr.TesseractOCRConfig;
|
||||
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||
import org.openide.util.NbBundle;
|
||||
import org.openide.modules.InstalledFileLocator;
|
||||
@ -77,6 +76,7 @@ import java.util.ArrayList;
|
||||
import java.util.Set;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.mime.MimeTypes;
|
||||
import org.apache.tika.parser.ocr.TesseractOCRConfig;
|
||||
import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY;
|
||||
import org.sleuthkit.autopsy.coreutils.ExecUtil.HybridTerminator;
|
||||
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
|
||||
@ -283,7 +283,12 @@ final class TikaTextExtractor implements TextExtractor {
|
||||
if (isOcrSupported()) {
|
||||
// Configure OCR for Tika if it chooses to run OCR
|
||||
// during extraction
|
||||
TesseractOCRConfig ocrConfig = getTesseractConfig();
|
||||
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
|
||||
String tesseractFolder = TESSERACT_PATH.getParent();
|
||||
// coming from https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=109454096#TikaOCR-OverridingDefaultConfiguration
|
||||
ocrConfig.getOtherTesseractConfig().put("tessdataPath", PlatformUtil.getOcrLanguagePacksPath());
|
||||
ocrConfig.getOtherTesseractConfig().put("tesseractPath", tesseractFolder);
|
||||
ocrConfig.setLanguage(languagePacks);
|
||||
parseContext.set(TesseractOCRConfig.class, ocrConfig);
|
||||
|
||||
// Configure how Tika handles OCRing PDFs
|
||||
@ -344,16 +349,6 @@ final class TikaTextExtractor implements TextExtractor {
|
||||
future.cancel(true);
|
||||
}
|
||||
}
|
||||
|
||||
private TesseractOCRConfig getTesseractConfig() {
|
||||
// GVDTODO look at this: https://tika.apache.org/3.0.0/configuring.html
|
||||
|
||||
// String tesseractFolder = TESSERACT_PATH.getParent();
|
||||
// ocrConfig.setTesseractPath(tesseractFolder);
|
||||
// ocrConfig.setLanguage(languagePacks);
|
||||
// ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Run OCR and return the file stream produced by Tesseract.
|
||||
|
Loading…
x
Reference in New Issue
Block a user