mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-08 22:29:33 +00:00
update
This commit is contained in:
parent
143b20794d
commit
cfd3c4e28c
@ -49,7 +49,6 @@ import org.apache.tika.parser.ParseContext;
|
|||||||
import org.apache.tika.parser.Parser;
|
import org.apache.tika.parser.Parser;
|
||||||
import org.apache.tika.parser.ParsingReader;
|
import org.apache.tika.parser.ParsingReader;
|
||||||
import org.apache.tika.parser.microsoft.OfficeParserConfig;
|
import org.apache.tika.parser.microsoft.OfficeParserConfig;
|
||||||
import org.apache.tika.parser.ocr.TesseractOCRConfig;
|
|
||||||
import org.apache.tika.parser.pdf.PDFParserConfig;
|
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||||
import org.openide.util.NbBundle;
|
import org.openide.util.NbBundle;
|
||||||
import org.openide.modules.InstalledFileLocator;
|
import org.openide.modules.InstalledFileLocator;
|
||||||
@ -77,6 +76,7 @@ import java.util.ArrayList;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import org.apache.tika.config.TikaConfig;
|
import org.apache.tika.config.TikaConfig;
|
||||||
import org.apache.tika.mime.MimeTypes;
|
import org.apache.tika.mime.MimeTypes;
|
||||||
|
import org.apache.tika.parser.ocr.TesseractOCRConfig;
|
||||||
import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY;
|
import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY;
|
||||||
import org.sleuthkit.autopsy.coreutils.ExecUtil.HybridTerminator;
|
import org.sleuthkit.autopsy.coreutils.ExecUtil.HybridTerminator;
|
||||||
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
|
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
|
||||||
@ -283,7 +283,12 @@ final class TikaTextExtractor implements TextExtractor {
|
|||||||
if (isOcrSupported()) {
|
if (isOcrSupported()) {
|
||||||
// Configure OCR for Tika if it chooses to run OCR
|
// Configure OCR for Tika if it chooses to run OCR
|
||||||
// during extraction
|
// during extraction
|
||||||
TesseractOCRConfig ocrConfig = getTesseractConfig();
|
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
|
||||||
|
String tesseractFolder = TESSERACT_PATH.getParent();
|
||||||
|
// coming from https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=109454096#TikaOCR-OverridingDefaultConfiguration
|
||||||
|
ocrConfig.getOtherTesseractConfig().put("tessdataPath", PlatformUtil.getOcrLanguagePacksPath());
|
||||||
|
ocrConfig.getOtherTesseractConfig().put("tesseractPath", tesseractFolder);
|
||||||
|
ocrConfig.setLanguage(languagePacks);
|
||||||
parseContext.set(TesseractOCRConfig.class, ocrConfig);
|
parseContext.set(TesseractOCRConfig.class, ocrConfig);
|
||||||
|
|
||||||
// Configure how Tika handles OCRing PDFs
|
// Configure how Tika handles OCRing PDFs
|
||||||
@ -344,16 +349,6 @@ final class TikaTextExtractor implements TextExtractor {
|
|||||||
future.cancel(true);
|
future.cancel(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private TesseractOCRConfig getTesseractConfig() {
|
|
||||||
// GVDTODO look at this: https://tika.apache.org/3.0.0/configuring.html
|
|
||||||
|
|
||||||
// String tesseractFolder = TESSERACT_PATH.getParent();
|
|
||||||
// ocrConfig.setTesseractPath(tesseractFolder);
|
|
||||||
// ocrConfig.setLanguage(languagePacks);
|
|
||||||
// ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Run OCR and return the file stream produced by Tesseract.
|
* Run OCR and return the file stream produced by Tesseract.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user