Upgraded to Tesseract 4.0

This commit is contained in:
U-BASIS\dsmyda 2018-12-18 13:08:16 -05:00
parent 2bf92ad219
commit b2611465ef
34 changed files with 60 additions and 376075 deletions

View File

@ -29,8 +29,8 @@ import java.io.InputStream;
import java.io.PushbackReader; import java.io.PushbackReader;
import java.io.Reader; import java.io.Reader;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.Callable; import java.util.concurrent.Callable;
@ -60,6 +60,7 @@ import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException; import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.ExecUtil; import org.sleuthkit.autopsy.coreutils.ExecUtil;
import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator; import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
import org.sleuthkit.autopsy.coreutils.FileUtil;
import org.sleuthkit.autopsy.coreutils.PlatformUtil; import org.sleuthkit.autopsy.coreutils.PlatformUtil;
import org.sleuthkit.autopsy.textreaders.textreaderconfigs.ImageConfig; import org.sleuthkit.autopsy.textreaders.textreaderconfigs.ImageConfig;
import org.sleuthkit.autopsy.datamodel.ContentUtils; import org.sleuthkit.autopsy.datamodel.ContentUtils;
@ -122,8 +123,8 @@ final class TikaTextExtractor extends TextExtractor {
private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
private final ThreadFactory tikaThreadFactory = private final ThreadFactory tikaThreadFactory
new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build(); = new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();
private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory); private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
private static final String SQLITE_MIMETYPE = "application/x-sqlite3"; private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
@ -135,8 +136,10 @@ final class TikaTextExtractor extends TextExtractor {
private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
private static final File TESSERACT_PATH = locateTesseractExecutable(); private static final File TESSERACT_PATH = locateTesseractExecutable();
private static final String LANGUAGE_PACKS = getLanguagePacks(); private static final String LANGUAGE_PACKS = getLanguagePacks();
private static final String TESSERACT_LANGUAGE_PACK_EXT = "traineddata"; //NON-NLS
private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
private ProcessTerminator processTerminator; private ProcessTerminator processTerminator;
private static final String TESSERACT_OUTPUT_FILE_NAME = "output";
private static final List<String> TIKA_SUPPORTED_TYPES private static final List<String> TIKA_SUPPORTED_TYPES
= new Tika().getParser().getSupportedTypes(new ParseContext()) = new Tika().getParser().getSupportedTypes(new ParseContext())
@ -182,7 +185,7 @@ final class TikaTextExtractor extends TextExtractor {
AbstractFile file = ((AbstractFile) content); AbstractFile file = ((AbstractFile) content);
//Run OCR on images with Tesseract directly. //Run OCR on images with Tesseract directly.
if (file.getMIMEType().toLowerCase().startsWith("image/")) { if (file.getMIMEType().toLowerCase().startsWith("image/")) {
stream = runOcrAndGetOutputStream(file); stream = performOCR(file);
} else { } else {
//Otherwise, go through Tika for PDFs so that it can //Otherwise, go through Tika for PDFs so that it can
//extract images and run Tesseract on them. //extract images and run Tesseract on them.
@ -201,15 +204,15 @@ final class TikaTextExtractor extends TextExtractor {
String tesseractFolder = TESSERACT_PATH.getParent(); String tesseractFolder = TESSERACT_PATH.getParent();
ocrConfig.setTesseractPath(tesseractFolder); ocrConfig.setTesseractPath(tesseractFolder);
/* /*
* Tesseract expects language data packs to be in a * Tesseract expects language data packs to be in a subdirectory
* subdirectory of tesseractFolder, in a folder called * of tesseractFolder, in a folder called "tessdata". If they
* "tessdata". If they are stored somewhere else, use * are stored somewhere else, use
* ocrConfig.setTessdataPath(String tessdataPath) to point * ocrConfig.setTessdataPath(String tessdataPath) to point to
* to them * them
*/ */
ocrConfig.setLanguage(LANGUAGE_PACKS); ocrConfig.setLanguage(LANGUAGE_PACKS);
parseContext.set(TesseractOCRConfig.class, ocrConfig); parseContext.set(TesseractOCRConfig.class, ocrConfig);
stream = new ReadContentInputStream(content); stream = new ReadContentInputStream(content);
} }
} else { } else {
@ -228,8 +231,7 @@ final class TikaTextExtractor extends TextExtractor {
Future<Reader> future = executorService.submit( Future<Reader> future = executorService.submit(
new GetTikaReader(parser, stream, metadata, parseContext)); new GetTikaReader(parser, stream, metadata, parseContext));
try { try {
final Reader tikaReader = future.get(getTimeout(content.getSize()), final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
TimeUnit.SECONDS);
//check if the reader is empty //check if the reader is empty
PushbackReader pushbackReader = new PushbackReader(tikaReader); PushbackReader pushbackReader = new PushbackReader(tikaReader);
int read = pushbackReader.read(); int read = pushbackReader.read();
@ -238,11 +240,9 @@ final class TikaTextExtractor extends TextExtractor {
+ "Tika returned empty reader for " + content); + "Tika returned empty reader for " + content);
} }
pushbackReader.unread(read); pushbackReader.unread(read);
//concatenate parsed content and meta data into a single reader. //concatenate parsed content and meta data into a single reader.
CharSource metaDataCharSource = getMetaDataCharSource(metadata); CharSource metaDataCharSource = getMetaDataCharSource(metadata);
return CharSource.concat(new ReaderCharSource(pushbackReader), return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
metaDataCharSource).openStream();
} catch (TimeoutException te) { } catch (TimeoutException te) {
final String msg = NbBundle.getMessage(this.getClass(), final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.tikaParseTimeout.text", "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
@ -273,19 +273,19 @@ final class TikaTextExtractor extends TextExtractor {
* @throws * @throws
* org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException * org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException
*/ */
private InputStream runOcrAndGetOutputStream(AbstractFile file) throws ExtractionException { private InputStream performOCR(AbstractFile file) throws ExtractionException {
File inputFile = null; File inputFile = null;
File outputFile = null; File outputFile = null;
try { try {
String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
//Appending file id makes the name unique //Appending file id makes the name unique
String tempFileName = file.getId() + file.getName(); String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
inputFile = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(), inputFile = Paths.get(tempDirectory, tempFileName).toFile();
tempFileName).toFile();
ContentUtils.writeToFile(content, inputFile); ContentUtils.writeToFile(content, inputFile);
String tempOutputName = file.getId() + TESSERACT_OUTPUT_FILE_NAME; String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
String outputFilePath = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(), String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
tempOutputName).toString();
String executeablePath = TESSERACT_PATH.toString(); String executeablePath = TESSERACT_PATH.toString();
//Build tesseract commands //Build tesseract commands
@ -303,7 +303,7 @@ final class TikaTextExtractor extends TextExtractor {
} else { } else {
ExecUtil.execute(process); ExecUtil.execute(process);
} }
outputFile = new File(outputFilePath + ".txt"); outputFile = new File(outputFilePath + ".txt");
//Open a stream of the Tesseract text file and send this to Tika //Open a stream of the Tesseract text file and send this to Tika
return new CleanUpStream(outputFile); return new CleanUpStream(outputFile);
@ -324,6 +324,7 @@ final class TikaTextExtractor extends TextExtractor {
* cancelled. * cancelled.
*/ */
private class GetTikaReader implements Callable<Reader> { private class GetTikaReader implements Callable<Reader> {
private final AutoDetectParser parser; private final AutoDetectParser parser;
private final InputStream stream; private final InputStream stream;
private final Metadata metadata; private final Metadata metadata;
@ -354,9 +355,10 @@ final class TikaTextExtractor extends TextExtractor {
/** /**
* Store a reference to file on construction * Store a reference to file on construction
* *
* @param file * @param file
* @throws FileNotFoundException *
* @throws FileNotFoundException
*/ */
public CleanUpStream(File file) throws FileNotFoundException { public CleanUpStream(File file) throws FileNotFoundException {
super(file); super(file);
@ -365,8 +367,8 @@ final class TikaTextExtractor extends TextExtractor {
/** /**
* Delete this underlying file when close is called. * Delete this underlying file when close is called.
* *
* @throws IOException * @throws IOException
*/ */
@Override @Override
public void close() throws IOException { public void close() throws IOException {
@ -450,25 +452,19 @@ final class TikaTextExtractor extends TextExtractor {
*/ */
private static String getLanguagePacks() { private static String getLanguagePacks() {
File languagePackRootDir = new File(TESSERACT_PATH.getParent(), "tessdata"); File languagePackRootDir = new File(TESSERACT_PATH.getParent(), "tessdata");
//Acceptable extensions for Tesseract-OCR version 3.05 language packs. if (!languagePackRootDir.exists()) {
//All extensions other than traineddata are associated with cube files that return "";
//have been made obsolete since version 4.0. }
List<String> acceptableExtensions = Arrays.asList("traineddata", "params",
"lm", "fold", "bigrams", "nn", "word-freq", "size", List<String> languagePacks = new ArrayList<>();
"user-patterns", "user-words"); for (File languagePack : languagePackRootDir.listFiles()) {
//Pull out only unique languagePacks String fileExt = FilenameUtils.getExtension(languagePack.getName());
HashSet<String> languagePacks = new HashSet<>(); if (!languagePack.isDirectory() && TESSERACT_LANGUAGE_PACK_EXT.equals(fileExt)) {
if (languagePackRootDir.exists()) { String packageName = FilenameUtils.getBaseName(languagePack.getName());
for (File languagePack : languagePackRootDir.listFiles()) { languagePacks.add(packageName);
if (languagePack.isDirectory() || !acceptableExtensions.contains(
FilenameUtils.getExtension(languagePack.getName()))) {
continue;
}
String threeLetterPackageName = languagePack.getName().substring(0, 3);
//Ignore the eng language pack if accidentally added
languagePacks.add(threeLetterPackageName);
} }
} }
return String.join("+", languagePacks); return String.join("+", languagePacks);
} }
@ -499,8 +495,8 @@ final class TikaTextExtractor extends TextExtractor {
* Determines how the extraction process will proceed given the settings * Determines how the extraction process will proceed given the settings
* stored in this context instance. * stored in this context instance.
* *
* See the ImageConfig class in the extractionconfigs package * See the ImageConfig class in the extractionconfigs package for available
for available settings. * settings.
* *
* @param context Instance containing config classes * @param context Instance containing config classes
*/ */

View File

@ -31,6 +31,7 @@ import org.openide.util.NbBundle.Messages;
import org.openide.util.lookup.Lookups; import org.openide.util.lookup.Lookups;
import org.sleuthkit.autopsy.casemodule.Case; import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException; import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil; import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.ingest.FileIngestModule; import org.sleuthkit.autopsy.ingest.FileIngestModule;
@ -477,10 +478,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException { private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
ImageConfig imageConfig = new ImageConfig(); ImageConfig imageConfig = new ImageConfig();
imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption()); imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
Lookup extractionContext = Lookups.fixed(imageConfig); ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
try { try {
Reader specializedReader = TextReaders.getReader(aFile,extractionContext); Reader specializedReader = TextReaders.getReader(aFile, extractionContext);
//divide into chunks and index //divide into chunks and index
return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context); return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
} catch (TextReaders.NoTextReaderFound ex) { } catch (TextReaders.NoTextReaderFound ex) {

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libtesseract-4.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/lstmeval.exe vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/lstmtraining.exe vendored Executable file

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,13 @@
disable_character_fragments T
file_type .bl
textord_fast_pitch_test T
tessedit_single_match 0
tessedit_zero_rejection T
tessedit_minimal_rejection F
tessedit_write_rep_codes F
il1_adaption_test 1
edges_children_fix F
edges_childarea 0.65
edges_boxarea 0.9
tessedit_train_line_recognizer T
textord_no_rejects T

File diff suppressed because it is too large Load Diff

View File

@ -1,12 +0,0 @@
0oO
lI1
cC
kK
pP
sS
uU
vV
wW
xX
yY
zZ

View File

@ -1,7 +0,0 @@
LeadPunc="({[`'
TrailPunc=}:;-]!?`,.)"'
NumLeadPunc=#({[@$
NumTrailPunc=}):;].,%
Operators=*+-/.:,()[]
Digits=0123456789
Alphas=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

Binary file not shown.

View File

@ -1,14 +0,0 @@
RecoWgt=1.0
SizeWgt=0.2435
OODWgt=0.0214
NumWgt=0.036
CharBigramsWgt=0.1567
MaxSegPerChar=8
BeamWidth=10
ConvGridSize=48
WordUnigramsWgt=0.01
MaxWordAspectRatio=20.0000
MinSpaceHeightRatio=0.5000
MaxSpaceHeightRatio=0.6000
HistWindWid=2
MinConCompSize=0

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,2 +0,0 @@
1-\d\d\d-GOOG-411
www.\n\\\*.com

View File

@ -1,5 +0,0 @@
the
quick
brown
fox
jumped

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.