Upgraded to Tesseract 4.0

This commit is contained in:
U-BASIS\dsmyda 2018-12-18 13:08:16 -05:00
parent 2bf92ad219
commit b2611465ef
34 changed files with 60 additions and 376075 deletions

View File

@ -29,8 +29,8 @@ import java.io.InputStream;
import java.io.PushbackReader;
import java.io.Reader;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.Callable;
@ -60,6 +60,7 @@ import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.ExecUtil;
import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
import org.sleuthkit.autopsy.coreutils.FileUtil;
import org.sleuthkit.autopsy.coreutils.PlatformUtil;
import org.sleuthkit.autopsy.textreaders.textreaderconfigs.ImageConfig;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
@ -122,8 +123,8 @@ final class TikaTextExtractor extends TextExtractor {
private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
private final ThreadFactory tikaThreadFactory =
new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();
private final ThreadFactory tikaThreadFactory
= new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();
private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
@ -135,8 +136,10 @@ final class TikaTextExtractor extends TextExtractor {
private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
private static final File TESSERACT_PATH = locateTesseractExecutable();
private static final String LANGUAGE_PACKS = getLanguagePacks();
private static final String TESSERACT_LANGUAGE_PACK_EXT = "traineddata"; //NON-NLS
private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
private ProcessTerminator processTerminator;
private static final String TESSERACT_OUTPUT_FILE_NAME = "output";
private static final List<String> TIKA_SUPPORTED_TYPES
= new Tika().getParser().getSupportedTypes(new ParseContext())
@ -182,7 +185,7 @@ final class TikaTextExtractor extends TextExtractor {
AbstractFile file = ((AbstractFile) content);
//Run OCR on images with Tesseract directly.
if (file.getMIMEType().toLowerCase().startsWith("image/")) {
stream = runOcrAndGetOutputStream(file);
stream = performOCR(file);
} else {
//Otherwise, go through Tika for PDFs so that it can
//extract images and run Tesseract on them.
@ -201,11 +204,11 @@ final class TikaTextExtractor extends TextExtractor {
String tesseractFolder = TESSERACT_PATH.getParent();
ocrConfig.setTesseractPath(tesseractFolder);
/*
* Tesseract expects language data packs to be in a
* subdirectory of tesseractFolder, in a folder called
* "tessdata". If they are stored somewhere else, use
* ocrConfig.setTessdataPath(String tessdataPath) to point
* to them
* Tesseract expects language data packs to be in a subdirectory
* of tesseractFolder, in a folder called "tessdata". If they
* are stored somewhere else, use
* ocrConfig.setTessdataPath(String tessdataPath) to point to
* them
*/
ocrConfig.setLanguage(LANGUAGE_PACKS);
parseContext.set(TesseractOCRConfig.class, ocrConfig);
@ -228,8 +231,7 @@ final class TikaTextExtractor extends TextExtractor {
Future<Reader> future = executorService.submit(
new GetTikaReader(parser, stream, metadata, parseContext));
try {
final Reader tikaReader = future.get(getTimeout(content.getSize()),
TimeUnit.SECONDS);
final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
//check if the reader is empty
PushbackReader pushbackReader = new PushbackReader(tikaReader);
int read = pushbackReader.read();
@ -238,11 +240,9 @@ final class TikaTextExtractor extends TextExtractor {
+ "Tika returned empty reader for " + content);
}
pushbackReader.unread(read);
//concatenate parsed content and meta data into a single reader.
CharSource metaDataCharSource = getMetaDataCharSource(metadata);
return CharSource.concat(new ReaderCharSource(pushbackReader),
metaDataCharSource).openStream();
return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
} catch (TimeoutException te) {
final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
@ -273,19 +273,19 @@ final class TikaTextExtractor extends TextExtractor {
* @throws
* org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException
*/
private InputStream runOcrAndGetOutputStream(AbstractFile file) throws ExtractionException {
private InputStream performOCR(AbstractFile file) throws ExtractionException {
File inputFile = null;
File outputFile = null;
try {
String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
//Prefixing the name with the file id makes it unique
String tempFileName = file.getId() + file.getName();
inputFile = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
tempFileName).toFile();
String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
inputFile = Paths.get(tempDirectory, tempFileName).toFile();
ContentUtils.writeToFile(content, inputFile);
String tempOutputName = file.getId() + TESSERACT_OUTPUT_FILE_NAME;
String outputFilePath = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
tempOutputName).toString();
String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
String executeablePath = TESSERACT_PATH.toString();
//Build tesseract commands
@ -324,6 +324,7 @@ final class TikaTextExtractor extends TextExtractor {
* cancelled.
*/
private class GetTikaReader implements Callable<Reader> {
private final AutoDetectParser parser;
private final InputStream stream;
private final Metadata metadata;
@ -356,6 +357,7 @@ final class TikaTextExtractor extends TextExtractor {
* Store a reference to file on construction
*
* @param file
*
* @throws FileNotFoundException
*/
public CleanUpStream(File file) throws FileNotFoundException {
@ -450,25 +452,19 @@ final class TikaTextExtractor extends TextExtractor {
*/
private static String getLanguagePacks() {
File languagePackRootDir = new File(TESSERACT_PATH.getParent(), "tessdata");
//Acceptable extensions for Tesseract-OCR version 3.05 language packs.
//All extensions other than traineddata are associated with cube files that
//have been made obsolete since version 4.0.
List<String> acceptableExtensions = Arrays.asList("traineddata", "params",
"lm", "fold", "bigrams", "nn", "word-freq", "size",
"user-patterns", "user-words");
//Pull out only unique languagePacks
HashSet<String> languagePacks = new HashSet<>();
if (languagePackRootDir.exists()) {
if (!languagePackRootDir.exists()) {
return "";
}
List<String> languagePacks = new ArrayList<>();
for (File languagePack : languagePackRootDir.listFiles()) {
if (languagePack.isDirectory() || !acceptableExtensions.contains(
FilenameUtils.getExtension(languagePack.getName()))) {
continue;
}
String threeLetterPackageName = languagePack.getName().substring(0, 3);
//Ignore the eng language pack if accidentally added
languagePacks.add(threeLetterPackageName);
String fileExt = FilenameUtils.getExtension(languagePack.getName());
if (!languagePack.isDirectory() && TESSERACT_LANGUAGE_PACK_EXT.equals(fileExt)) {
String packageName = FilenameUtils.getBaseName(languagePack.getName());
languagePacks.add(packageName);
}
}
return String.join("+", languagePacks);
}
@ -499,8 +495,8 @@ final class TikaTextExtractor extends TextExtractor {
* Determines how the extraction process will proceed given the settings
* stored in this context instance.
*
* See the ImageConfig class in the extractionconfigs package
for available settings.
* See the ImageConfig class in the textreaderconfigs package for available
* settings.
*
* @param context Instance containing config classes
*/

View File

@ -31,6 +31,7 @@ import org.openide.util.NbBundle.Messages;
import org.openide.util.lookup.Lookups;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.ingest.FileIngestModule;
@ -477,7 +478,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
ImageConfig imageConfig = new ImageConfig();
imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
Lookup extractionContext = Lookups.fixed(imageConfig);
ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
try {
Reader specializedReader = TextReaders.getReader(aFile, extractionContext);

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libtesseract-4.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/lstmeval.exe vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/lstmtraining.exe vendored Executable file

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,13 @@
disable_character_fragments T
file_type .bl
textord_fast_pitch_test T
tessedit_single_match 0
tessedit_zero_rejection T
tessedit_minimal_rejection F
tessedit_write_rep_codes F
il1_adaption_test 1
edges_children_fix F
edges_childarea 0.65
edges_boxarea 0.9
tessedit_train_line_recognizer T
textord_no_rejects T

File diff suppressed because it is too large Load Diff

View File

@ -1,12 +0,0 @@
0oO
lI1
cC
kK
pP
sS
uU
vV
wW
xX
yY
zZ

View File

@ -1,7 +0,0 @@
LeadPunc="({[`'
TrailPunc=}:;-]!?`,.)"'
NumLeadPunc=#({[@$
NumTrailPunc=}):;].,%
Operators=*+-/.:,()[]
Digits=0123456789
Alphas=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

Binary file not shown.

View File

@ -1,14 +0,0 @@
RecoWgt=1.0
SizeWgt=0.2435
OODWgt=0.0214
NumWgt=0.036
CharBigramsWgt=0.1567
MaxSegPerChar=8
BeamWidth=10
ConvGridSize=48
WordUnigramsWgt=0.01
MaxWordAspectRatio=20.0000
MinSpaceHeightRatio=0.5000
MaxSpaceHeightRatio=0.6000
HistWindWid=2
MinConCompSize=0

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,2 +0,0 @@
1-\d\d\d-GOOG-411
www.\n\\\*.com

View File

@ -1,5 +0,0 @@
the
quick
brown
fox
jumped

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.