Upgraded to Tesseract 4.0

2025-07-06 21:00:22 +00:00 · 2018-12-18 13:08:16 -05:00 · 2018-12-18 13:08:16 -05:00 · b2611465ef
commit b2611465ef
parent 2bf92ad219
34 changed files with 60 additions and 376075 deletions
--- a/Core/src/org/sleuthkit/autopsy/textreaders/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textreaders/TikaTextExtractor.java
@ -29,8 +29,8 @@ import java.io.InputStream;
 import java.io.PushbackReader;
 import java.io.Reader;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Objects;
 import java.util.concurrent.Callable;
@ -60,6 +60,7 @@ import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
 import org.sleuthkit.autopsy.coreutils.ExecUtil;
 import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
 import org.sleuthkit.autopsy.coreutils.FileUtil;
 import org.sleuthkit.autopsy.coreutils.PlatformUtil;
 import org.sleuthkit.autopsy.textreaders.textreaderconfigs.ImageConfig;
 import org.sleuthkit.autopsy.datamodel.ContentUtils;
@ -122,8 +123,8 @@ final class TikaTextExtractor extends TextExtractor {
    private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
-    private final ThreadFactory tikaThreadFactory = 
+    private final ThreadFactory tikaThreadFactory
-            new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();
+            = new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();
    private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
    private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
@ -135,8 +136,10 @@ final class TikaTextExtractor extends TextExtractor {
    private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
    private static final File TESSERACT_PATH = locateTesseractExecutable();
    private static final String LANGUAGE_PACKS = getLanguagePacks();
    private static final String TESSERACT_LANGUAGE_PACK_EXT = "traineddata"; //NON-NLS
    private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
    private ProcessTerminator processTerminator;
    private static final String TESSERACT_OUTPUT_FILE_NAME = "output";
    private static final List<String> TIKA_SUPPORTED_TYPES
            = new Tika().getParser().getSupportedTypes(new ParseContext())
@ -182,7 +185,7 @@ final class TikaTextExtractor extends TextExtractor {
            AbstractFile file = ((AbstractFile) content);
            //Run OCR on images with Tesseract directly. 
            if (file.getMIMEType().toLowerCase().startsWith("image/")) {
-                stream = runOcrAndGetOutputStream(file);
+                stream = performOCR(file);
            } else {
                //Otherwise, go through Tika for PDFs so that it can
                //extract images and run Tesseract on them.     
@ -201,15 +204,15 @@ final class TikaTextExtractor extends TextExtractor {
                String tesseractFolder = TESSERACT_PATH.getParent();
                ocrConfig.setTesseractPath(tesseractFolder);
                /*
-                 * Tesseract expects language data packs to be in a
+                 * Tesseract expects language data packs to be in a subdirectory
-                 * subdirectory of tesseractFolder, in a folder called
+                 * of tesseractFolder, in a folder called "tessdata". If they
-                 * "tessdata". If they are stored somewhere else, use
+                 * are stored somewhere else, use
-                 * ocrConfig.setTessdataPath(String tessdataPath) to point
+                 * ocrConfig.setTessdataPath(String tessdataPath) to point to
-                 * to them
+                 * them
                 */
                ocrConfig.setLanguage(LANGUAGE_PACKS);
                parseContext.set(TesseractOCRConfig.class, ocrConfig);
-                
+
                stream = new ReadContentInputStream(content);
            }
        } else {
@ -228,8 +231,7 @@ final class TikaTextExtractor extends TextExtractor {
        Future<Reader> future = executorService.submit(
                new GetTikaReader(parser, stream, metadata, parseContext));
        try {
-            final Reader tikaReader = future.get(getTimeout(content.getSize()),
+            final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
                    TimeUnit.SECONDS);
            //check if the reader is empty
            PushbackReader pushbackReader = new PushbackReader(tikaReader);
            int read = pushbackReader.read();
@ -238,11 +240,9 @@ final class TikaTextExtractor extends TextExtractor {
                        + "Tika returned empty reader for " + content);
            }
            pushbackReader.unread(read);
            //concatenate parsed content and meta data into a single reader.
            CharSource metaDataCharSource = getMetaDataCharSource(metadata);
-            return CharSource.concat(new ReaderCharSource(pushbackReader),
+            return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
                    metaDataCharSource).openStream();
        } catch (TimeoutException te) {
            final String msg = NbBundle.getMessage(this.getClass(),
                    "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
@ -273,19 +273,19 @@ final class TikaTextExtractor extends TextExtractor {
     * @throws
     * org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException
     */
-    private InputStream runOcrAndGetOutputStream(AbstractFile file) throws ExtractionException {
+    private InputStream performOCR(AbstractFile file) throws ExtractionException {
        File inputFile = null;
        File outputFile = null;
        try {
            String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
            //Appending file id makes the name unique
-            String tempFileName = file.getId() + file.getName();
+            String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
-            inputFile = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
+            inputFile = Paths.get(tempDirectory, tempFileName).toFile();
                    tempFileName).toFile();
            ContentUtils.writeToFile(content, inputFile);
-            String tempOutputName = file.getId() + TESSERACT_OUTPUT_FILE_NAME;
+            String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
-            String outputFilePath = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
+            String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
                    tempOutputName).toString();
            String executeablePath = TESSERACT_PATH.toString();
            //Build tesseract commands
@ -303,7 +303,7 @@ final class TikaTextExtractor extends TextExtractor {
            } else {
                ExecUtil.execute(process);
            }
-            
+
            outputFile = new File(outputFilePath + ".txt");
            //Open a stream of the Tesseract text file and send this to Tika
            return new CleanUpStream(outputFile);
@ -324,6 +324,7 @@ final class TikaTextExtractor extends TextExtractor {
     * cancelled.
     */
    private class GetTikaReader implements Callable<Reader> {
        private final AutoDetectParser parser;
        private final InputStream stream;
        private final Metadata metadata;
@ -354,9 +355,10 @@ final class TikaTextExtractor extends TextExtractor {
        /**
         * Store a reference to file on construction
-         * 
+         *
         * @param file
-         * @throws FileNotFoundException 
+         *
         * @throws FileNotFoundException
         */
        public CleanUpStream(File file) throws FileNotFoundException {
            super(file);
@ -365,8 +367,8 @@ final class TikaTextExtractor extends TextExtractor {
        /**
         * Delete this underlying file when close is called.
-         * 
+         *
-         * @throws IOException 
+         * @throws IOException
         */
        @Override
        public void close() throws IOException {
@ -450,25 +452,19 @@ final class TikaTextExtractor extends TextExtractor {
     */
    private static String getLanguagePacks() {
        File languagePackRootDir = new File(TESSERACT_PATH.getParent(), "tessdata");
-        //Acceptable extensions for Tesseract-OCR version 3.05 language packs.
+        if (!languagePackRootDir.exists()) {
-        //All extensions other than traineddata are associated with cube files that
+            return "";
-        //have been made obsolete since version 4.0.
+        }
-        List<String> acceptableExtensions = Arrays.asList("traineddata", "params",
+
-                "lm", "fold", "bigrams", "nn", "word-freq", "size",
+        List<String> languagePacks = new ArrayList<>();
-                "user-patterns", "user-words");
+        for (File languagePack : languagePackRootDir.listFiles()) {
-        //Pull out only unique languagePacks
+            String fileExt = FilenameUtils.getExtension(languagePack.getName()); 
-        HashSet<String> languagePacks = new HashSet<>();
+            if (!languagePack.isDirectory() && TESSERACT_LANGUAGE_PACK_EXT.equals(fileExt)) {
-        if (languagePackRootDir.exists()) {
+                String packageName = FilenameUtils.getBaseName(languagePack.getName());
-            for (File languagePack : languagePackRootDir.listFiles()) {
+                languagePacks.add(packageName);
                if (languagePack.isDirectory() || !acceptableExtensions.contains(
                        FilenameUtils.getExtension(languagePack.getName()))) {
                    continue;
                }
                String threeLetterPackageName = languagePack.getName().substring(0, 3);
                //Ignore the eng language pack if accidentally added
                languagePacks.add(threeLetterPackageName);
            }
        }
        return String.join("+", languagePacks);
    }
@ -499,8 +495,8 @@ final class TikaTextExtractor extends TextExtractor {
     * Determines how the extraction process will proceed given the settings
     * stored in this context instance.
     *
-     * See the ImageConfig class in the extractionconfigs package
+     * See the ImageConfig class in the extractionconfigs package for available
- for available settings.
+     * settings.
     *
     * @param context Instance containing config classes
     */
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@ -31,6 +31,7 @@ import org.openide.util.NbBundle.Messages;
 import org.openide.util.lookup.Lookups;
 import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
 import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
 import org.sleuthkit.autopsy.ingest.FileIngestModule;
@ -477,10 +478,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
        private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
            ImageConfig imageConfig = new ImageConfig();
            imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
-            Lookup extractionContext = Lookups.fixed(imageConfig);
+            ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
            Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
            try {
-                Reader specializedReader = TextReaders.getReader(aFile,extractionContext);
+                Reader specializedReader = TextReaders.getReader(aFile, extractionContext);
                //divide into chunks and index
                return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
            } catch (TextReaders.NoTextReaderFound ex) {
--- a/thirdparty/Tesseract-OCR/ambiguous_words.exe
+++ b/thirdparty/Tesseract-OCR/ambiguous_words.exe
--- a/thirdparty/Tesseract-OCR/classifier_tester.exe
+++ b/thirdparty/Tesseract-OCR/classifier_tester.exe
--- a/thirdparty/Tesseract-OCR/cntraining.exe
+++ b/thirdparty/Tesseract-OCR/cntraining.exe
--- a/thirdparty/Tesseract-OCR/combine_tessdata.exe
+++ b/thirdparty/Tesseract-OCR/combine_tessdata.exe
--- a/thirdparty/Tesseract-OCR/dawg2wordlist.exe
+++ b/thirdparty/Tesseract-OCR/dawg2wordlist.exe
--- a/thirdparty/Tesseract-OCR/java/ScrollView.jar
+++ b/thirdparty/Tesseract-OCR/java/ScrollView.jar
--- a/thirdparty/Tesseract-OCR/libgcc_s_sjlj-1.dll
+++ b/thirdparty/Tesseract-OCR/libgcc_s_sjlj-1.dll
--- a/thirdparty/Tesseract-OCR/libgomp-1.dll
+++ b/thirdparty/Tesseract-OCR/libgomp-1.dll
--- a/thirdparty/Tesseract-OCR/libstdc++-6.dll
+++ b/thirdparty/Tesseract-OCR/libstdc++-6.dll
--- a/thirdparty/Tesseract-OCR/libtesseract-4.dll
+++ b/thirdparty/Tesseract-OCR/libtesseract-4.dll
--- a/thirdparty/Tesseract-OCR/lstmeval.exe
+++ b/thirdparty/Tesseract-OCR/lstmeval.exe
--- a/thirdparty/Tesseract-OCR/lstmtraining.exe
+++ b/thirdparty/Tesseract-OCR/lstmtraining.exe
--- a/thirdparty/Tesseract-OCR/mftraining.exe
+++ b/thirdparty/Tesseract-OCR/mftraining.exe
--- a/thirdparty/Tesseract-OCR/set_unicharset_properties.exe
+++ b/thirdparty/Tesseract-OCR/set_unicharset_properties.exe
--- a/thirdparty/Tesseract-OCR/shapeclustering.exe
+++ b/thirdparty/Tesseract-OCR/shapeclustering.exe
--- a/thirdparty/Tesseract-OCR/tessdata/configs/lstm.train
+++ b/thirdparty/Tesseract-OCR/tessdata/configs/lstm.train
@ -0,0 +1,13 @@
 disable_character_fragments T
 file_type                   .bl
 textord_fast_pitch_test	T
 tessedit_single_match	0
 tessedit_zero_rejection T
 tessedit_minimal_rejection F
 tessedit_write_rep_codes F
 il1_adaption_test 1
 edges_children_fix F
 edges_childarea 0.65
 edges_boxarea 0.9
 tessedit_train_line_recognizer T
 textord_no_rejects T
--- a/thirdparty/Tesseract-OCR/tessdata/eng.cube.bigrams
+++ b/thirdparty/Tesseract-OCR/tessdata/eng.cube.bigrams
--- a/thirdparty/Tesseract-OCR/tessdata/eng.cube.fold
+++ b/thirdparty/Tesseract-OCR/tessdata/eng.cube.fold
@ -1,12 +0,0 @@
 0oO
 lI1
 cC
 kK
 pP
 sS
 uU
 vV
 wW
 xX
 yY
 zZ
--- a/thirdparty/Tesseract-OCR/tessdata/eng.cube.lm
+++ b/thirdparty/Tesseract-OCR/tessdata/eng.cube.lm
@ -1,7 +0,0 @@
 LeadPunc="({[`'
 TrailPunc=}:;-]!?`,.)"'
 NumLeadPunc=#({[@$
 NumTrailPunc=}):;].,%
 Operators=*+-/.:,()[]
 Digits=0123456789
 Alphas=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
--- a/thirdparty/Tesseract-OCR/tessdata/eng.cube.nn
+++ b/thirdparty/Tesseract-OCR/tessdata/eng.cube.nn
--- a/thirdparty/Tesseract-OCR/tessdata/eng.cube.params
+++ b/thirdparty/Tesseract-OCR/tessdata/eng.cube.params
@ -1,14 +0,0 @@
 RecoWgt=1.0
 SizeWgt=0.2435
 OODWgt=0.0214
 NumWgt=0.036
 CharBigramsWgt=0.1567
 MaxSegPerChar=8
 BeamWidth=10
 ConvGridSize=48
 WordUnigramsWgt=0.01
 MaxWordAspectRatio=20.0000
 MinSpaceHeightRatio=0.5000
 MaxSpaceHeightRatio=0.6000
 HistWindWid=2
 MinConCompSize=0
--- a/thirdparty/Tesseract-OCR/tessdata/eng.cube.size
+++ b/thirdparty/Tesseract-OCR/tessdata/eng.cube.size
--- a/thirdparty/Tesseract-OCR/tessdata/eng.cube.word-freq
+++ b/thirdparty/Tesseract-OCR/tessdata/eng.cube.word-freq
--- a/thirdparty/Tesseract-OCR/tessdata/eng.tesseract_cube.nn
+++ b/thirdparty/Tesseract-OCR/tessdata/eng.tesseract_cube.nn
--- a/thirdparty/Tesseract-OCR/tessdata/eng.traineddata
+++ b/thirdparty/Tesseract-OCR/tessdata/eng.traineddata
--- a/thirdparty/Tesseract-OCR/tessdata/eng.user-patterns
+++ b/thirdparty/Tesseract-OCR/tessdata/eng.user-patterns
@ -1,2 +0,0 @@
 1-\d\d\d-GOOG-411
 www.\n\\\*.com
--- a/thirdparty/Tesseract-OCR/tessdata/eng.user-words
+++ b/thirdparty/Tesseract-OCR/tessdata/eng.user-words
@ -1,5 +0,0 @@
 the
 quick
 brown
 fox
 jumped
--- a/thirdparty/Tesseract-OCR/tessdata/enm.traineddata
+++ b/thirdparty/Tesseract-OCR/tessdata/enm.traineddata
--- a/thirdparty/Tesseract-OCR/tesseract.exe
+++ b/thirdparty/Tesseract-OCR/tesseract.exe
--- a/thirdparty/Tesseract-OCR/text2image.exe
+++ b/thirdparty/Tesseract-OCR/text2image.exe
--- a/thirdparty/Tesseract-OCR/unicharset_extractor.exe
+++ b/thirdparty/Tesseract-OCR/unicharset_extractor.exe
--- a/thirdparty/Tesseract-OCR/wordlist2dawg.exe
+++ b/thirdparty/Tesseract-OCR/wordlist2dawg.exe