Merge pull request #3760 from eugene7646/ocr_via_tika_3760

OCR via tika (3760)
This commit is contained in:
Richard Cordovano 2018-05-14 11:38:16 -04:00 committed by GitHub
commit 248e724161
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
100 changed files with 376404 additions and 26 deletions

View File

@ -39,6 +39,11 @@
<copy todir="${basedir}/release/Volatility" >
<fileset dir="${thirdparty.dir}/Volatility"/>
</copy>
<!--Copy Tesseract OCR to release-->
<copy todir="${basedir}/release/Tesseract-OCR" >
<fileset dir="${thirdparty.dir}/Tesseract-OCR"/>
</copy>
<!--Copy other jars-->
<copy file="${thirdparty.dir}/rejistry/Rejistry-1.0-SNAPSHOT.jar" todir="${ext.dir}" />

View File

@ -218,6 +218,7 @@ KeywordSearchJobSettingsPanel.languagesLabel.text=Scripts enabled for string ext
KeywordSearchGlobalLanguageSettingsPanel.enableUTF8Checkbox.text=Enable UTF8 text extraction
KeywordSearchGlobalLanguageSettingsPanel.ingestSettingsLabel.text=Ingest settings for string extraction from unknown file types (changes effective on next ingest):
KeywordSearchGlobalLanguageSettingsPanel.enableUTF16Checkbox.text=Enable UTF16LE and UTF16BE string extraction
KeywordSearchGlobalLanguageSettingsPanel.enableOcrCheckbox.text=Enable Optical Character Recognition (OCR)
KeywordSearchGlobalLanguageSettingsPanel.languagesLabel.text=Enabled scripts (languages):
KeywordSearchGlobalSearchSettingsPanel.timeRadioButton1.toolTipText=20 mins. (fastest ingest time)
KeywordSearchGlobalSearchSettingsPanel.timeRadioButton1.text=20 minutes (slowest feedback, fastest ingest)
@ -309,3 +310,4 @@ ExtractedContentPanel.pageOfLabel.text=of
ExtractedContentPanel.pageTotalLabel.text=-
ExtractedContentPanel.pageButtonsLabel.text=Page
ExtractedContentPanel.pagesLabel.text=Page:

View File

@ -16,21 +16,26 @@
<Layout>
<DimensionLayout dim="0">
<Group type="103" groupAlignment="0" attributes="0">
<Group type="102" alignment="0" attributes="0">
<EmptySpace max="-2" attributes="0"/>
<Group type="102" attributes="0">
<Group type="103" groupAlignment="0" attributes="0">
<Component id="ingestSettingsLabel" alignment="0" min="-2" max="-2" attributes="0"/>
<Group type="102" attributes="0">
<EmptySpace min="10" pref="10" max="-2" attributes="0"/>
<Group type="102" alignment="0" attributes="0">
<EmptySpace max="-2" attributes="0"/>
<Group type="103" groupAlignment="0" attributes="0">
<Component id="ingestSettingsLabel" alignment="0" min="-2" max="-2" attributes="0"/>
<Group type="103" alignment="0" groupAlignment="1" attributes="0">
<Component id="languagesLabel" alignment="0" min="-2" max="-2" attributes="0"/>
<Component id="langPanel" min="-2" max="-2" attributes="0"/>
</Group>
</Group>
</Group>
<Group type="102" alignment="0" attributes="0">
<EmptySpace min="-2" pref="26" max="-2" attributes="0"/>
<Group type="103" groupAlignment="0" max="-2" attributes="0">
<Component id="enableUTF16Checkbox" min="-2" max="-2" attributes="0"/>
<Component id="enableUTF8Checkbox" alignment="0" min="-2" max="-2" attributes="0"/>
<Component id="enableOcrCheckbox" alignment="0" min="-2" max="-2" attributes="0"/>
</Group>
</Group>
<Group type="103" alignment="0" groupAlignment="1" attributes="0">
<Component id="languagesLabel" alignment="0" min="-2" max="-2" attributes="0"/>
<Component id="langPanel" min="-2" max="-2" attributes="0"/>
</Group>
</Group>
<EmptySpace pref="255" max="32767" attributes="0"/>
</Group>
@ -42,13 +47,15 @@
<EmptySpace max="-2" attributes="0"/>
<Component id="ingestSettingsLabel" min="-2" max="-2" attributes="0"/>
<EmptySpace type="unrelated" max="-2" attributes="0"/>
<Component id="enableOcrCheckbox" min="-2" max="-2" attributes="0"/>
<EmptySpace max="32767" attributes="0"/>
<Component id="enableUTF16Checkbox" min="-2" max="-2" attributes="0"/>
<EmptySpace max="-2" attributes="0"/>
<Component id="enableUTF8Checkbox" min="-2" max="-2" attributes="0"/>
<EmptySpace type="separate" max="-2" attributes="0"/>
<Component id="languagesLabel" min="-2" max="-2" attributes="0"/>
<EmptySpace type="unrelated" max="-2" attributes="0"/>
<Component id="langPanel" pref="397" max="32767" attributes="0"/>
<Component id="languagesLabel" min="-2" max="-2" attributes="0"/>
<EmptySpace max="-2" attributes="0"/>
<Component id="langPanel" min="-2" pref="380" max="-2" attributes="0"/>
<EmptySpace max="-2" attributes="0"/>
</Group>
</Group>
@ -86,7 +93,7 @@
</DimensionLayout>
<DimensionLayout dim="1">
<Group type="103" groupAlignment="0" attributes="0">
<EmptySpace min="0" pref="395" max="32767" attributes="0"/>
<EmptySpace min="0" pref="378" max="32767" attributes="0"/>
</Group>
</DimensionLayout>
</Layout>
@ -120,5 +127,15 @@
</Property>
</Properties>
</Component>
<Component class="javax.swing.JCheckBox" name="enableOcrCheckbox">
<Properties>
<Property name="text" type="java.lang.String" editor="org.netbeans.modules.i18n.form.FormI18nStringEditor">
<ResourceString bundle="org/sleuthkit/autopsy/keywordsearch/Bundle.properties" key="KeywordSearchGlobalLanguageSettingsPanel.enableOcrCheckbox.text" replaceFormat="org.openide.util.NbBundle.getMessage({sourceFileName}.class, &quot;{key}&quot;)"/>
</Property>
</Properties>
<Events>
<EventHandler event="actionPerformed" listener="java.awt.event.ActionListener" parameters="java.awt.event.ActionEvent" handler="enableOcrCheckboxActionPerformed"/>
</Events>
</Component>
</SubComponents>
</Form>

View File

@ -40,7 +40,7 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
private final Map<String, StringExtract.StringExtractUnicodeTable.SCRIPT> scripts = new HashMap<>();
private ActionListener updateLanguagesAction;
private List<SCRIPT> toUpdate;
KeywordSearchGlobalLanguageSettingsPanel() {
initComponents();
customizeComponents();
@ -111,6 +111,9 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
enableUTF8Checkbox.setSelected(utf8);
boolean ocr = KeywordSearchSettings.getOcrOption();
enableOcrCheckbox.setSelected(ocr);
final List<SCRIPT> serviceScripts = KeywordSearchSettings.getStringExtractScripts();
final int components = checkPanel.getComponentCount();
@ -141,6 +144,7 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
activateScriptsCheckboxes(extractEnabled && ingestNotRunning);
enableUTF16Checkbox.setEnabled(ingestNotRunning);
enableUTF8Checkbox.setEnabled(ingestNotRunning);
enableOcrCheckbox.setEnabled(ingestNotRunning);
}
/**
@ -158,6 +162,7 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
enableUTF8Checkbox = new javax.swing.JCheckBox();
enableUTF16Checkbox = new javax.swing.JCheckBox();
ingestSettingsLabel = new javax.swing.JLabel();
enableOcrCheckbox = new javax.swing.JCheckBox();
org.openide.awt.Mnemonics.setLocalizedText(languagesLabel, org.openide.util.NbBundle.getMessage(KeywordSearchGlobalLanguageSettingsPanel.class, "KeywordSearchGlobalLanguageSettingsPanel.languagesLabel.text")); // NOI18N
@ -173,7 +178,7 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
);
checkPanelLayout.setVerticalGroup(
checkPanelLayout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING)
.addGap(0, 395, Short.MAX_VALUE)
.addGap(0, 378, Short.MAX_VALUE)
);
langPanel.setViewportView(checkPanel);
@ -194,22 +199,32 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
org.openide.awt.Mnemonics.setLocalizedText(ingestSettingsLabel, org.openide.util.NbBundle.getMessage(KeywordSearchGlobalLanguageSettingsPanel.class, "KeywordSearchGlobalLanguageSettingsPanel.ingestSettingsLabel.text")); // NOI18N
org.openide.awt.Mnemonics.setLocalizedText(enableOcrCheckbox, org.openide.util.NbBundle.getMessage(KeywordSearchGlobalLanguageSettingsPanel.class, "KeywordSearchGlobalLanguageSettingsPanel.enableOcrCheckbox.text")); // NOI18N
enableOcrCheckbox.addActionListener(new java.awt.event.ActionListener() {
public void actionPerformed(java.awt.event.ActionEvent evt) {
enableOcrCheckboxActionPerformed(evt);
}
});
javax.swing.GroupLayout layout = new javax.swing.GroupLayout(this);
this.setLayout(layout);
layout.setHorizontalGroup(
layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING)
.addGroup(layout.createSequentialGroup()
.addContainerGap()
.addGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING)
.addComponent(ingestSettingsLabel)
.addGroup(layout.createSequentialGroup()
.addGap(10, 10, 10)
.addContainerGap()
.addGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING)
.addComponent(ingestSettingsLabel)
.addGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.TRAILING)
.addComponent(languagesLabel, javax.swing.GroupLayout.Alignment.LEADING)
.addComponent(langPanel, javax.swing.GroupLayout.PREFERRED_SIZE, javax.swing.GroupLayout.DEFAULT_SIZE, javax.swing.GroupLayout.PREFERRED_SIZE))))
.addGroup(layout.createSequentialGroup()
.addGap(26, 26, 26)
.addGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING, false)
.addComponent(enableUTF16Checkbox)
.addComponent(enableUTF8Checkbox)))
.addGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.TRAILING)
.addComponent(languagesLabel, javax.swing.GroupLayout.Alignment.LEADING)
.addComponent(langPanel, javax.swing.GroupLayout.PREFERRED_SIZE, javax.swing.GroupLayout.DEFAULT_SIZE, javax.swing.GroupLayout.PREFERRED_SIZE)))
.addComponent(enableUTF8Checkbox)
.addComponent(enableOcrCheckbox))))
.addContainerGap(255, Short.MAX_VALUE))
);
layout.setVerticalGroup(
@ -218,13 +233,15 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
.addContainerGap()
.addComponent(ingestSettingsLabel)
.addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.UNRELATED)
.addComponent(enableOcrCheckbox)
.addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.RELATED, javax.swing.GroupLayout.DEFAULT_SIZE, Short.MAX_VALUE)
.addComponent(enableUTF16Checkbox)
.addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.RELATED)
.addComponent(enableUTF8Checkbox)
.addGap(18, 18, 18)
.addComponent(languagesLabel)
.addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.UNRELATED)
.addComponent(langPanel, javax.swing.GroupLayout.DEFAULT_SIZE, 397, Short.MAX_VALUE)
.addComponent(languagesLabel)
.addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.RELATED)
.addComponent(langPanel, javax.swing.GroupLayout.PREFERRED_SIZE, 380, javax.swing.GroupLayout.PREFERRED_SIZE)
.addContainerGap())
);
}// </editor-fold>//GEN-END:initComponents
@ -246,8 +263,13 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
firePropertyChange(OptionsPanelController.PROP_CHANGED, null, null);
}//GEN-LAST:event_enableUTF16CheckboxActionPerformed
/**
 * Action handler for the OCR checkbox: fires a
 * OptionsPanelController.PROP_CHANGED property change (with null old and new
 * values) on this panel.
 *
 * @param evt The action event delivered by the checkbox; not used.
 */
private void enableOcrCheckboxActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_enableOcrCheckboxActionPerformed
firePropertyChange(OptionsPanelController.PROP_CHANGED, null, null);
}//GEN-LAST:event_enableOcrCheckboxActionPerformed
// Variables declaration - do not modify//GEN-BEGIN:variables
private javax.swing.JPanel checkPanel;
private javax.swing.JCheckBox enableOcrCheckbox;
private javax.swing.JCheckBox enableUTF16Checkbox;
private javax.swing.JCheckBox enableUTF8Checkbox;
private javax.swing.JLabel ingestSettingsLabel;
@ -261,6 +283,7 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
Boolean.toString(enableUTF8Checkbox.isSelected()));
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
Boolean.toString(enableUTF16Checkbox.isSelected()));
KeywordSearchSettings.setOcrOption(enableOcrCheckbox.isSelected());
if (toUpdate != null) {
KeywordSearchSettings.setStringExtractScripts(toUpdate);

View File

@ -38,7 +38,9 @@ class KeywordSearchSettings {
static final String PROPERTIES_NSRL = NbBundle.getMessage(KeywordSearchSettings.class, "KeywordSearchSettings.propertiesNSRL.text", MODULE_NAME);
static final String PROPERTIES_SCRIPTS = NbBundle.getMessage(KeywordSearchSettings.class, "KeywordSearchSettings.propertiesScripts.text", MODULE_NAME);
static final String SHOW_SNIPPETS = "showSnippets"; //NON-NLS
static final boolean DEFAULT_SHOW_SNIPPETS = true;
static final boolean DEFAULT_SHOW_SNIPPETS = true;
static final String OCR_ENABLED = "ocrEnabled"; //NON-NLS
static final boolean OCR_ENABLED_DEFAULT = false; // NON-NLS
private static boolean skipKnown = true;
private static final Logger logger = Logger.getLogger(KeywordSearchSettings.class.getName());
private static UpdateFrequency UpdateFreq = UpdateFrequency.DEFAULT;
@ -127,7 +129,27 @@ class KeywordSearchSettings {
stringExtractOptions.put(key, val);
ModuleSettings.setConfigSetting(PROPERTIES_OPTIONS, key, val);
}
/**
 * Writes the OCR enabled/disabled choice to permanent storage under the
 * OCR_ENABLED key of the PROPERTIES_OPTIONS settings.
 *
 * @param enabled True to store the setting as "true", false to store it as
 *                "false".
 */
static void setOcrOption(boolean enabled) {
    final String storedValue;
    if (enabled) {
        storedValue = "true"; //NON-NLS
    } else {
        storedValue = "false"; //NON-NLS
    }
    ModuleSettings.setConfigSetting(PROPERTIES_OPTIONS, OCR_ENABLED, storedValue);
}
/**
 * Gets the OCR setting from permanent storage.
 *
 * @return True only if a stored value exists and is exactly "true";
 *         OCR_ENABLED_DEFAULT if no value is stored or the stored value
 *         cannot be read; false for any other stored value.
 */
static boolean getOcrOption() {
    if (!ModuleSettings.settingExists(PROPERTIES_OPTIONS, OCR_ENABLED)) {
        return OCR_ENABLED_DEFAULT;
    }
    final String storedValue = ModuleSettings.getConfigSetting(PROPERTIES_OPTIONS, OCR_ENABLED);
    if (storedValue == null) {
        // NOTE(review): assumes getConfigSetting can return null even after
        // the existence check (e.g. on a read failure) - confirm against
        // ModuleSettings. Falling back avoids a NullPointerException here.
        return OCR_ENABLED_DEFAULT;
    }
    return storedValue.equals("true"); //NON-NLS
}
static void setShowSnippets(boolean showSnippets) {
ModuleSettings.setConfigSetting(PROPERTIES_OPTIONS, SHOW_SNIPPETS, (showSnippets ? "true" : "false")); //NON-NLS
}
@ -219,6 +241,11 @@ class KeywordSearchSettings {
logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
}
//setting OCR default (disabled by default)
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, OCR_ENABLED)) {
logger.log(Level.INFO, "No configuration for OCR found, generating defaults..."); //NON-NLS
KeywordSearchSettings.setOcrOption(OCR_ENABLED_DEFAULT);
}
//setting default Latin-1 Script
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_SCRIPTS, SCRIPT.LATIN_1.name())) {
logger.log(Level.INFO, "No configuration for Scripts found, generating defaults..."); //NON-NLS

View File

@ -19,9 +19,11 @@
package org.sleuthkit.autopsy.keywordsearch;
import com.google.common.io.CharSource;
import java.io.File;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.nio.file.Paths;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@ -38,8 +40,12 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParsingReader;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.openide.util.NbBundle;
import org.openide.modules.InstalledFileLocator;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.PlatformUtil;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ReadContentInputStream;
@ -53,6 +59,10 @@ class TikaTextExtractor extends ContentTextExtractor {
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
private final AutoDetectParser parser = new AutoDetectParser();
private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
private static final File TESSERACT_PATH = locateTesseractExecutable();
private static final List<String> TIKA_SUPPORTED_TYPES
= new Tika().getParser().getSupportedTypes(new ParseContext())
@ -79,6 +89,30 @@ class TikaTextExtractor extends ContentTextExtractor {
officeParserConfig.setUseSAXPptxExtractor(true);
officeParserConfig.setUseSAXDocxExtractor(true);
parseContext.set(OfficeParserConfig.class, officeParserConfig);
// configure OCR if it is enabled in KWS settings and installed on the machine
if (TESSERACT_PATH != null && KeywordSearchSettings.getOcrOption() && PlatformUtil.isWindowsOS() == true) {
// configure PDFParser.
PDFParserConfig pdfConfig = new PDFParserConfig();
// Extracting the inline images and letting Tesseract run on each inline image.
// https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
// https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
pdfConfig.setExtractInlineImages(true);
// Multiple pages within a PDF file might refer to the same underlying image.
pdfConfig.setExtractUniqueInlineImagesOnly(true);
parseContext.set(PDFParserConfig.class, pdfConfig);
// Configure Tesseract parser to perform OCR
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
String tesseractFolder = TESSERACT_PATH.getParent();
ocrConfig.setTesseractPath(tesseractFolder);
// Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata".
// If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them
ocrConfig.setLanguage("eng");
parseContext.set(TesseractOCRConfig.class, ocrConfig);
}
//Parse the file in a task, a convenient way to have a timeout...
final Future<Reader> future = tikaParseExecutor.submit(() -> new ParsingReader(parser, stream, metadata, parseContext));
@ -112,6 +146,29 @@ class TikaTextExtractor extends ContentTextExtractor {
}
}
/**
 * Locates the Tesseract executable (TESSERACT_EXECUTABLE inside
 * TESSERACT_DIR_NAME) through the InstalledFileLocator.
 *
 * @return The executable file, or null if the platform is not Windows, the
 *         locator does not find the file, or the file is not executable.
 */
private static File locateTesseractExecutable() {
    if (PlatformUtil.isWindowsOS()) {
        final String relativeExePath = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
        final InstalledFileLocator locator = InstalledFileLocator.getDefault();
        final File candidate = locator.locate(relativeExePath, TikaTextExtractor.class.getPackage().getName(), false);
        // Only hand back a file that was actually found and can be run.
        if (candidate != null && candidate.canExecute()) {
            return candidate;
        }
    }
    return null;
}
/**
* Gets a CharSource that wraps a formatted representation of the given
* Metadata.

BIN
thirdparty/Tesseract-OCR/ambiguous_words.exe vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/classifier_tester.exe vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/cntraining.exe vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/combine_tessdata.exe vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/dawg2wordlist.exe vendored Executable file

Binary file not shown.

42
thirdparty/Tesseract-OCR/doc/AUTHORS vendored Executable file
View File

@ -0,0 +1,42 @@
Ray Smith (lead developer) <theraysmith@gmail.com>
Ahmad Abdulkader
Rika Antonova
Nicholas Beato
Jeff Breidenbach
Samuel Charron
Phil Cheatle
Simon Crouch
David Eger
Sheelagh Huddleston
Dan Johnson
Rajesh Katikam
Thomas Kielbus
Dar-Shyang Lee
Zongyi (Joe) Liu
Robert Moss
Chris Newton
Michael Reimer
Marius Renn
Raquel Romano
Christy Russon
Shobhit Saxena
Mark Seaman
Faisal Shafait
Hiroshi Takenaka
Ranjith Unnikrishnan
Joern Wanke
Ping Ping Xiu
Andrew Ziem
Oscar Zuniga
Community Contributors:
Zdenko Podobný (Maintainer)
Jim Regan (Maintainer)
James R Barlow
Amit Dovev
Martin Ettl
Tom Morris
Tobias Müller
Egor Pugin
Sundar M. Vaidya
Stefan Weil

21
thirdparty/Tesseract-OCR/doc/COPYING vendored Executable file
View File

@ -0,0 +1,21 @@
This package contains the Tesseract Open Source OCR Engine.
Originally developed at Hewlett Packard Laboratories Bristol and
at Hewlett Packard Co, Greeley Colorado, all the code
in this distribution is now licensed under the Apache License:
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
Other Dependencies and Licenses:
================================
Tesseract uses Leptonica library (http://leptonica.com/) which essentially
uses a BSD 2-clause license. (http://leptonica.com/about-the-license.html)

43
thirdparty/Tesseract-OCR/doc/README vendored Executable file
View File

@ -0,0 +1,43 @@
How to run UNLV tests.
The scripts in this directory make it possible to duplicate the tests
published in the Fourth Annual Test of OCR Accuracy.
See http://www.isri.unlv.edu/downloads/AT-1995.pdf
but first you have to get the tools and data from UNLV:
Step 1: To download the images, go to
http://www.isri.unlv.edu/ISRI/OCRtk
and get 3b.tgz, Bb.tgz, Mb.tgz and Nb.tgz.
Step 2: extract the files. It doesn't really matter where
in your filesystem you put them, but they must go under a common
root so you have directories 3, B, M and N in, for example,
/users/me/ISRI-OCRtk.
Step 3: Reorg the files
The lack of tif extensions on the images is inconvenient, so there
is a script to reorganize the data to match the rest of the test
scripts.
cd to /users/me/ISRI-OCRtk or wherever 3, B, M and N ended up and run
/blah/blah/tesseract-ocr/testing/reorgdata.sh 3B
This makes directories doe3.3B, bus.3B, mag.3B and news.3B.
You can now get rid of 3, B, M, and N unless you want to get some of the
other scanning resolutions out of them.
Step 4: Download the ISRI toolkit from:
http://www.isri.unlv.edu/downloads/ftk-1.0.tgz
Step 5: If they work for you, use the binaries directly from the bin
directory and put them in tesseract-ocr/testing/unlv
otherwise build the tools for yourself and put them there.
Step 6: cd back to your main tesseract-ocr dir and Build tesseract.
Step 7: run testing/runalltests.sh with the root data dir and testname:
testing/runalltests.sh /users/me/ISRI-OCRtk tess2.0
and go to the gym, have lunch etc.
Step 8: There should be a file
testing/reports/tess2.0.summary that contains the final summarized accuracy
report and comparison with the 1995 results.

BIN
thirdparty/Tesseract-OCR/doc/eurotext.tif vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/doc/phototest.tif vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/iconv.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/icudata51.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/icui18n51.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/icuuc51.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/java/ScrollView.jar vendored Executable file

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libbz2-1.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libcairo-2.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libexpat-1.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libffi-6.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libfontconfig-1.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libfreetype-6.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libgcc_s_sjlj-1.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libgif-4.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libglib-2.0-0.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libgobject-2.0-0.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libgomp-1.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libharfbuzz-0.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libintl-8.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libjbig-2.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libjpeg-8.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/liblept-5.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/liblzma-5.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libopenjp2.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libpango-1.0-0.dll vendored Executable file

Binary file not shown.

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libpangoft2-1.0-0.dll vendored Executable file

Binary file not shown.

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libpixman-1-0.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libpng16-16.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libstdc++-6.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libtesseract-3.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libtiff-5.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libwebp-5.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/libwinpthread-1.dll vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/mftraining.exe vendored Executable file

Binary file not shown.

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/shapeclustering.exe vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/tar.exe vendored Executable file

Binary file not shown.

View File

@ -0,0 +1,7 @@
tessedit_ambigs_training 1
load_freq_dawg 0
load_punc_dawg 0
load_system_dawg 0
load_number_dawg 0
ambigs_debug_level 3
load_fixed_length_dawgs 0

View File

@ -0,0 +1 @@
tessedit_zero_rejection T

View File

@ -0,0 +1,5 @@
load_bigram_dawg True
tessedit_enable_bigram_correction True
tessedit_bigram_debug 3
save_raw_choices True
save_alt_choices True

View File

@ -0,0 +1,14 @@
disable_character_fragments T
file_type .bl
textord_fast_pitch_test T
tessedit_single_match 0
tessedit_zero_rejection T
tessedit_minimal_rejection F
tessedit_write_rep_codes F
il1_adaption_test 1
edges_children_fix F
edges_childarea 0.65
edges_boxarea 0.9
tessedit_resegment_from_boxes T
tessedit_train_from_boxes T
textord_no_rejects T

View File

@ -0,0 +1,15 @@
file_type .bl
#tessedit_use_nn F
textord_fast_pitch_test T
tessedit_single_match 0
tessedit_zero_rejection T
tessedit_minimal_rejection F
tessedit_write_rep_codes F
il1_adaption_test 1
edges_children_fix F
edges_childarea 0.65
edges_boxarea 0.9
tessedit_resegment_from_boxes T
tessedit_train_from_boxes T
#textord_repeat_extraction F
textord_no_rejects T

View File

@ -0,0 +1 @@
tessedit_char_whitelist 0123456789-.

View File

@ -0,0 +1,3 @@
tessedit_create_hocr 1
tessedit_pageseg_mode 1
hocr_font_info 0

View File

@ -0,0 +1,2 @@
interactive_display_mode T
tessedit_display_outwords T

View File

@ -0,0 +1,4 @@
textord_skewsmooth_offset 8
textord_skewsmooth_offset2 8
textord_merge_desc 0.5
textord_no_rejects 1

View File

@ -0,0 +1,2 @@
tessedit_resegment_from_line_boxes 1
tessedit_make_boxes_from_boxes 1

View File

@ -0,0 +1 @@
debug_file tesseract.log

View File

@ -0,0 +1 @@
tessedit_create_boxfile 1

View File

@ -0,0 +1,2 @@
tessedit_create_pdf 1
tessedit_pageseg_mode 1

View File

@ -0,0 +1 @@
debug_file /dev/null

View File

@ -0,0 +1,2 @@
tessedit_resegment_from_boxes 1
tessedit_make_boxes_from_boxes 1

View File

@ -0,0 +1,12 @@
textord_show_blobs 0
textord_debug_tabfind 3
textord_tabfind_show_partitions 1
textord_tabfind_show_initial_partitions 1
textord_tabfind_show_columns 1
textord_tabfind_show_blocks 1
textord_tabfind_show_initialtabs 1
textord_tabfind_show_finaltabs 1
textord_tabfind_show_strokewidths 1
textord_tabfind_show_vlines 0
textord_tabfind_show_images 1
tessedit_dump_pageseg_images 0

View File

@ -0,0 +1,2 @@
tessedit_create_tsv 1
tessedit_pageseg_mode 1

View File

@ -0,0 +1,3 @@
# This config file should be used with other config files which create renderers.
# usage example: tesseract eurotext.tif eurotext txt hocr pdf
tessedit_create_txt 1

View File

@ -0,0 +1,2 @@
tessedit_write_unlv 1
tessedit_pageseg_mode 6

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,12 @@
0oO
lI1
cC
kK
pP
sS
uU
vV
wW
xX
yY
zZ

View File

@ -0,0 +1,7 @@
LeadPunc="({[`'
TrailPunc=}:;-]!?`,.)"'
NumLeadPunc=#({[@$
NumTrailPunc=}):;].,%
Operators=*+-/.:,()[]
Digits=0123456789
Alphas=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

BIN
thirdparty/Tesseract-OCR/tessdata/eng.cube.nn vendored Executable file

Binary file not shown.

View File

@ -0,0 +1,14 @@
RecoWgt=1.0
SizeWgt=0.2435
OODWgt=0.0214
NumWgt=0.036
CharBigramsWgt=0.1567
MaxSegPerChar=8
BeamWidth=10
ConvGridSize=48
WordUnigramsWgt=0.01
MaxWordAspectRatio=20.0000
MinSpaceHeightRatio=0.5000
MaxSpaceHeightRatio=0.6000
HistWindWid=2
MinConCompSize=0

194633
thirdparty/Tesseract-OCR/tessdata/eng.cube.size vendored Executable file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,2 @@
1-\d\d\d-GOOG-411
www.\n\\\*.com

View File

@ -0,0 +1,5 @@
the
quick
brown
fox
jumped

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/tessdata/pdf.ttf vendored Executable file

Binary file not shown.

View File

@ -0,0 +1,2 @@
# No content needed as all defaults are correct.

View File

@ -0,0 +1,2 @@
chop_enable 0
wordrec_enable_assoc 0

View File

@ -0,0 +1,7 @@
#################################################
# Adaptive Matcher Using PreAdapted Templates
#################################################
classify_enable_adaptive_debugger 1
matcher_debug_flags 6
matcher_debug_level 1

View File

@ -0,0 +1,13 @@
#################################################
# Adaptive Matcher Using PreAdapted Templates
#################################################
classify_enable_adaptive_debugger 1
matcher_debug_flags 6
matcher_debug_level 1
wordrec_display_splits 0
wordrec_display_all_words 1
wordrec_display_all_blobs 1
wordrec_display_segmentations 2
classify_debug_level 1

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,10 @@
#################################################
# Adaptive Matcher Using PreAdapted Templates
#################################################
wordrec_display_splits 0
wordrec_display_all_words 1
wordrec_display_all_blobs 1
wordrec_display_segmentations 2
classify_debug_level 1
stopper_debug_level 1

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/tesseract.exe vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/text2image.exe vendored Executable file

Binary file not shown.

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/wordlist2dawg.exe vendored Executable file

Binary file not shown.

BIN
thirdparty/Tesseract-OCR/zlib1.dll vendored Executable file

Binary file not shown.