Merge pull request #4478 from dannysmyda/4656-allow-user-to-pick-language

4656 - Add OCR language pack configuration
This commit is contained in:
Richard Cordovano 2019-01-25 15:06:09 -05:00 committed by GitHub
commit 63bb7f805e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 62 additions and 26 deletions

View File

@ -36,6 +36,7 @@ import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import javax.swing.filechooser.FileSystemView; import javax.swing.filechooser.FileSystemView;
import org.apache.commons.io.FilenameUtils;
import org.hyperic.sigar.Sigar; import org.hyperic.sigar.Sigar;
import org.hyperic.sigar.ptql.ProcessFinder; import org.hyperic.sigar.ptql.ProcessFinder;
import org.openide.modules.InstalledFileLocator; import org.openide.modules.InstalledFileLocator;
@ -53,6 +54,7 @@ public class PlatformUtil {
private static final String PYTHON_MODULES_SUBDIRECTORY = "python_modules"; //NON-NLS private static final String PYTHON_MODULES_SUBDIRECTORY = "python_modules"; //NON-NLS
private static final String CLASSIFIERS_SUBDIRECTORY = "object_detection_classifiers"; //NON-NLS private static final String CLASSIFIERS_SUBDIRECTORY = "object_detection_classifiers"; //NON-NLS
private static final String OCR_LANGUAGE_SUBDIRECTORY = "ocr_language_packs"; //NON-NLS private static final String OCR_LANGUAGE_SUBDIRECTORY = "ocr_language_packs"; //NON-NLS
private static final String OCR_LANGUAGE_PACK_EXT = "traineddata";
private static String javaPath = null; private static String javaPath = null;
public static final String OS_NAME_UNKNOWN = NbBundle.getMessage(PlatformUtil.class, "PlatformUtil.nameUnknown"); public static final String OS_NAME_UNKNOWN = NbBundle.getMessage(PlatformUtil.class, "PlatformUtil.nameUnknown");
public static final String OS_VERSION_UNKNOWN = NbBundle.getMessage(PlatformUtil.class, "PlatformUtil.verUnknown"); public static final String OS_VERSION_UNKNOWN = NbBundle.getMessage(PlatformUtil.class, "PlatformUtil.verUnknown");
@ -127,6 +129,26 @@ public class PlatformUtil {
return getUserDirectory().getAbsolutePath() + File.separator + OCR_LANGUAGE_SUBDIRECTORY; return getUserDirectory().getAbsolutePath() + File.separator + OCR_LANGUAGE_SUBDIRECTORY;
} }
/**
 * Get the names of the language packs installed at the user directory.
 *
 * A language pack is any non-directory entry in the OCR language pack
 * directory whose extension equals OCR_LANGUAGE_PACK_EXT. The name reported
 * for a pack is its file name with the extension removed.
 *
 * @return List of language packs base names. The list is empty, never
 *         null, when the language pack directory cannot be listed.
 */
public static List<String> getOcrLanguagePacks() {
    File languagePackRootDir = new File(getOcrLanguagePacksPath());
    List<String> languagePacks = new ArrayList<>();

    // File.listFiles() returns null, not an empty array, when the path
    // does not denote a directory or an I/O error occurs. Iterating over
    // that result directly would throw a NullPointerException.
    File[] directoryEntries = languagePackRootDir.listFiles();
    if (directoryEntries == null) {
        return languagePacks;
    }

    for (File languagePack : directoryEntries) {
        String fileExt = FilenameUtils.getExtension(languagePack.getName());
        if (!languagePack.isDirectory() && OCR_LANGUAGE_PACK_EXT.equals(fileExt)) {
            String packageName = FilenameUtils.getBaseName(languagePack.getName());
            languagePacks.add(packageName);
        }
    }

    return languagePacks;
}
/** /**
* Get root path where the user's object detection classifiers are stored. * Get root path where the user's object detection classifiers are stored.
* *

View File

@ -29,7 +29,6 @@ import java.io.InputStream;
import java.io.PushbackReader; import java.io.PushbackReader;
import java.io.Reader; import java.io.Reader;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.Callable; import java.util.concurrent.Callable;
@ -42,7 +41,6 @@ import java.util.concurrent.TimeoutException;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.io.FilenameUtils;
import org.apache.tika.Tika; import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.AutoDetectParser;
@ -134,8 +132,7 @@ final class TikaTextExtractor implements TextExtractor {
private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
private static final File TESSERACT_PATH = locateTesseractExecutable(); private static final File TESSERACT_PATH = locateTesseractExecutable();
private static final String LANGUAGE_PACKS = getLanguagePacks(); private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
private static final String TESSERACT_LANGUAGE_PACK_EXT = "traineddata"; //NON-NLS
private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
private ProcessTerminator processTerminator; private ProcessTerminator processTerminator;
@ -203,7 +200,7 @@ final class TikaTextExtractor implements TextExtractor {
String tesseractFolder = TESSERACT_PATH.getParent(); String tesseractFolder = TESSERACT_PATH.getParent();
ocrConfig.setTesseractPath(tesseractFolder); ocrConfig.setTesseractPath(tesseractFolder);
ocrConfig.setLanguage(LANGUAGE_PACKS); ocrConfig.setLanguage(languagePacks);
ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath()); ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
parseContext.set(TesseractOCRConfig.class, ocrConfig); parseContext.set(TesseractOCRConfig.class, ocrConfig);
@ -289,7 +286,7 @@ final class TikaTextExtractor implements TextExtractor {
String.format("\"%s\"", outputFilePath), String.format("\"%s\"", outputFilePath),
"--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(), "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
//language pack command flag //language pack command flag
"-l", LANGUAGE_PACKS); "-l", languagePacks);
//If the ProcessTerminator was supplied during //If the ProcessTerminator was supplied during
//configuration apply it here. //configuration apply it here.
@ -443,23 +440,11 @@ final class TikaTextExtractor implements TextExtractor {
} }
/** /**
* Retrieves all of the installed language packs from their designated * Formats language packs to be parseable from the command line.
* directory location to be used to configure Tesseract OCR.
* *
* @return String of all language packs available for Tesseract to use * @return String of all language packs available for Tesseract to use
*/ */
private static String getLanguagePacks() { private static String formatLanguagePacks(List<String> languagePacks) {
File languagePackRootDir = new File(PlatformUtil.getOcrLanguagePacksPath());
List<String> languagePacks = new ArrayList<>();
for (File languagePack : languagePackRootDir.listFiles()) {
String fileExt = FilenameUtils.getExtension(languagePack.getName());
if (!languagePack.isDirectory() && TESSERACT_LANGUAGE_PACK_EXT.equals(fileExt)) {
String packageName = FilenameUtils.getBaseName(languagePack.getName());
languagePacks.add(packageName);
}
}
return String.join("+", languagePacks); return String.join("+", languagePacks);
} }
@ -499,10 +484,16 @@ final class TikaTextExtractor implements TextExtractor {
public void setExtractionSettings(Lookup context) { public void setExtractionSettings(Lookup context) {
if (context != null) { if (context != null) {
ImageConfig configInstance = context.lookup(ImageConfig.class); ImageConfig configInstance = context.lookup(ImageConfig.class);
if (configInstance != null && Objects.nonNull(configInstance.getOCREnabled())) { if (configInstance != null) {
if(Objects.nonNull(configInstance.getOCREnabled())) {
this.tesseractOCREnabled = configInstance.getOCREnabled(); this.tesseractOCREnabled = configInstance.getOCREnabled();
} }
if(Objects.nonNull(configInstance.getOCRLanguages())) {
this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
}
}
ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class); ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
if (terminatorInstance != null) { if (terminatorInstance != null) {
this.processTerminator = terminatorInstance; this.processTerminator = terminatorInstance;

View File

@ -18,19 +18,22 @@
*/ */
package org.sleuthkit.autopsy.textextractors.configs; package org.sleuthkit.autopsy.textextractors.configs;
import java.util.List;
/** /**
* Allows for configuration of OCR on image files. Extractors that use ImageConfig * Allows for configuration of OCR on image files. Extractors that use
* can be obtained through TextExtractorFactory.getExtractor(). * ImageConfig can be obtained through TextExtractorFactory.getExtractor().
* *
* @see org.openide.util.Lookup * @see org.openide.util.Lookup
*/ */
public class ImageConfig { public class ImageConfig {
private Boolean OCREnabled; private Boolean OCREnabled;
private List<String> ocrLanguages;
/** /**
* Enables OCR to be run on the text reader responsible for handling * Enables OCR to be run on the text reader responsible for handling image
* image files. * files.
* *
* @param enabled Flag indicating if OCR is enabled. * @param enabled Flag indicating if OCR is enabled.
*/ */
@ -46,4 +49,24 @@ public class ImageConfig {
public boolean getOCREnabled() { public boolean getOCREnabled() {
return this.OCREnabled; return this.OCREnabled;
} }
/**
 * Specifies the languages OCR should use.
 *
 * The given list is stored as-is; no copy is made. See PlatformUtil for
 * the list of installed language packs.
 *
 * @param languages List of languages to use
 */
public void setOCRLanguages(List<String> languages) {
    ocrLanguages = languages;
}
/**
 * Returns the languages that were configured for OCR.
 *
 * @return The list passed to setOCRLanguages, or null if no list has
 *         been stored in this object
 */
public List<String> getOCRLanguages() {
    return ocrLanguages;
}
} }