diff --git a/Core/src/org/sleuthkit/autopsy/coreutils/PlatformUtil.java b/Core/src/org/sleuthkit/autopsy/coreutils/PlatformUtil.java
index a785bdf638..88e33c7407 100644
--- a/Core/src/org/sleuthkit/autopsy/coreutils/PlatformUtil.java
+++ b/Core/src/org/sleuthkit/autopsy/coreutils/PlatformUtil.java
@@ -36,6 +36,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import javax.swing.filechooser.FileSystemView;
+import org.apache.commons.io.FilenameUtils;
 import org.hyperic.sigar.Sigar;
 import org.hyperic.sigar.ptql.ProcessFinder;
 import org.openide.modules.InstalledFileLocator;
@@ -53,6 +54,7 @@ public class PlatformUtil {
     private static final String PYTHON_MODULES_SUBDIRECTORY = "python_modules"; //NON-NLS
     private static final String CLASSIFIERS_SUBDIRECTORY = "object_detection_classifiers"; //NON-NLS
     private static final String OCR_LANGUAGE_SUBDIRECTORY = "ocr_language_packs"; //NON-NLS
+    private static final String OCR_LANGUAGE_PACK_EXT = "traineddata"; //NON-NLS
     private static String javaPath = null;
     public static final String OS_NAME_UNKNOWN = NbBundle.getMessage(PlatformUtil.class, "PlatformUtil.nameUnknown");
     public static final String OS_VERSION_UNKNOWN = NbBundle.getMessage(PlatformUtil.class, "PlatformUtil.verUnknown");
@@ -126,6 +128,36 @@ public class PlatformUtil {
     public static String getOcrLanguagePacksPath() {
         return getUserDirectory().getAbsolutePath() + File.separator + OCR_LANGUAGE_SUBDIRECTORY;
     }
+
+    /**
+     * Gets the base names of the OCR language packs installed in the user
+     * directory, i.e. the non-directory files with the "traineddata"
+     * extension located directly in getOcrLanguagePacksPath().
+     *
+     * @return List of language pack base names; empty if the language pack
+     *         directory does not exist or cannot be listed.
+     */
+    public static List<String> getOcrLanguagePacks() {
+        File languagePackRootDir = new File(getOcrLanguagePacksPath());
+
+        List<String> languagePacks = new ArrayList<>();
+        // File.listFiles() returns null when the path is not a directory or
+        // an I/O error occurs; treat that as "no language packs installed".
+        File[] candidateFiles = languagePackRootDir.listFiles();
+        if (candidateFiles == null) {
+            return languagePacks;
+        }
+
+        for (File languagePack : candidateFiles) {
+            String fileExt = FilenameUtils.getExtension(languagePack.getName());
+            if (!languagePack.isDirectory() && OCR_LANGUAGE_PACK_EXT.equals(fileExt)) {
+                String packageName = FilenameUtils.getBaseName(languagePack.getName());
+                languagePacks.add(packageName);
+            }
+        }
+
+        return languagePacks;
+    }
 
     /**
      * Get root path where the user's object detection classifiers are stored.
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index 8ceb99d0d2..0a05a238c4 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -29,7 +29,6 @@ import java.io.InputStream;
 import java.io.PushbackReader;
 import java.io.Reader;
 import java.nio.file.Paths;
-import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
 import java.util.concurrent.Callable;
@@ -42,7 +41,6 @@ import java.util.concurrent.TimeoutException;
 import java.util.logging.Level;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.apache.commons.io.FilenameUtils;
 import org.apache.tika.Tika;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
@@ -134,8 +132,7 @@ final class TikaTextExtractor implements TextExtractor {
     private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
     private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
     private static final File TESSERACT_PATH = locateTesseractExecutable();
-    private static final String LANGUAGE_PACKS = getLanguagePacks();
-    private static final String TESSERACT_LANGUAGE_PACK_EXT = "traineddata"; //NON-NLS
+    private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
     private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
 
     private ProcessTerminator processTerminator;
@@ -203,7 +200,7 @@ final class TikaTextExtractor implements TextExtractor {
             String tesseractFolder = TESSERACT_PATH.getParent();
             ocrConfig.setTesseractPath(tesseractFolder);
 
-            ocrConfig.setLanguage(LANGUAGE_PACKS);
+            ocrConfig.setLanguage(languagePacks);
             ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
             parseContext.set(TesseractOCRConfig.class, ocrConfig);
 
@@ -289,7 +286,7 @@ final class TikaTextExtractor implements TextExtractor {
                     String.format("\"%s\"", outputFilePath),
                     "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
                     //language pack command flag
-                    "-l", LANGUAGE_PACKS);
+                    "-l", languagePacks);
 
             //If the ProcessTerminator was supplied during
             //configuration apply it here.
@@ -443,23 +440,12 @@ final class TikaTextExtractor implements TextExtractor {
     }
 
     /**
-     * Retrieves all of the installed language packs from their designated
-     * directory location to be used to configure Tesseract OCR.
+     * Formats language packs to be parseable from the command line.
      *
+     * @param languagePacks Language pack names to join with '+'
      * @return String of all language packs available for Tesseract to use
      */
-    private static String getLanguagePacks() {
-        File languagePackRootDir = new File(PlatformUtil.getOcrLanguagePacksPath());
-
-        List languagePacks = new ArrayList<>();
-        for (File languagePack : languagePackRootDir.listFiles()) {
-            String fileExt = FilenameUtils.getExtension(languagePack.getName());
-            if (!languagePack.isDirectory() && TESSERACT_LANGUAGE_PACK_EXT.equals(fileExt)) {
-                String packageName = FilenameUtils.getBaseName(languagePack.getName());
-                languagePacks.add(packageName);
-            }
-        }
-
+    private static String formatLanguagePacks(List<String> languagePacks) {
         return String.join("+", languagePacks);
     }
 
@@ -499,8 +485,14 @@ final class TikaTextExtractor implements TextExtractor {
     public void setExtractionSettings(Lookup context) {
         if (context != null) {
             ImageConfig configInstance = context.lookup(ImageConfig.class);
-            if (configInstance != null && Objects.nonNull(configInstance.getOCREnabled())) {
-                this.tesseractOCREnabled = configInstance.getOCREnabled();
+            if (configInstance != null) {
+                if (Objects.nonNull(configInstance.getOCREnabled())) {
+                    this.tesseractOCREnabled = configInstance.getOCREnabled();
+                }
+
+                if (Objects.nonNull(configInstance.getOCRLanguages())) {
+                    this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
+                }
             }
 
             ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/configs/ImageConfig.java b/Core/src/org/sleuthkit/autopsy/textextractors/configs/ImageConfig.java
index 33d7987537..3abe178287 100755
--- a/Core/src/org/sleuthkit/autopsy/textextractors/configs/ImageConfig.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/configs/ImageConfig.java
@@ -18,19 +18,22 @@
  */
 package org.sleuthkit.autopsy.textextractors.configs;
 
+import java.util.List;
+
 /**
- * Allows for configuration of OCR on image files. Extractors that use ImageConfig
- * can be obtained through TextExtractoryFactory.getExtractor().
+ * Allows for configuration of OCR on image files. Extractors that use
+ * ImageConfig can be obtained through TextExtractorFactory.getExtractor().
  *
  * @see org.openide.util.Lookup
  */
 public class ImageConfig {
 
     private Boolean OCREnabled;
+    private List<String> ocrLanguages;
 
     /**
-     * Enables OCR to be run on the text reader responsible for handling
-     * image files.
+     * Enables OCR to be run on the text reader responsible for handling image
+     * files.
      *
      * @param enabled Flag indicating if OCR is enabled.
      */
@@ -46,4 +49,24 @@ public class ImageConfig {
     public boolean getOCREnabled() {
         return this.OCREnabled;
     }
+
+    /**
+     * Sets the languages OCR should be performed in.
+     *
+     * See PlatformUtil.getOcrLanguagePacks() for the installed language packs.
+     *
+     * @param languages List of languages to use
+     */
+    public void setOCRLanguages(List<String> languages) {
+        this.ocrLanguages = languages;
+    }
+
+    /**
+     * Gets the list of languages OCR should be performed in.
+     *
+     * @return List of OCR languages, or null if none have been set
+     */
+    public List<String> getOCRLanguages() {
+        return this.ocrLanguages;
+    }
 }