From ec06a54dbfdccc92456ad6d8159f2f95ee89ed9b Mon Sep 17 00:00:00 2001 From: "U-BASIS\\dsmyda" Date: Wed, 19 Dec 2018 16:04:16 -0500 Subject: [PATCH] Move language pack locations to AppData --- .../org/sleuthkit/autopsy/core/Installer.java | 25 +++++++++++++++++++ .../autopsy/coreutils/PlatformUtil.java | 10 ++++++++ .../textreaders/TikaTextExtractor.java | 15 +++-------- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/Core/src/org/sleuthkit/autopsy/core/Installer.java b/Core/src/org/sleuthkit/autopsy/core/Installer.java index e45055b077..2f65b6099b 100644 --- a/Core/src/org/sleuthkit/autopsy/core/Installer.java +++ b/Core/src/org/sleuthkit/autopsy/core/Installer.java @@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.core; import java.awt.Cursor; import java.io.File; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; @@ -29,6 +30,8 @@ import java.util.logging.Handler; import java.util.logging.Level; import javafx.application.Platform; import javafx.embed.swing.JFXPanel; +import org.apache.commons.io.FileUtils; +import org.openide.modules.InstalledFileLocator; import org.openide.modules.ModuleInstall; import org.openide.util.NbBundle; import org.openide.windows.WindowManager; @@ -285,12 +288,34 @@ public class Installer extends ModuleInstall { File pythonModulesDir = new File(PlatformUtil.getUserPythonModulesPath()); pythonModulesDir.mkdir(); } + + /** + * Make a folder in the config directory for OCR language packs if one does + * not exist. + */ + private static void ensureOcrLanguagePacksFolderExists() { + File ocrLanguagePacksDir = new File(PlatformUtil.getOcrLanguagePacksPath()); + boolean createDirectory = ocrLanguagePacksDir.mkdir(); + + //If the directory did not exist, copy the tessdata folder over so we + //support English. 
+ if(createDirectory) { + File tessdataDir = InstalledFileLocator.getDefault().locate( + "Tesseract-OCR/tessdata", Installer.class.getPackage().getName(), false); + try { + FileUtils.copyDirectory(tessdataDir, ocrLanguagePacksDir); + } catch (IOException ex) { + logger.log(Level.SEVERE, "Copying over default language packs for Tesseract failed.", ex); + } + } + } @Override public void restored() { super.restored(); ensurePythonModulesFolderExists(); ensureClassifierFolderExists(); + ensureOcrLanguagePacksFolderExists(); initJavaFx(); for (ModuleInstall mi : packageInstallers) { try { diff --git a/Core/src/org/sleuthkit/autopsy/coreutils/PlatformUtil.java b/Core/src/org/sleuthkit/autopsy/coreutils/PlatformUtil.java index f0a3086588..a785bdf638 100644 --- a/Core/src/org/sleuthkit/autopsy/coreutils/PlatformUtil.java +++ b/Core/src/org/sleuthkit/autopsy/coreutils/PlatformUtil.java @@ -52,6 +52,7 @@ public class PlatformUtil { private static final String PYTHON_MODULES_SUBDIRECTORY = "python_modules"; //NON-NLS private static final String CLASSIFIERS_SUBDIRECTORY = "object_detection_classifiers"; //NON-NLS + private static final String OCR_LANGUAGE_SUBDIRECTORY = "ocr_language_packs"; //NON-NLS private static String javaPath = null; public static final String OS_NAME_UNKNOWN = NbBundle.getMessage(PlatformUtil.class, "PlatformUtil.nameUnknown"); public static final String OS_VERSION_UNKNOWN = NbBundle.getMessage(PlatformUtil.class, "PlatformUtil.verUnknown"); @@ -116,6 +117,15 @@ public class PlatformUtil { public static String getUserPythonModulesPath() { return getUserDirectory().getAbsolutePath() + File.separator + PYTHON_MODULES_SUBDIRECTORY; } + + /** + * Get root path where the user's OCR language packs are stored. + * + * @return Absolute path to the OCR language packs root directory. 
+ */ + public static String getOcrLanguagePacksPath() { + return getUserDirectory().getAbsolutePath() + File.separator + OCR_LANGUAGE_SUBDIRECTORY; + } /** * Get root path where the user's object detection classifiers are stored. diff --git a/Core/src/org/sleuthkit/autopsy/textreaders/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textreaders/TikaTextExtractor.java index e93cd9c2be..192b25e88b 100644 --- a/Core/src/org/sleuthkit/autopsy/textreaders/TikaTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textreaders/TikaTextExtractor.java @@ -202,14 +202,9 @@ final class TikaTextExtractor extends TextExtractor { TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); String tesseractFolder = TESSERACT_PATH.getParent(); ocrConfig.setTesseractPath(tesseractFolder); - /* - * Tesseract expects language data packs to be in a subdirectory - * of tesseractFolder, in a folder called "tessdata". If they - * are stored somewhere else, use - * ocrConfig.setTessdataPath(String tessdataPath) to point to - * them - */ + ocrConfig.setLanguage(LANGUAGE_PACKS); + ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath()); parseContext.set(TesseractOCRConfig.class, ocrConfig); stream = new ReadContentInputStream(content); @@ -292,6 +287,7 @@ final class TikaTextExtractor extends TextExtractor { process.command(executeablePath, String.format("\"%s\"", inputFile.getAbsolutePath()), String.format("\"%s\"", outputFilePath), + "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(), //language pack command flag "-l", LANGUAGE_PACKS); @@ -450,10 +446,7 @@ final class TikaTextExtractor extends TextExtractor { * @return String of all language packs available for Tesseract to use */ private static String getLanguagePacks() { - File languagePackRootDir = new File(TESSERACT_PATH.getParent(), "tessdata"); - if (!languagePackRootDir.exists()) { - return ""; - } + File languagePackRootDir = new File(PlatformUtil.getOcrLanguagePacksPath()); List languagePacks = new 
ArrayList<>(); for (File languagePack : languagePackRootDir.listFiles()) {