mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-16 09:47:42 +00:00
Merge pull request #4478 from dannysmyda/4656-allow-user-to-pick-language
4656 - Add OCR language pack configuration
This commit is contained in:
commit
63bb7f805e
@ -36,6 +36,7 @@ import java.util.ArrayList;
|
|||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import javax.swing.filechooser.FileSystemView;
|
import javax.swing.filechooser.FileSystemView;
|
||||||
|
import org.apache.commons.io.FilenameUtils;
|
||||||
import org.hyperic.sigar.Sigar;
|
import org.hyperic.sigar.Sigar;
|
||||||
import org.hyperic.sigar.ptql.ProcessFinder;
|
import org.hyperic.sigar.ptql.ProcessFinder;
|
||||||
import org.openide.modules.InstalledFileLocator;
|
import org.openide.modules.InstalledFileLocator;
|
||||||
@ -53,6 +54,7 @@ public class PlatformUtil {
|
|||||||
private static final String PYTHON_MODULES_SUBDIRECTORY = "python_modules"; //NON-NLS
|
private static final String PYTHON_MODULES_SUBDIRECTORY = "python_modules"; //NON-NLS
|
||||||
private static final String CLASSIFIERS_SUBDIRECTORY = "object_detection_classifiers"; //NON-NLS
|
private static final String CLASSIFIERS_SUBDIRECTORY = "object_detection_classifiers"; //NON-NLS
|
||||||
private static final String OCR_LANGUAGE_SUBDIRECTORY = "ocr_language_packs"; //NON-NLS
|
private static final String OCR_LANGUAGE_SUBDIRECTORY = "ocr_language_packs"; //NON-NLS
|
||||||
|
private static final String OCR_LANGUAGE_PACK_EXT = "traineddata";
|
||||||
private static String javaPath = null;
|
private static String javaPath = null;
|
||||||
public static final String OS_NAME_UNKNOWN = NbBundle.getMessage(PlatformUtil.class, "PlatformUtil.nameUnknown");
|
public static final String OS_NAME_UNKNOWN = NbBundle.getMessage(PlatformUtil.class, "PlatformUtil.nameUnknown");
|
||||||
public static final String OS_VERSION_UNKNOWN = NbBundle.getMessage(PlatformUtil.class, "PlatformUtil.verUnknown");
|
public static final String OS_VERSION_UNKNOWN = NbBundle.getMessage(PlatformUtil.class, "PlatformUtil.verUnknown");
|
||||||
@ -127,6 +129,26 @@ public class PlatformUtil {
|
|||||||
return getUserDirectory().getAbsolutePath() + File.separator + OCR_LANGUAGE_SUBDIRECTORY;
|
return getUserDirectory().getAbsolutePath() + File.separator + OCR_LANGUAGE_SUBDIRECTORY;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the names of the language packs installed at the user directory.
|
||||||
|
*
|
||||||
|
* @return List of language packs base names
|
||||||
|
*/
|
||||||
|
public static List<String> getOcrLanguagePacks() {
|
||||||
|
File languagePackRootDir = new File(getOcrLanguagePacksPath());
|
||||||
|
|
||||||
|
List<String> languagePacks = new ArrayList<>();
|
||||||
|
for (File languagePack : languagePackRootDir.listFiles()) {
|
||||||
|
String fileExt = FilenameUtils.getExtension(languagePack.getName());
|
||||||
|
if (!languagePack.isDirectory() && OCR_LANGUAGE_PACK_EXT.equals(fileExt)) {
|
||||||
|
String packageName = FilenameUtils.getBaseName(languagePack.getName());
|
||||||
|
languagePacks.add(packageName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return languagePacks;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get root path where the user's object detection classifiers are stored.
|
* Get root path where the user's object detection classifiers are stored.
|
||||||
*
|
*
|
||||||
|
@ -29,7 +29,6 @@ import java.io.InputStream;
|
|||||||
import java.io.PushbackReader;
|
import java.io.PushbackReader;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.concurrent.Callable;
|
import java.util.concurrent.Callable;
|
||||||
@ -42,7 +41,6 @@ import java.util.concurrent.TimeoutException;
|
|||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
import org.apache.commons.io.FilenameUtils;
|
|
||||||
import org.apache.tika.Tika;
|
import org.apache.tika.Tika;
|
||||||
import org.apache.tika.metadata.Metadata;
|
import org.apache.tika.metadata.Metadata;
|
||||||
import org.apache.tika.parser.AutoDetectParser;
|
import org.apache.tika.parser.AutoDetectParser;
|
||||||
@ -134,8 +132,7 @@ final class TikaTextExtractor implements TextExtractor {
|
|||||||
private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
|
private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
|
||||||
private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
|
private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
|
||||||
private static final File TESSERACT_PATH = locateTesseractExecutable();
|
private static final File TESSERACT_PATH = locateTesseractExecutable();
|
||||||
private static final String LANGUAGE_PACKS = getLanguagePacks();
|
private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
|
||||||
private static final String TESSERACT_LANGUAGE_PACK_EXT = "traineddata"; //NON-NLS
|
|
||||||
private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
|
private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
|
||||||
|
|
||||||
private ProcessTerminator processTerminator;
|
private ProcessTerminator processTerminator;
|
||||||
@ -203,7 +200,7 @@ final class TikaTextExtractor implements TextExtractor {
|
|||||||
String tesseractFolder = TESSERACT_PATH.getParent();
|
String tesseractFolder = TESSERACT_PATH.getParent();
|
||||||
ocrConfig.setTesseractPath(tesseractFolder);
|
ocrConfig.setTesseractPath(tesseractFolder);
|
||||||
|
|
||||||
ocrConfig.setLanguage(LANGUAGE_PACKS);
|
ocrConfig.setLanguage(languagePacks);
|
||||||
ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
|
ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
|
||||||
parseContext.set(TesseractOCRConfig.class, ocrConfig);
|
parseContext.set(TesseractOCRConfig.class, ocrConfig);
|
||||||
|
|
||||||
@ -289,7 +286,7 @@ final class TikaTextExtractor implements TextExtractor {
|
|||||||
String.format("\"%s\"", outputFilePath),
|
String.format("\"%s\"", outputFilePath),
|
||||||
"--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
|
"--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
|
||||||
//language pack command flag
|
//language pack command flag
|
||||||
"-l", LANGUAGE_PACKS);
|
"-l", languagePacks);
|
||||||
|
|
||||||
//If the ProcessTerminator was supplied during
|
//If the ProcessTerminator was supplied during
|
||||||
//configuration apply it here.
|
//configuration apply it here.
|
||||||
@ -443,23 +440,11 @@ final class TikaTextExtractor implements TextExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Retrieves all of the installed language packs from their designated
|
* Formats language packs to be parseable from the command line.
|
||||||
* directory location to be used to configure Tesseract OCR.
|
|
||||||
*
|
*
|
||||||
* @return String of all language packs available for Tesseract to use
|
* @return String of all language packs available for Tesseract to use
|
||||||
*/
|
*/
|
||||||
private static String getLanguagePacks() {
|
private static String formatLanguagePacks(List<String> languagePacks) {
|
||||||
File languagePackRootDir = new File(PlatformUtil.getOcrLanguagePacksPath());
|
|
||||||
|
|
||||||
List<String> languagePacks = new ArrayList<>();
|
|
||||||
for (File languagePack : languagePackRootDir.listFiles()) {
|
|
||||||
String fileExt = FilenameUtils.getExtension(languagePack.getName());
|
|
||||||
if (!languagePack.isDirectory() && TESSERACT_LANGUAGE_PACK_EXT.equals(fileExt)) {
|
|
||||||
String packageName = FilenameUtils.getBaseName(languagePack.getName());
|
|
||||||
languagePacks.add(packageName);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return String.join("+", languagePacks);
|
return String.join("+", languagePacks);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -499,10 +484,16 @@ final class TikaTextExtractor implements TextExtractor {
|
|||||||
public void setExtractionSettings(Lookup context) {
|
public void setExtractionSettings(Lookup context) {
|
||||||
if (context != null) {
|
if (context != null) {
|
||||||
ImageConfig configInstance = context.lookup(ImageConfig.class);
|
ImageConfig configInstance = context.lookup(ImageConfig.class);
|
||||||
if (configInstance != null && Objects.nonNull(configInstance.getOCREnabled())) {
|
if (configInstance != null) {
|
||||||
|
if(Objects.nonNull(configInstance.getOCREnabled())) {
|
||||||
this.tesseractOCREnabled = configInstance.getOCREnabled();
|
this.tesseractOCREnabled = configInstance.getOCREnabled();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(Objects.nonNull(configInstance.getOCRLanguages())) {
|
||||||
|
this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
|
ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
|
||||||
if (terminatorInstance != null) {
|
if (terminatorInstance != null) {
|
||||||
this.processTerminator = terminatorInstance;
|
this.processTerminator = terminatorInstance;
|
||||||
|
@ -18,19 +18,22 @@
|
|||||||
*/
|
*/
|
||||||
package org.sleuthkit.autopsy.textextractors.configs;
|
package org.sleuthkit.autopsy.textextractors.configs;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Allows for configuration of OCR on image files. Extractors that use ImageConfig
|
* Allows for configuration of OCR on image files. Extractors that use
|
||||||
* can be obtained through TextExtractoryFactory.getExtractor().
|
* ImageConfig can be obtained through TextExtractorFactory.getExtractor().
|
||||||
*
|
*
|
||||||
* @see org.openide.util.Lookup
|
* @see org.openide.util.Lookup
|
||||||
*/
|
*/
|
||||||
public class ImageConfig {
|
public class ImageConfig {
|
||||||
|
|
||||||
private Boolean OCREnabled;
|
private Boolean OCREnabled;
|
||||||
|
private List<String> ocrLanguages;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enables OCR to be run on the text reader responsible for handling
|
* Enables OCR to be run on the text reader responsible for handling image
|
||||||
* image files.
|
* files.
|
||||||
*
|
*
|
||||||
* @param enabled Flag indicating if OCR is enabled.
|
* @param enabled Flag indicating if OCR is enabled.
|
||||||
*/
|
*/
|
||||||
@ -46,4 +49,24 @@ public class ImageConfig {
|
|||||||
public boolean getOCREnabled() {
|
public boolean getOCREnabled() {
|
||||||
return this.OCREnabled;
|
return this.OCREnabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets languages for OCR.
|
||||||
|
*
|
||||||
|
* See PlatformUtil for list of installed language packs.
|
||||||
|
*
|
||||||
|
* @param languages List of languages to use
|
||||||
|
*/
|
||||||
|
public void setOCRLanguages(List<String> languages) {
|
||||||
|
this.ocrLanguages = languages;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the list of languages OCR should perform.
|
||||||
|
*
|
||||||
|
* @return List of OCR languages, or null if no languages have been set
|
||||||
|
*/
|
||||||
|
public List<String> getOCRLanguages() {
|
||||||
|
return this.ocrLanguages;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user