diff --git a/Core/src/org/sleuthkit/autopsy/core/UserPreferences.java b/Core/src/org/sleuthkit/autopsy/core/UserPreferences.java index 3dc28a49be..08c6f8f6d4 100644 --- a/Core/src/org/sleuthkit/autopsy/core/UserPreferences.java +++ b/Core/src/org/sleuthkit/autopsy/core/UserPreferences.java @@ -62,6 +62,7 @@ public final class UserPreferences { private static final String MESSAGE_SERVICE_HOST = "MessageServiceHost"; //NON-NLS private static final String MESSAGE_SERVICE_PORT = "MessageServicePort"; //NON-NLS public static final String TEXT_TRANSLATOR_NAME = "TextTranslatorName"; + public static final String OCR_TRANSLATION_ENABLED = "OcrTranslationEnabled"; public static final String PROCESS_TIME_OUT_ENABLED = "ProcessTimeOutEnabled"; //NON-NLS public static final String PROCESS_TIME_OUT_HOURS = "ProcessTimeOutHours"; //NON-NLS private static final int DEFAULT_PROCESS_TIMEOUT_HR = 60; @@ -347,6 +348,14 @@ public final class UserPreferences { public static String getTextTranslatorName() { return preferences.get(TEXT_TRANSLATOR_NAME, null); } + + public static void setUseOcrInTranslation(boolean enableOcr) { + preferences.putBoolean(OCR_TRANSLATION_ENABLED, enableOcr); + } + + public static boolean getUseOcrInTranslation() { + return preferences.getBoolean(OCR_TRANSLATION_ENABLED, true); + } /** * Persists message service connection info. 
@@ -623,4 +632,4 @@ public final class UserPreferences { return Paths.get(UserMachinePreferences.getBaseTempDirectory(), getAppName()) .toAbsolutePath().toString(); } -} +} \ No newline at end of file diff --git a/Core/src/org/sleuthkit/autopsy/corecomponents/AutoWrappingJTextPane.java b/Core/src/org/sleuthkit/autopsy/corecomponents/AutoWrappingJTextPane.java index 8b739b8dcf..1e4c1f4c63 100755 --- a/Core/src/org/sleuthkit/autopsy/corecomponents/AutoWrappingJTextPane.java +++ b/Core/src/org/sleuthkit/autopsy/corecomponents/AutoWrappingJTextPane.java @@ -27,6 +27,7 @@ import javax.swing.text.ViewFactory; import javax.swing.text.html.HTMLEditorKit; import javax.swing.text.html.InlineView; import javax.swing.text.html.ParagraphView; +import org.sleuthkit.autopsy.coreutils.EscapeUtil; /** * JTextPane extension that auto wraps input text using an HTMLEditorKit trick. @@ -98,6 +99,6 @@ public class AutoWrappingJTextPane extends JTextPane { @Override public void setText(String text) { - super.setText("
" + text + "
"); + super.setText("
" + EscapeUtil.escapeHtml(text) + "
"); } } diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java index a8bf0591fb..ee5b8d8fc3 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2019 Basis Technology Corp. + * Copyright 2011-2020 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -46,7 +46,6 @@ import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParsingReader; @@ -72,6 +71,9 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import com.google.common.collect.ImmutableMap; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY; /** * Extracts text from Tika supported content. 
Protects against Tika parser hangs @@ -126,16 +128,6 @@ final class TikaTextExtractor implements TextExtractor { "application/x-z", //NON-NLS "application/x-compress"); //NON-NLS - //Tika should ignore types with embedded files that can be handled by the unpacking modules - private static final List EMBEDDED_FILE_MIME_TYPES - = ImmutableList.of("application/msword", //NON-NLS - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", //NON-NLS - "application/vnd.ms-powerpoint", //NON-NLS - "application/vnd.openxmlformats-officedocument.presentationml.presentation", //NON-NLS - "application/vnd.ms-excel", //NON-NLS - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", //NON-NLS - "application/pdf"); //NON-NLS - // Used to log to the tika file that is why it uses the java.util.logging.logger class instead of the Autopsy one private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName()); @@ -193,52 +185,31 @@ final class TikaTextExtractor implements TextExtractor { */ @Override public Reader getReader() throws InitReaderException { - InputStream stream = null; - - ParseContext parseContext = new ParseContext(); - - //Disable appending embedded file text to output for EFE supported types - //JIRA-4975 - if(content instanceof AbstractFile && EMBEDDED_FILE_MIME_TYPES.contains(((AbstractFile)content).getMIMEType())) { - parseContext.set(Parser.class, new EmptyParser()); - } else { - parseContext.set(Parser.class, parser); + if (!this.isSupported()) { + throw new InitReaderException("Content is not supported"); } - if (ocrEnabled() && content instanceof AbstractFile) { - AbstractFile file = ((AbstractFile) content); - //Run OCR on images with Tesseract directly. 
- if (file.getMIMEType().toLowerCase().startsWith("image/")) { - stream = performOCR(file); - } else { - //Otherwise, go through Tika for PDFs so that it can - //extract images and run Tesseract on them. - PDFParserConfig pdfConfig = new PDFParserConfig(); + // Only abstract files are supported, see isSupported() + final AbstractFile file = ((AbstractFile) content); + // This mime type must be non-null, see isSupported() + final String mimeType = file.getMIMEType(); - // Extracting the inline images and letting Tesseract run on each inline image. - // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29 - // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html - pdfConfig.setExtractInlineImages(true); - // Multiple pages within a PDF file might refer to the same underlying image. - pdfConfig.setExtractUniqueInlineImagesOnly(true); - parseContext.set(PDFParserConfig.class, pdfConfig); - - // Configure Tesseract parser to perform OCR - TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); - String tesseractFolder = TESSERACT_PATH.getParent(); - ocrConfig.setTesseractPath(tesseractFolder); - - ocrConfig.setLanguage(languagePacks); - ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath()); - parseContext.set(TesseractOCRConfig.class, ocrConfig); - - stream = new ReadContentInputStream(content); - } - } else { - stream = new ReadContentInputStream(content); + // Handle images separately so the OCR task can be cancelled. + // See JIRA-4519 for the need to have cancellation in the UI and ingest. + if (ocrEnabled() && mimeType.toLowerCase().startsWith("image/")) { + InputStream imageOcrStream = performOCR(file); + return new InputStreamReader(imageOcrStream, Charset.forName("UTF-8")); } - Metadata metadata = new Metadata(); + // Set up Tika + final InputStream stream = new ReadContentInputStream(content); + final ParseContext parseContext = new ParseContext(); + + // Documents can contain other documents.
By adding + // the parser back into the context, Tika will recursively + // parse embedded documents. + parseContext.set(Parser.class, parser); + // Use the more memory efficient Tika SAX parsers for DOCX and // PPTX files (it already uses SAX for XLSX). OfficeParserConfig officeParserConfig = new OfficeParserConfig(); @@ -246,6 +217,30 @@ final class TikaTextExtractor implements TextExtractor { officeParserConfig.setUseSAXDocxExtractor(true); parseContext.set(OfficeParserConfig.class, officeParserConfig); + if (ocrEnabled()) { + // Configure OCR for Tika if it chooses to run OCR + // during extraction + TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); + String tesseractFolder = TESSERACT_PATH.getParent(); + ocrConfig.setTesseractPath(tesseractFolder); + ocrConfig.setLanguage(languagePacks); + ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath()); + parseContext.set(TesseractOCRConfig.class, ocrConfig); + + // Configure how Tika handles OCRing PDFs + PDFParserConfig pdfConfig = new PDFParserConfig(); + + // This strategy tries to pick between OCRing a page in the + // PDF and doing text extraction. It makes this choice by + // first running text extraction and then counting characters. + // If there are too few characters or too many unmapped + // unicode characters, it'll run the entire page through OCR + // and take that output instead.
See JIRA-6938 + pdfConfig.setOcrStrategy(OCR_STRATEGY.AUTO); + parseContext.set(PDFParserConfig.class, pdfConfig); + } + + Metadata metadata = new Metadata(); //Make the creation of a TikaReader a cancellable future in case it takes too long Future future = executorService.submit( new GetTikaReader(parser, stream, metadata, parseContext)); @@ -568,4 +563,4 @@ final class TikaTextExtractor implements TextExtractor { return reader; } } -} +} \ No newline at end of file diff --git a/Core/src/org/sleuthkit/autopsy/texttranslation/Bundle.properties b/Core/src/org/sleuthkit/autopsy/texttranslation/Bundle.properties index 5fcf80851c..bd321cf492 100755 --- a/Core/src/org/sleuthkit/autopsy/texttranslation/Bundle.properties +++ b/Core/src/org/sleuthkit/autopsy/texttranslation/Bundle.properties @@ -6,3 +6,4 @@ TranslationOptionsPanelController.moduleErr.msg=A module caused an error listeni TranslationContentPanel.showLabel.text=Show: TranslationOptionsPanel.translationServiceLabel.text=Text translator: TranslationOptionsPanel.translationOptionsDescription.text=Configure a 3rd party text translation service to enable text and file name translation. +TranslationOptionsPanel.enableOcrCheckBox.text=Enable Optical Character Recognition (OCR) in the translation content viewer diff --git a/Core/src/org/sleuthkit/autopsy/texttranslation/Bundle.properties-MERGED b/Core/src/org/sleuthkit/autopsy/texttranslation/Bundle.properties-MERGED index 665c07c36f..2da3aab4a4 100755 --- a/Core/src/org/sleuthkit/autopsy/texttranslation/Bundle.properties-MERGED +++ b/Core/src/org/sleuthkit/autopsy/texttranslation/Bundle.properties-MERGED @@ -10,3 +10,4 @@ TranslationOptionsPanelController.moduleErr.msg=A module caused an error listeni TranslationContentPanel.showLabel.text=Show: TranslationOptionsPanel.translationServiceLabel.text=Text translator: TranslationOptionsPanel.translationOptionsDescription.text=Configure a 3rd party text translation service to enable text and file name translation. 
+TranslationOptionsPanel.enableOcrCheckBox.text=Enable Optical Character Recognition (OCR) in the translation content viewer diff --git a/Core/src/org/sleuthkit/autopsy/texttranslation/TranslationOptionsPanel.form b/Core/src/org/sleuthkit/autopsy/texttranslation/TranslationOptionsPanel.form index 52cd0fb65c..aadca0381f 100644 --- a/Core/src/org/sleuthkit/autopsy/texttranslation/TranslationOptionsPanel.form +++ b/Core/src/org/sleuthkit/autopsy/texttranslation/TranslationOptionsPanel.form @@ -16,17 +16,23 @@ + + - - - + + + + + + + + - @@ -42,9 +48,13 @@ - - - + + + + + + + @@ -76,5 +86,17 @@ + + + + + + + + + + + + - + \ No newline at end of file diff --git a/Core/src/org/sleuthkit/autopsy/texttranslation/TranslationOptionsPanel.java b/Core/src/org/sleuthkit/autopsy/texttranslation/TranslationOptionsPanel.java index 3c9ee7dc7b..9fdb95b2b1 100644 --- a/Core/src/org/sleuthkit/autopsy/texttranslation/TranslationOptionsPanel.java +++ b/Core/src/org/sleuthkit/autopsy/texttranslation/TranslationOptionsPanel.java @@ -111,6 +111,7 @@ final class TranslationOptionsPanel extends javax.swing.JPanel { } translatorComboBox.setSelectedItem(currentSelection); loadSelectedPanelSettings(); + enableOcrCheckBox.setSelected(UserPreferences.getUseOcrInTranslation()); } /** @@ -128,6 +129,8 @@ final class TranslationOptionsPanel extends javax.swing.JPanel { logger.log(Level.WARNING, "Unable to save settings for TextTranslator named: " + currentSelection, ex); } } + // Save whether OCR is enabled in the content viewer + UserPreferences.setUseOcrInTranslation(enableOcrCheckBox.isSelected()); } @@ -144,6 +147,8 @@ final class TranslationOptionsPanel extends javax.swing.JPanel { translationServiceLabel = new javax.swing.JLabel(); translationServicePanel = new javax.swing.JPanel(); translationOptionsDescription = new javax.swing.JLabel(); + jSeparator1 = new javax.swing.JSeparator(); + enableOcrCheckBox = new javax.swing.JCheckBox(); translatorComboBox.addActionListener(new 
java.awt.event.ActionListener() { public void actionPerformed(java.awt.event.ActionEvent evt) { @@ -157,20 +162,31 @@ final class TranslationOptionsPanel extends javax.swing.JPanel { org.openide.awt.Mnemonics.setLocalizedText(translationOptionsDescription, org.openide.util.NbBundle.getMessage(TranslationOptionsPanel.class, "TranslationOptionsPanel.translationOptionsDescription.text")); // NOI18N + org.openide.awt.Mnemonics.setLocalizedText(enableOcrCheckBox, org.openide.util.NbBundle.getMessage(TranslationOptionsPanel.class, "TranslationOptionsPanel.enableOcrCheckBox.text")); // NOI18N + enableOcrCheckBox.addActionListener(new java.awt.event.ActionListener() { + public void actionPerformed(java.awt.event.ActionEvent evt) { + enableOcrCheckBoxActionPerformed(evt); + } + }); + javax.swing.GroupLayout layout = new javax.swing.GroupLayout(this); this.setLayout(layout); layout.setHorizontalGroup( layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING) + .addComponent(jSeparator1) .addGroup(layout.createSequentialGroup() .addContainerGap() .addGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING) .addComponent(translationServicePanel, javax.swing.GroupLayout.DEFAULT_SIZE, javax.swing.GroupLayout.DEFAULT_SIZE, Short.MAX_VALUE) + .addComponent(translationOptionsDescription, javax.swing.GroupLayout.DEFAULT_SIZE, 462, Short.MAX_VALUE) .addGroup(layout.createSequentialGroup() - .addComponent(translationServiceLabel) - .addGap(10, 10, 10) - .addComponent(translatorComboBox, javax.swing.GroupLayout.PREFERRED_SIZE, 214, javax.swing.GroupLayout.PREFERRED_SIZE) - .addGap(0, 0, Short.MAX_VALUE)) - .addComponent(translationOptionsDescription, javax.swing.GroupLayout.PREFERRED_SIZE, 462, Short.MAX_VALUE)) + .addGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING) + .addGroup(layout.createSequentialGroup() + .addComponent(translationServiceLabel) + .addGap(10, 10, 10) + .addComponent(translatorComboBox, 
javax.swing.GroupLayout.PREFERRED_SIZE, 214, javax.swing.GroupLayout.PREFERRED_SIZE)) + .addComponent(enableOcrCheckBox)) + .addGap(0, 0, Short.MAX_VALUE))) .addContainerGap()) ); layout.setVerticalGroup( @@ -183,8 +199,12 @@ final class TranslationOptionsPanel extends javax.swing.JPanel { .addComponent(translatorComboBox, javax.swing.GroupLayout.PREFERRED_SIZE, javax.swing.GroupLayout.DEFAULT_SIZE, javax.swing.GroupLayout.PREFERRED_SIZE) .addComponent(translationServiceLabel)) .addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.RELATED) - .addComponent(translationServicePanel, javax.swing.GroupLayout.DEFAULT_SIZE, javax.swing.GroupLayout.DEFAULT_SIZE, Short.MAX_VALUE) - .addContainerGap()) + .addComponent(translationServicePanel, javax.swing.GroupLayout.PREFERRED_SIZE, javax.swing.GroupLayout.DEFAULT_SIZE, javax.swing.GroupLayout.PREFERRED_SIZE) + .addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.UNRELATED) + .addComponent(jSeparator1, javax.swing.GroupLayout.PREFERRED_SIZE, 10, javax.swing.GroupLayout.PREFERRED_SIZE) + .addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.RELATED) + .addComponent(enableOcrCheckBox) + .addContainerGap(javax.swing.GroupLayout.DEFAULT_SIZE, Short.MAX_VALUE)) ); }// //GEN-END:initComponents @@ -192,12 +212,18 @@ final class TranslationOptionsPanel extends javax.swing.JPanel { updatePanel(); }//GEN-LAST:event_translatorComboBoxActionPerformed + private void enableOcrCheckBoxActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_enableOcrCheckBoxActionPerformed + controller.changed(); + }//GEN-LAST:event_enableOcrCheckBoxActionPerformed + // Variables declaration - do not modify//GEN-BEGIN:variables + private javax.swing.JCheckBox enableOcrCheckBox; + private javax.swing.JSeparator jSeparator1; private javax.swing.JLabel translationOptionsDescription; private javax.swing.JLabel translationServiceLabel; private javax.swing.JPanel translationServicePanel; private javax.swing.JComboBox 
translatorComboBox; // End of variables declaration//GEN-END:variables -} +} \ No newline at end of file diff --git a/Core/src/org/sleuthkit/autopsy/texttranslation/ui/Bundle.properties-MERGED b/Core/src/org/sleuthkit/autopsy/texttranslation/ui/Bundle.properties-MERGED index 802b117c35..c87d7ae238 100644 --- a/Core/src/org/sleuthkit/autopsy/texttranslation/ui/Bundle.properties-MERGED +++ b/Core/src/org/sleuthkit/autopsy/texttranslation/ui/Bundle.properties-MERGED @@ -8,6 +8,7 @@ TranslatedContentViewer.errorExtractingText=An error occurred while extracting t TranslatedContentViewer.extractingText=Extracting text, please wait... TranslatedContentViewer.fileHasNoText=File has no text. TranslatedContentViewer.noServiceProvider=The machine translation software was not found. +TranslatedContentViewer.ocrNotEnabled=OCR is not enabled. To change, go to Tools->Options->Machine Translation TranslatedContentViewer.translatingText=Translating text, please wait... # {0} - exception message TranslatedContentViewer.translationException=An error occurred while translating the text ({0}). diff --git a/Core/src/org/sleuthkit/autopsy/texttranslation/ui/TranslatedTextViewer.java b/Core/src/org/sleuthkit/autopsy/texttranslation/ui/TranslatedTextViewer.java index ab0b9de083..6b04427e92 100644 --- a/Core/src/org/sleuthkit/autopsy/texttranslation/ui/TranslatedTextViewer.java +++ b/Core/src/org/sleuthkit/autopsy/texttranslation/ui/TranslatedTextViewer.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2019 Basis Technology Corp. + * Copyright 2020 Basis Technology Corp. 
* Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -38,16 +38,15 @@ import org.sleuthkit.datamodel.AbstractFile; import org.openide.util.Lookup; import org.openide.util.NbBundle; import org.openide.util.lookup.Lookups; -import org.sleuthkit.autopsy.corecomponents.DataContentViewerUtility; import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator; import org.sleuthkit.autopsy.textextractors.TextExtractor; import org.sleuthkit.autopsy.textextractors.TextExtractorFactory; import org.sleuthkit.autopsy.textextractors.configs.ImageConfig; import org.sleuthkit.autopsy.texttranslation.TextTranslationService; -import org.sleuthkit.datamodel.Content; import java.util.List; import java.util.logging.Level; import javax.swing.SwingUtilities; +import org.sleuthkit.autopsy.core.UserPreferences; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.PlatformUtil; import org.sleuthkit.autopsy.texttranslation.ui.TranslationContentPanel.DisplayDropdownOptions; @@ -60,8 +59,6 @@ public final class TranslatedTextViewer implements TextViewer { private static final Logger logger = Logger.getLogger(TranslatedTextViewer.class.getName()); - private static final boolean OCR_ENABLED = true; - private static final boolean OCR_DISABLED = false; private static final int MAX_EXTRACT_SIZE_BYTES = 25600; private static final List INSTALLED_LANGUAGE_PACKS = PlatformUtil.getOcrLanguagePacks(); private final TranslationContentPanel panel = new TranslationContentPanel(); @@ -81,15 +78,10 @@ public final class TranslatedTextViewer implements TextViewer { SelectionChangeListener displayDropDownListener = new DisplayDropDownChangeListener(); panel.addDisplayTextActionListener(displayDropDownListener); panel.addOcrDropDownActionListener(new OCRDropdownChangeListener()); - Content source = DataContentViewerUtility.getDefaultContent(node); - - if (source instanceof AbstractFile) { - boolean isImage = ((AbstractFile) 
source).getMIMEType().toLowerCase().startsWith("image/"); - if (isImage) { - panel.enableOCRSelection(OCR_ENABLED); - panel.addLanguagePackNames(INSTALLED_LANGUAGE_PACKS); - } + if (UserPreferences.getUseOcrInTranslation()) { + panel.addLanguagePackNames(INSTALLED_LANGUAGE_PACKS); } + panel.enableOCRSelection(UserPreferences.getUseOcrInTranslation()); int payloadMaxInKB = TextTranslationService.getInstance().getMaxTextChars() / 1000; panel.setWarningLabelMsg(String.format(Bundle.TranslatedTextViewer_maxPayloadSize(), payloadMaxInKB)); @@ -201,14 +193,16 @@ public final class TranslatedTextViewer implements TextViewer { * @throws * org.sleuthkit.autopsy.textextractors.TextExtractor.InitReaderException */ + @NbBundle.Messages({ + "TranslatedContentViewer.ocrNotEnabled=OCR is not enabled. To change, go to Tools->Options->Machine Translation", + }) private String getFileText(AbstractFile file) throws IOException, InterruptedException, TextExtractor.InitReaderException { final boolean isImage = file.getMIMEType().toLowerCase().startsWith("image/"); // NON-NLS - String result; - if (isImage) { - result = extractText(file, OCR_ENABLED); - } else { - result = extractText(file, OCR_DISABLED); + if (isImage && ! 
UserPreferences.getUseOcrInTranslation()) { + return Bundle.TranslatedContentViewer_ocrNotEnabled(); } + + String result = extractText(file, UserPreferences.getUseOcrInTranslation()); //Correct for UTF-8 byte[] resultInUTF8Bytes = result.getBytes("UTF8"); @@ -363,4 +357,4 @@ public final class TranslatedTextViewer implements TextViewer { return panel.getSelectedOcrLanguagePack(); } } -} +} \ No newline at end of file diff --git a/docs/doxygen-user/images/mt_config.png b/docs/doxygen-user/images/mt_config.png index e3022aec0d..b793191ae8 100644 Binary files a/docs/doxygen-user/images/mt_config.png and b/docs/doxygen-user/images/mt_config.png differ diff --git a/docs/doxygen-user/images/mt_ocr_image.png b/docs/doxygen-user/images/mt_ocr_image.png new file mode 100644 index 0000000000..b4e16bfa09 Binary files /dev/null and b/docs/doxygen-user/images/mt_ocr_image.png differ diff --git a/docs/doxygen-user/images/mt_ocr_result.png b/docs/doxygen-user/images/mt_ocr_result.png new file mode 100644 index 0000000000..27d9af77f0 Binary files /dev/null and b/docs/doxygen-user/images/mt_ocr_result.png differ diff --git a/docs/doxygen-user/keyword_search.dox b/docs/doxygen-user/keyword_search.dox index 271e660cac..fd207a6de2 100644 --- a/docs/doxygen-user/keyword_search.dox +++ b/docs/doxygen-user/keyword_search.dox @@ -57,6 +57,7 @@ The "Indexed Text" tab shows the results when running the keyword search module \image html keyword-search-ocr-indexed-text.png +\anchor keyword_search_ocr_config By default, OCR is only configured for English text. Its configuration depends on the presence of language files (called "traineddata" files) that exist in a location that Autopsy can understand. To add support for more languages, you will need to download additional "traineddata" and move them to the right location. 
The following steps breakdown this process for you: diff --git a/docs/doxygen-user/machine_translation.dox b/docs/doxygen-user/machine_translation.dox index cec795d932..4609c81257 100644 --- a/docs/doxygen-user/machine_translation.dox +++ b/docs/doxygen-user/machine_translation.dox @@ -17,6 +17,8 @@ To set up a machine translation service, go to Options->Tools and then select th Each service will require slightly different configuration steps. After setting everything up, you can run a quick check that the service is set up correctly using the "Test" button. +The checkbox at the bottom allows you to enable or disable optical character recognition (OCR). When enabled, if you select an image in the \ref mt_content_viewer "content viewer" Autopsy will use OCR to attempt to extract text to be translated. Instructions for installing OCR packages for different languages can be found on the \ref keyword_search_ocr_config "Keyword Search page". + \section mt_file_names Translating File Names You can use machine translation to automatically translate file and folder names, such as the ones seen below: @@ -49,4 +51,12 @@ Then use the drop-down menu on the right to change from "Original Text" to "Tran \image html mt_message_translated.png +If you've enabled OCR as described in the \ref mt_config section above, you can extract and translate text from images. Here is an image containing the beginning of a French poem: + +\image html mt_ocr_image.png + +If you go to the Text tab and then the Translation viewer it will use OCR to read text from the image and then display the translation. + +\image html mt_ocr_result.png + */ \ No newline at end of file