From 839c766d9a8a563b645810d8485abea000f781f6 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Thu, 13 Dec 2018 14:05:56 -0500
Subject: [PATCH 1/4] Pulled Tesseract use out of Tika and allowed OCR runs
 to be cancelled

---
 .../textextractors/TikaTextExtractor.java     | 190 +++++++++++++++---
 1 file changed, 163 insertions(+), 27 deletions(-)
diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index 9b766a9e9e..fbd2150ff4 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -21,7 +21,10 @@ package org.sleuthkit.autopsy.textextractors;
 import com.google.common.collect.ImmutableList;
 import com.google.common.io.CharSource;
 import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.PushbackReader;
 import java.io.Reader;
 import java.nio.file.Paths;
@@ -29,6 +32,7 @@ import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Objects;
+import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
@@ -50,8 +54,14 @@ import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.openide.util.NbBundle;
 import org.openide.modules.InstalledFileLocator;
 import org.openide.util.Lookup;
+import org.sleuthkit.autopsy.casemodule.Case;
+import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
+import org.sleuthkit.autopsy.coreutils.ExecUtil;
+import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
 import org.sleuthkit.autopsy.coreutils.PlatformUtil;
+import org.sleuthkit.autopsy.datamodel.ContentUtils;
 import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig;
+import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
@@ -121,6 +131,7 @@ final class TikaTextExtractor extends TextExtractor {
     private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
     private static final File TESSERACT_PATH = locateTesseractExecutable();
     private static final String LANGUAGE_PACKS = getLanguagePacks();
+    private ProcessTerminator processTerminator;
 
     private static final List<String> TIKA_SUPPORTED_TYPES
             = new Tika().getParser().getSupportedTypes(new ParseContext())
@@ -145,7 +156,7 @@ final class TikaTextExtractor extends TextExtractor {
      */
     @Override
     public Reader getReader() throws ExtractionException {
-        ReadContentInputStream stream = new ReadContentInputStream(content);
+        InputStream stream = new ReadContentInputStream(content);
 
         Metadata metadata = new Metadata();
         ParseContext parseContext = new ParseContext();
@@ -158,35 +169,44 @@ final class TikaTextExtractor extends TextExtractor {
         officeParserConfig.setUseSAXDocxExtractor(true);
         parseContext.set(OfficeParserConfig.class, officeParserConfig);
 
-        // configure OCR if it is enabled in KWS settings and installed on the machine
+        //If Tesseract has been and installed and is set to be used....
         if (TESSERACT_PATH != null && tesseractOCREnabled && PlatformUtil.isWindowsOS() == true) {
+            if (content instanceof AbstractFile) {
+                AbstractFile file = ((AbstractFile) content);
+                //Run OCR on images with Tesseract directly. 
+                //Reassign the stream we will send to Tika to point to the
+                //output file produced by Tesseract.
+                if (file.getMIMEType().toLowerCase().contains("image")) {
+                    stream = runOcrAndGetOutputStream(file);
+                } else {
+                    //Otherwise, go through Tika for PDFs so that it can
+                    //extract images and run Tesseract on them.     
+                    PDFParserConfig pdfConfig = new PDFParserConfig();
 
-            // configure PDFParser. 
-            PDFParserConfig pdfConfig = new PDFParserConfig();
+                    // Extracting the inline images and letting Tesseract run on each inline image.
+                    // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
+                    // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
+                    pdfConfig.setExtractInlineImages(true);
+                    // Multiple pages within a PDF file might refer to the same underlying image.
+                    pdfConfig.setExtractUniqueInlineImagesOnly(true);
+                    parseContext.set(PDFParserConfig.class, pdfConfig);
 
-            // Extracting the inline images and letting Tesseract run on each inline image.
-            // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
-            // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
-            pdfConfig.setExtractInlineImages(true);
-            // Multiple pages within a PDF file might refer to the same underlying image.
-            pdfConfig.setExtractUniqueInlineImagesOnly(true);
-            parseContext.set(PDFParserConfig.class, pdfConfig);
-
-            // Configure Tesseract parser to perform OCR
-            TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
-            String tesseractFolder = TESSERACT_PATH.getParent();
-            ocrConfig.setTesseractPath(tesseractFolder);
-            // Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata".
-            // If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them
-            ocrConfig.setLanguage(LANGUAGE_PACKS);
-            parseContext.set(TesseractOCRConfig.class, ocrConfig);
+                    // Configure Tesseract parser to perform OCR
+                    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
+                    String tesseractFolder = TESSERACT_PATH.getParent();
+                    ocrConfig.setTesseractPath(tesseractFolder);
+                    // Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata".
+                    // If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them
+                    ocrConfig.setLanguage(LANGUAGE_PACKS);
+                    parseContext.set(TesseractOCRConfig.class, ocrConfig);
+                }
+            }
         }
 
-        //Parse the file in a task, a convenient way to have a timeout...
-        final Future<Reader> future = tikaParseExecutor.submit(() -> new ParsingReader(parser, stream, metadata, parseContext));
+        //Make the creation of a TikaReader a cancellable future in case it takes too long
+        Future<Reader> future = tikaParseExecutor.submit(new GetTikaReader(parser, stream, metadata, parseContext));
         try {
             final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
-
             //check if the reader is empty
             PushbackReader pushbackReader = new PushbackReader(tikaReader);
             int read = pushbackReader.read();
@@ -212,6 +232,119 @@ final class TikaTextExtractor extends TextExtractor {
         }
     }
 
+    /**
+     * Run OCR and return the file stream produced by Tesseract.
+     *
+     * @param file Image file to run OCR on
+     *
+     * @return InputStream connected to the output file that Tesseract produced.
+     *
+     * @throws ExtractionException If there is no current case or an I/O
+     *                             error occurs while running Tesseract.
+     */
+    private InputStream runOcrAndGetOutputStream(AbstractFile file) throws ExtractionException {
+        File inputFile = null;
+        File outputFile = null;
+        try {
+            //Write file to temp directory
+            String localDiskPath = Case.getCurrentCaseThrows().getTempDirectory()
+                    + File.separator + file.getId() + file.getName();
+            inputFile = new File(localDiskPath);
+            ContentUtils.writeToFile(content, inputFile);
+
+            //Build tesseract commands
+            ProcessBuilder process = new ProcessBuilder();
+            String outputFilePath = Case.getCurrentCaseThrows().getTempDirectory()
+                    + File.separator + file.getId() + "output";
+
+            String executeablePath = TESSERACT_PATH.toString();
+            process.command(executeablePath,
+                    //Source image path
+                    String.format("\"%s\"", inputFile.getAbsolutePath()),
+                    //Output path
+                    String.format("\"%s\"", outputFilePath),
+                    //language pack command flag
+                    "-l",
+                    LANGUAGE_PACKS);
+
+            //If the ProcessTerminator was supplied during 
+            //configuration apply it here.
+            if (processTerminator != null) {
+                ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
+            } else {
+                ExecUtil.execute(process);
+            }
+
+            //Open an input stream on the output file to send to tika.
+            //Tesseract spits out a .txt file
+            outputFile = new File(outputFilePath + ".txt");
+            //When CleanUpStream is closed, it automatically 
+            //deletes the outputFile in the temp directory.
+            return new CleanUpStream(outputFile);
+        } catch (NoCurrentCaseException | IOException ex) {
+            if (outputFile != null) {
+                outputFile.delete();
+            }
+            throw new ExtractionException("Could not successfully run Tesseract", ex);
+        } finally {
+            if (inputFile != null) {
+                inputFile.delete();
+            }
+        }
+    }
+
+    /**
+     * Wraps the creation of a TikaReader into a Future so that it can be
+     * cancelled.
+     */
+    private class GetTikaReader implements Callable<Reader> {
+
+        private final AutoDetectParser parser;
+        private final InputStream stream;
+        private final Metadata metadata;
+        private final ParseContext parseContext;
+
+        public GetTikaReader(AutoDetectParser parser, InputStream stream,
+                Metadata metadata, ParseContext parseContext) {
+            this.parser = parser;
+            this.stream = stream;
+            this.metadata = metadata;
+            this.parseContext = parseContext;
+        }
+
+        @Override
+        public Reader call() throws Exception {
+            return new ParsingReader(parser, stream, metadata, parseContext);
+        }
+    }
+
+    /**
+     * Automatically deletes the underlying File when the close() method is
+     * called. This is used to delete the output file produced by Tesseract
+     * once it has been read by Tika.
+     */
+    private class CleanUpStream extends FileInputStream {
+
+        private File file;
+
+        public CleanUpStream(File file) throws FileNotFoundException {
+            super(file);
+            this.file = file;
+        }
+
+        @Override
+        public void close() throws IOException {
+            try {
+                super.close();
+            } finally {
+                if (file != null) {
+                    file.delete();
+                    file = null;
+                }
+            }
+        }
+    }
+
     /**
      * Finds and returns the path to the Tesseract executable, if able.
      *
@@ -339,12 +472,15 @@ final class TikaTextExtractor extends TextExtractor {
     public void setExtractionSettings(Lookup context) {
         if (context != null) {
             ImageFileExtractionConfig configInstance = context.lookup(ImageFileExtractionConfig.class);
-            if (configInstance == null) {
-                return;
-            }
-            if (Objects.nonNull(configInstance.getOCREnabled())) {
+
+            if (configInstance != null && Objects.nonNull(configInstance.getOCREnabled())) {
                 this.tesseractOCREnabled = configInstance.getOCREnabled();
             }
+
+            ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
+            if (terminatorInstance != null) {
+                this.processTerminator = terminatorInstance;
+            }
         }
     }
 

From a965b50b1678e8f63b13b79fa1f94ed69634d819 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Fri, 14 Dec 2018 08:18:19 -0500
Subject: [PATCH 2/4] Did code review suggestions and fixed Codacy stuff

---
 .../textextractors/TikaTextExtractor.java     | 154 +++++++++++-------
 1 file changed, 92 insertions(+), 62 deletions(-)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index fbd2150ff4..39f483000b 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -120,7 +120,7 @@ final class TikaTextExtractor extends TextExtractor {
 
     private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
 
-    private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
+    private final ExecutorService executorService = Executors.newSingleThreadExecutor();
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
 
     private final AutoDetectParser parser = new AutoDetectParser();
@@ -132,6 +132,7 @@ final class TikaTextExtractor extends TextExtractor {
     private static final File TESSERACT_PATH = locateTesseractExecutable();
     private static final String LANGUAGE_PACKS = getLanguagePacks();
     private ProcessTerminator processTerminator;
+    private static final String TESSERACT_OUTPUT_FILE_NAME = "output";
 
     private static final List<String> TIKA_SUPPORTED_TYPES
             = new Tika().getParser().getSupportedTypes(new ParseContext())
@@ -143,6 +144,18 @@ final class TikaTextExtractor extends TextExtractor {
         this.content = content;
     }
 
+    /**
+     * Reports whether OCR should be used for this extraction. All three must
+     * hold: a Tesseract executable was located, OCR was enabled through the
+     * extraction settings, and the platform is Windows.
+     *
+     * @return True if OCR is set to be used, false otherwise.
+     */
+    private boolean ocrEnabled() {
+        return TESSERACT_PATH != null && tesseractOCREnabled
+                && PlatformUtil.isWindowsOS();
+    }
+
     /**
      * Returns a reader that will iterate over the text extracted from Apache
      * Tika.
@@ -156,12 +169,48 @@ final class TikaTextExtractor extends TextExtractor {
      */
     @Override
     public Reader getReader() throws ExtractionException {
-        InputStream stream = new ReadContentInputStream(content);
+        InputStream stream = null;
 
-        Metadata metadata = new Metadata();
         ParseContext parseContext = new ParseContext();
         parseContext.set(Parser.class, parser);
 
+        if (ocrEnabled() && content instanceof AbstractFile) {
+            AbstractFile file = ((AbstractFile) content);
+            //Run OCR on images with Tesseract directly. 
+            if (file.getMIMEType().toLowerCase().startsWith("image/")) {
+                stream = runOcrAndGetOutputStream(file);
+            } else {
+                //Otherwise, go through Tika for PDFs so that it can
+                //extract images and run Tesseract on them.     
+                PDFParserConfig pdfConfig = new PDFParserConfig();
+
+                // Extracting the inline images and letting Tesseract run on each inline image.
+                // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
+                // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
+                pdfConfig.setExtractInlineImages(true);
+                // Multiple pages within a PDF file might refer to the same underlying image.
+                pdfConfig.setExtractUniqueInlineImagesOnly(true);
+                parseContext.set(PDFParserConfig.class, pdfConfig);
+
+                // Configure Tesseract parser to perform OCR
+                TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
+                String tesseractFolder = TESSERACT_PATH.getParent();
+                ocrConfig.setTesseractPath(tesseractFolder);
+                /*
+                 * Tesseract expects language data packs to be in a
+                 * subdirectory of tesseractFolder, in a folder called
+                 * "tessdata". If they are stored somewhere else, use
+                 * ocrConfig.setTessdataPath(String tessdataPath) to point
+                 * to them
+                 */
+                ocrConfig.setLanguage(LANGUAGE_PACKS);
+                parseContext.set(TesseractOCRConfig.class, ocrConfig);
+            }
+        } else {
+            stream = new ReadContentInputStream(content);
+        }
+
+        Metadata metadata = new Metadata();
         // Use the more memory efficient Tika SAX parsers for DOCX and
         // PPTX files (it already uses SAX for XLSX).
         OfficeParserConfig officeParserConfig = new OfficeParserConfig();
@@ -169,63 +218,39 @@ final class TikaTextExtractor extends TextExtractor {
         officeParserConfig.setUseSAXDocxExtractor(true);
         parseContext.set(OfficeParserConfig.class, officeParserConfig);
 
-        //If Tesseract has been and installed and is set to be used....
-        if (TESSERACT_PATH != null && tesseractOCREnabled && PlatformUtil.isWindowsOS() == true) {
-            if (content instanceof AbstractFile) {
-                AbstractFile file = ((AbstractFile) content);
-                //Run OCR on images with Tesseract directly. 
-                //Reassign the stream we will send to Tika to point to the
-                //output file produced by Tesseract.
-                if (file.getMIMEType().toLowerCase().contains("image")) {
-                    stream = runOcrAndGetOutputStream(file);
-                } else {
-                    //Otherwise, go through Tika for PDFs so that it can
-                    //extract images and run Tesseract on them.     
-                    PDFParserConfig pdfConfig = new PDFParserConfig();
-
-                    // Extracting the inline images and letting Tesseract run on each inline image.
-                    // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
-                    // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
-                    pdfConfig.setExtractInlineImages(true);
-                    // Multiple pages within a PDF file might refer to the same underlying image.
-                    pdfConfig.setExtractUniqueInlineImagesOnly(true);
-                    parseContext.set(PDFParserConfig.class, pdfConfig);
-
-                    // Configure Tesseract parser to perform OCR
-                    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
-                    String tesseractFolder = TESSERACT_PATH.getParent();
-                    ocrConfig.setTesseractPath(tesseractFolder);
-                    // Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata".
-                    // If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them
-                    ocrConfig.setLanguage(LANGUAGE_PACKS);
-                    parseContext.set(TesseractOCRConfig.class, ocrConfig);
-                }
-            }
-        }
-
         //Make the creation of a TikaReader a cancellable future in case it takes too long
-        Future<Reader> future = tikaParseExecutor.submit(new GetTikaReader(parser, stream, metadata, parseContext));
+        Future<Reader> future = executorService.submit(
+                new GetTikaReader(parser, stream, metadata, parseContext));
         try {
-            final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
+            final Reader tikaReader = future.get(getTimeout(content.getSize()),
+                    TimeUnit.SECONDS);
             //check if the reader is empty
             PushbackReader pushbackReader = new PushbackReader(tikaReader);
             int read = pushbackReader.read();
             if (read == -1) {
-                throw new ExtractionException("Unable to extract text: Tika returned empty reader for " + content);
+                throw new ExtractionException("Unable to extract text: "
+                        + "Tika returned empty reader for " + content);
             }
             pushbackReader.unread(read);
 
             //concatenate parsed content and meta data into a single reader.
             CharSource metaDataCharSource = getMetaDataCharSource(metadata);
-            return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
+            return CharSource.concat(new ReaderCharSource(pushbackReader),
+                    metaDataCharSource).openStream();
         } catch (TimeoutException te) {
-            final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
+            final String msg = NbBundle.getMessage(this.getClass(),
+                    "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
+                    content.getId(), content.getName());
             throw new ExtractionException(msg, te);
         } catch (ExtractionException ex) {
             throw ex;
         } catch (Exception ex) {
-            tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
-            final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
+            tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the "
+                    + "content" + content.getId() + ": " + content.getName(),
+                    ex.getCause()); //NON-NLS
+            final String msg = NbBundle.getMessage(this.getClass(),
+                    "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
+                    content.getId(), content.getName());
             throw new ExtractionException(msg, ex);
         } finally {
             future.cancel(true);
@@ -246,26 +271,24 @@ final class TikaTextExtractor extends TextExtractor {
         File inputFile = null;
         File outputFile = null;
         try {
-            //Write file to temp directory
-            String localDiskPath = Case.getCurrentCaseThrows().getTempDirectory()
-                    + File.separator + file.getId() + file.getName();
-            inputFile = new File(localDiskPath);
+            //Prefixing the name with the file id makes it unique
+            String tempFileName = file.getId() + file.getName();
+            inputFile = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
+                    tempFileName).toFile();
             ContentUtils.writeToFile(content, inputFile);
 
+            String tempOutputName = file.getId() + TESSERACT_OUTPUT_FILE_NAME;
+            String outputFilePath = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
+                    tempOutputName).toString();
+            String executeablePath = TESSERACT_PATH.toString();
+
             //Build tesseract commands
             ProcessBuilder process = new ProcessBuilder();
-            String outputFilePath = Case.getCurrentCaseThrows().getTempDirectory()
-                    + File.separator + file.getId() + "output";
-
-            String executeablePath = TESSERACT_PATH.toString();
             process.command(executeablePath,
-                    //Source image path
                     String.format("\"%s\"", inputFile.getAbsolutePath()),
-                    //Output path
                     String.format("\"%s\"", outputFilePath),
                     //language pack command flag
-                    "-l",
-                    LANGUAGE_PACKS);
+                    "-l", LANGUAGE_PACKS);
 
             //If the ProcessTerminator was supplied during 
             //configuration apply it here.
@@ -274,12 +297,9 @@ final class TikaTextExtractor extends TextExtractor {
             } else {
                 ExecUtil.execute(process);
             }
-
-            //Open an input stream on the output file to send to tika.
-            //Tesseract spits out a .txt file
+            
             outputFile = new File(outputFilePath + ".txt");
-            //When CleanUpStream is closed, it automatically 
-            //deletes the outputFile in the temp directory.
+            //Open a stream of the Tesseract text file and send this to Tika
             return new CleanUpStream(outputFile);
         } catch (NoCurrentCaseException | IOException ex) {
             if (outputFile != null) {
@@ -298,7 +318,6 @@ final class TikaTextExtractor extends TextExtractor {
      * cancelled.
      */
     private class GetTikaReader implements Callable<Reader> {
-
         private final AutoDetectParser parser;
         private final InputStream stream;
         private final Metadata metadata;
@@ -327,11 +346,22 @@ final class TikaTextExtractor extends TextExtractor {
 
         private File file;
 
+        /**
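+         * Opens the file for reading and remembers it for deletion on close.
+         *
+         * @param file File to read; deleted when this stream is closed.
+         * @throws FileNotFoundException If the file cannot be opened for reading.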
+         */
         public CleanUpStream(File file) throws FileNotFoundException {
             super(file);
             this.file = file;
         }
 
+        /**
+         * Delete this underlying file when close is called.
+         * 
+         * @throws IOException 
+         */
         @Override
         public void close() throws IOException {
             try {

From 7c6b21783049b820e4951edafaa3bcd26419b331 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Fri, 14 Dec 2018 08:48:16 -0500
Subject: [PATCH 3/4] Fixed PDF bug

---
 .../org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index 39f483000b..5827f85fc3 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -205,6 +205,8 @@ final class TikaTextExtractor extends TextExtractor {
                  */
                 ocrConfig.setLanguage(LANGUAGE_PACKS);
                 parseContext.set(TesseractOCRConfig.class, ocrConfig);
+                
+                stream = new ReadContentInputStream(content);
             }
         } else {
             stream = new ReadContentInputStream(content);

From b2a258e78fc01ceace31740c0e8567711728e873 Mon Sep 17 00:00:00 2001
From: "U-BASIS\\dsmyda" <dsmyda@win-dsmyd-4990.basistech.net>
Date: Fri, 14 Dec 2018 09:38:56 -0500
Subject: [PATCH 4/4] Made thread factory have named threads

---
 .../sleuthkit/autopsy/textextractors/TikaTextExtractor.java | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
index 5827f85fc3..8c0ced3135 100644
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.textextractors;
 
 import com.google.common.collect.ImmutableList;
 import com.google.common.io.CharSource;
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
@@ -36,6 +37,7 @@ import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
+import java.util.concurrent.ThreadFactory;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.logging.Level;
@@ -120,7 +122,9 @@ final class TikaTextExtractor extends TextExtractor {
 
     private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
 
-    private final ExecutorService executorService = Executors.newSingleThreadExecutor();
+    private final ThreadFactory tikaThreadFactory = 
+            new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();
+    private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
 
     private final AutoDetectParser parser = new AutoDetectParser();