From 839c766d9a8a563b645810d8485abea000f781f6 Mon Sep 17 00:00:00 2001 From: "U-BASIS\\dsmyda" Date: Thu, 13 Dec 2018 14:05:56 -0500 Subject: [PATCH 1/4] Pulled the Tesseract use out of Tika and allow for them to be cancelled --- .../textextractors/TikaTextExtractor.java | 190 +++++++++++++++--- 1 file changed, 163 insertions(+), 27 deletions(-) diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java index 9b766a9e9e..fbd2150ff4 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java @@ -21,7 +21,10 @@ package org.sleuthkit.autopsy.textextractors; import com.google.common.collect.ImmutableList; import com.google.common.io.CharSource; import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.IOException; +import java.io.InputStream; import java.io.PushbackReader; import java.io.Reader; import java.nio.file.Paths; @@ -29,6 +32,7 @@ import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Objects; +import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; @@ -50,8 +54,14 @@ import org.apache.tika.parser.pdf.PDFParserConfig; import org.openide.util.NbBundle; import org.openide.modules.InstalledFileLocator; import org.openide.util.Lookup; +import org.sleuthkit.autopsy.casemodule.Case; +import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException; +import org.sleuthkit.autopsy.coreutils.ExecUtil; +import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator; import org.sleuthkit.autopsy.coreutils.PlatformUtil; +import org.sleuthkit.autopsy.datamodel.ContentUtils; import org.sleuthkit.autopsy.textextractors.extractionconfigs.ImageFileExtractionConfig; +import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.ReadContentInputStream; @@ -121,6 +131,7 @@ final class TikaTextExtractor extends TextExtractor { private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS private static final File TESSERACT_PATH = locateTesseractExecutable(); private static final String LANGUAGE_PACKS = getLanguagePacks(); + private ProcessTerminator processTerminator; private static final List TIKA_SUPPORTED_TYPES = new Tika().getParser().getSupportedTypes(new ParseContext()) @@ -145,7 +156,7 @@ final class TikaTextExtractor extends TextExtractor { */ @Override public Reader getReader() throws ExtractionException { - ReadContentInputStream stream = new ReadContentInputStream(content); + InputStream stream = new ReadContentInputStream(content); Metadata metadata = new Metadata(); ParseContext parseContext = new ParseContext(); @@ -158,35 +169,44 @@ final class TikaTextExtractor extends TextExtractor { officeParserConfig.setUseSAXDocxExtractor(true); parseContext.set(OfficeParserConfig.class, officeParserConfig); - // configure OCR if it is enabled in KWS settings and installed on the machine + //If Tesseract has been and installed and is set to be used.... if (TESSERACT_PATH != null && tesseractOCREnabled && PlatformUtil.isWindowsOS() == true) { + if (content instanceof AbstractFile) { + AbstractFile file = ((AbstractFile) content); + //Run OCR on images with Tesseract directly. + //Reassign the stream we will send to Tika to point to the + //output file produced by Tesseract. + if (file.getMIMEType().toLowerCase().contains("image")) { + stream = runOcrAndGetOutputStream(file); + } else { + //Otherwise, go through Tika for PDFs so that it can + //extract images and run Tesseract on them. + PDFParserConfig pdfConfig = new PDFParserConfig(); - // configure PDFParser. - PDFParserConfig pdfConfig = new PDFParserConfig(); + // Extracting the inline images and letting Tesseract run on each inline image. + // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29 + // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html + pdfConfig.setExtractInlineImages(true); + // Multiple pages within a PDF file might refer to the same underlying image. + pdfConfig.setExtractUniqueInlineImagesOnly(true); + parseContext.set(PDFParserConfig.class, pdfConfig); - // Extracting the inline images and letting Tesseract run on each inline image. - // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29 - // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html - pdfConfig.setExtractInlineImages(true); - // Multiple pages within a PDF file might refer to the same underlying image. - pdfConfig.setExtractUniqueInlineImagesOnly(true); - parseContext.set(PDFParserConfig.class, pdfConfig); - - // Configure Tesseract parser to perform OCR - TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); - String tesseractFolder = TESSERACT_PATH.getParent(); - ocrConfig.setTesseractPath(tesseractFolder); - // Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata". - // If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them - ocrConfig.setLanguage(LANGUAGE_PACKS); - parseContext.set(TesseractOCRConfig.class, ocrConfig); + // Configure Tesseract parser to perform OCR + TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); + String tesseractFolder = TESSERACT_PATH.getParent(); + ocrConfig.setTesseractPath(tesseractFolder); + // Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata". + // If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them + ocrConfig.setLanguage(LANGUAGE_PACKS); + parseContext.set(TesseractOCRConfig.class, ocrConfig); + } + } } - //Parse the file in a task, a convenient way to have a timeout... - final Future future = tikaParseExecutor.submit(() -> new ParsingReader(parser, stream, metadata, parseContext)); + //Make the creation of a TikaReader a cancellable future in case it takes too long + Future future = tikaParseExecutor.submit(new GetTikaReader(parser, stream, metadata, parseContext)); try { final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS); - //check if the reader is empty PushbackReader pushbackReader = new PushbackReader(tikaReader); int read = pushbackReader.read(); @@ -212,6 +232,119 @@ final class TikaTextExtractor extends TextExtractor { } } + /** + * Run OCR and return the file stream produced by Tesseract. + * + * @param file Image file to run OCR on + * + * @return InputStream connected to the output file that Tesseract produced. + * + * @throws + * org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException + */ + private InputStream runOcrAndGetOutputStream(AbstractFile file) throws ExtractionException { + File inputFile = null; + File outputFile = null; + try { + //Write file to temp directory + String localDiskPath = Case.getCurrentCaseThrows().getTempDirectory() + + File.separator + file.getId() + file.getName(); + inputFile = new File(localDiskPath); + ContentUtils.writeToFile(content, inputFile); + + //Build tesseract commands + ProcessBuilder process = new ProcessBuilder(); + String outputFilePath = Case.getCurrentCaseThrows().getTempDirectory() + + File.separator + file.getId() + "output"; + + String executeablePath = TESSERACT_PATH.toString(); + process.command(executeablePath, + //Source image path + String.format("\"%s\"", inputFile.getAbsolutePath()), + //Output path + String.format("\"%s\"", outputFilePath), + //language pack command flag + "-l", + LANGUAGE_PACKS); + + //If the ProcessTerminator was supplied during + //configuration apply it here. + if (processTerminator != null) { + ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator); + } else { + ExecUtil.execute(process); + } + + //Open an input stream on the output file to send to tika. + //Tesseract spits out a .txt file + outputFile = new File(outputFilePath + ".txt"); + //When CleanUpStream is closed, it automatically + //deletes the outputFile in the temp directory. + return new CleanUpStream(outputFile); + } catch (NoCurrentCaseException | IOException ex) { + if (outputFile != null) { + outputFile.delete(); + } + throw new ExtractionException("Could not successfully run Tesseract", ex); + } finally { + if (inputFile != null) { + inputFile.delete(); + } + } + } + + /** + * Wraps the creation of a TikaReader into a Future so that it can be + * cancelled. + */ + private class GetTikaReader implements Callable { + + private final AutoDetectParser parser; + private final InputStream stream; + private final Metadata metadata; + private final ParseContext parseContext; + + public GetTikaReader(AutoDetectParser parser, InputStream stream, + Metadata metadata, ParseContext parseContext) { + this.parser = parser; + this.stream = stream; + this.metadata = metadata; + this.parseContext = parseContext; + } + + @Override + public Reader call() throws Exception { + return new ParsingReader(parser, stream, metadata, parseContext); + } + } + + /** + * Automatically deletes the underlying File when the close() method is + * called. This is used to delete the Output file produced from Tesseract + * once it has been read by Tika. + */ + private class CleanUpStream extends FileInputStream { + + private File file; + + public CleanUpStream(File file) throws FileNotFoundException { + super(file); + this.file = file; + } + + @Override + public void close() throws IOException { + try { + super.close(); + } finally { + if (file != null) { + file.delete(); + file = null; + } + } + } + } + /** * Finds and returns the path to the Tesseract executable, if able. * @@ -339,12 +472,15 @@ final class TikaTextExtractor extends TextExtractor { public void setExtractionSettings(Lookup context) { if (context != null) { ImageFileExtractionConfig configInstance = context.lookup(ImageFileExtractionConfig.class); - if (configInstance == null) { - return; - } - if (Objects.nonNull(configInstance.getOCREnabled())) { + + if (configInstance != null && Objects.nonNull(configInstance.getOCREnabled())) { this.tesseractOCREnabled = configInstance.getOCREnabled(); } + + ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class); + if (terminatorInstance != null) { + this.processTerminator = terminatorInstance; + } } } From a965b50b1678e8f63b13b79fa1f94ed69634d819 Mon Sep 17 00:00:00 2001 From: "U-BASIS\\dsmyda" Date: Fri, 14 Dec 2018 08:18:19 -0500 Subject: [PATCH 2/4] Did code review suggestions and fixed Codacy stuff --- .../textextractors/TikaTextExtractor.java | 154 +++++++++++------- 1 file changed, 92 insertions(+), 62 deletions(-) diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java index fbd2150ff4..39f483000b 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java @@ -120,7 +120,7 @@ final class TikaTextExtractor extends TextExtractor { private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS - private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor(); + private final ExecutorService executorService = Executors.newSingleThreadExecutor(); private static final String SQLITE_MIMETYPE = "application/x-sqlite3"; private final AutoDetectParser parser = new AutoDetectParser(); @@ -132,6 +132,7 @@ final class TikaTextExtractor extends TextExtractor { private static final File TESSERACT_PATH = locateTesseractExecutable(); private static final String LANGUAGE_PACKS = getLanguagePacks(); private ProcessTerminator processTerminator; + private static final String TESSERACT_OUTPUT_FILE_NAME = "output"; private static final List TIKA_SUPPORTED_TYPES = new Tika().getParser().getSupportedTypes(new ParseContext()) @@ -143,6 +144,18 @@ final class TikaTextExtractor extends TextExtractor { this.content = content; } + /** + * If Tesseract has been installed and is set to be used through + * configuration, then ocr is enabled. OCR can only currently be run on + * Windows OS. + * + * @return Flag indicating if OCR is set to be used. + */ + private boolean ocrEnabled() { + return TESSERACT_PATH != null && tesseractOCREnabled + && PlatformUtil.isWindowsOS() == true; + } + /** * Returns a reader that will iterate over the text extracted from Apache * Tika. @@ -156,12 +169,48 @@ final class TikaTextExtractor extends TextExtractor { */ @Override public Reader getReader() throws ExtractionException { - InputStream stream = new ReadContentInputStream(content); + InputStream stream = null; - Metadata metadata = new Metadata(); ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, parser); + if (ocrEnabled() && content instanceof AbstractFile) { + AbstractFile file = ((AbstractFile) content); + //Run OCR on images with Tesseract directly. + if (file.getMIMEType().toLowerCase().startsWith("image/")) { + stream = runOcrAndGetOutputStream(file); + } else { + //Otherwise, go through Tika for PDFs so that it can + //extract images and run Tesseract on them. + PDFParserConfig pdfConfig = new PDFParserConfig(); + + // Extracting the inline images and letting Tesseract run on each inline image. + // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29 + // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html + pdfConfig.setExtractInlineImages(true); + // Multiple pages within a PDF file might refer to the same underlying image. + pdfConfig.setExtractUniqueInlineImagesOnly(true); + parseContext.set(PDFParserConfig.class, pdfConfig); + + // Configure Tesseract parser to perform OCR + TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); + String tesseractFolder = TESSERACT_PATH.getParent(); + ocrConfig.setTesseractPath(tesseractFolder); + /* + * Tesseract expects language data packs to be in a + * subdirectory of tesseractFolder, in a folder called + * "tessdata". If they are stored somewhere else, use + * ocrConfig.setTessdataPath(String tessdataPath) to point + * to them + */ + ocrConfig.setLanguage(LANGUAGE_PACKS); + parseContext.set(TesseractOCRConfig.class, ocrConfig); + } + } else { + stream = new ReadContentInputStream(content); + } + + Metadata metadata = new Metadata(); // Use the more memory efficient Tika SAX parsers for DOCX and // PPTX files (it already uses SAX for XLSX). OfficeParserConfig officeParserConfig = new OfficeParserConfig(); @@ -169,63 +218,39 @@ final class TikaTextExtractor extends TextExtractor { officeParserConfig.setUseSAXDocxExtractor(true); parseContext.set(OfficeParserConfig.class, officeParserConfig); - //If Tesseract has been and installed and is set to be used.... - if (TESSERACT_PATH != null && tesseractOCREnabled && PlatformUtil.isWindowsOS() == true) { - if (content instanceof AbstractFile) { - AbstractFile file = ((AbstractFile) content); - //Run OCR on images with Tesseract directly. - //Reassign the stream we will send to Tika to point to the - //output file produced by Tesseract. - if (file.getMIMEType().toLowerCase().contains("image")) { - stream = runOcrAndGetOutputStream(file); - } else { - //Otherwise, go through Tika for PDFs so that it can - //extract images and run Tesseract on them. - PDFParserConfig pdfConfig = new PDFParserConfig(); - - // Extracting the inline images and letting Tesseract run on each inline image. - // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29 - // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html - pdfConfig.setExtractInlineImages(true); - // Multiple pages within a PDF file might refer to the same underlying image. - pdfConfig.setExtractUniqueInlineImagesOnly(true); - parseContext.set(PDFParserConfig.class, pdfConfig); - - // Configure Tesseract parser to perform OCR - TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); - String tesseractFolder = TESSERACT_PATH.getParent(); - ocrConfig.setTesseractPath(tesseractFolder); - // Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata". - // If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them - ocrConfig.setLanguage(LANGUAGE_PACKS); - parseContext.set(TesseractOCRConfig.class, ocrConfig); - } - } - } - //Make the creation of a TikaReader a cancellable future in case it takes too long - Future future = tikaParseExecutor.submit(new GetTikaReader(parser, stream, metadata, parseContext)); + Future future = executorService.submit( + new GetTikaReader(parser, stream, metadata, parseContext)); try { - final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS); + final Reader tikaReader = future.get(getTimeout(content.getSize()), + TimeUnit.SECONDS); //check if the reader is empty PushbackReader pushbackReader = new PushbackReader(tikaReader); int read = pushbackReader.read(); if (read == -1) { - throw new ExtractionException("Unable to extract text: Tika returned empty reader for " + content); + throw new ExtractionException("Unable to extract text: " + + "Tika returned empty reader for " + content); } pushbackReader.unread(read); //concatenate parsed content and meta data into a single reader. CharSource metaDataCharSource = getMetaDataCharSource(metadata); - return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream(); + return CharSource.concat(new ReaderCharSource(pushbackReader), + metaDataCharSource).openStream(); } catch (TimeoutException te) { - final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName()); + final String msg = NbBundle.getMessage(this.getClass(), + "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", + content.getId(), content.getName()); throw new ExtractionException(msg, te); } catch (ExtractionException ex) { throw ex; } catch (Exception ex) { - tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS - final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName()); + tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the " + + "content" + content.getId() + ": " + content.getName(), + ex.getCause()); //NON-NLS + final String msg = NbBundle.getMessage(this.getClass(), + "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", + content.getId(), content.getName()); throw new ExtractionException(msg, ex); } finally { future.cancel(true); @@ -246,26 +271,24 @@ final class TikaTextExtractor extends TextExtractor { File inputFile = null; File outputFile = null; try { - //Write file to temp directory - String localDiskPath = Case.getCurrentCaseThrows().getTempDirectory() - + File.separator + file.getId() + file.getName(); - inputFile = new File(localDiskPath); + //Appending file id makes the name unique + String tempFileName = file.getId() + file.getName(); + inputFile = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(), + tempFileName).toFile(); ContentUtils.writeToFile(content, inputFile); + String tempOutputName = file.getId() + TESSERACT_OUTPUT_FILE_NAME; + String outputFilePath = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(), + tempOutputName).toString(); + String executeablePath = TESSERACT_PATH.toString(); + //Build tesseract commands ProcessBuilder process = new ProcessBuilder(); - String outputFilePath = Case.getCurrentCaseThrows().getTempDirectory() - + File.separator + file.getId() + "output"; - - String executeablePath = TESSERACT_PATH.toString(); process.command(executeablePath, - //Source image path String.format("\"%s\"", inputFile.getAbsolutePath()), - //Output path String.format("\"%s\"", outputFilePath), //language pack command flag - "-l", - LANGUAGE_PACKS); + "-l", LANGUAGE_PACKS); //If the ProcessTerminator was supplied during //configuration apply it here. @@ -274,12 +297,9 @@ final class TikaTextExtractor extends TextExtractor { } else { ExecUtil.execute(process); } - - //Open an input stream on the output file to send to tika. - //Tesseract spits out a .txt file + outputFile = new File(outputFilePath + ".txt"); - //When CleanUpStream is closed, it automatically - //deletes the outputFile in the temp directory. + //Open a stream of the Tesseract text file and send this to Tika return new CleanUpStream(outputFile); } catch (NoCurrentCaseException | IOException ex) { if (outputFile != null) { @@ -298,7 +318,6 @@ final class TikaTextExtractor extends TextExtractor { * cancelled. */ private class GetTikaReader implements Callable { - private final AutoDetectParser parser; private final InputStream stream; private final Metadata metadata; @@ -327,11 +346,22 @@ final class TikaTextExtractor extends TextExtractor { private File file; + /** + * Store a reference to file on construction + * + * @param file + * @throws FileNotFoundException + */ public CleanUpStream(File file) throws FileNotFoundException { super(file); this.file = file; } + /** + * Delete this underlying file when close is called. + * + * @throws IOException + */ @Override public void close() throws IOException { try { From 7c6b21783049b820e4951edafaa3bcd26419b331 Mon Sep 17 00:00:00 2001 From: "U-BASIS\\dsmyda" Date: Fri, 14 Dec 2018 08:48:16 -0500 Subject: [PATCH 3/4] Fixed PDF bug --- .../org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java index 39f483000b..5827f85fc3 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java @@ -205,6 +205,8 @@ final class TikaTextExtractor extends TextExtractor { */ ocrConfig.setLanguage(LANGUAGE_PACKS); parseContext.set(TesseractOCRConfig.class, ocrConfig); + + stream = new ReadContentInputStream(content); } } else { stream = new ReadContentInputStream(content); From b2a258e78fc01ceace31740c0e8567711728e873 Mon Sep 17 00:00:00 2001 From: "U-BASIS\\dsmyda" Date: Fri, 14 Dec 2018 09:38:56 -0500 Subject: [PATCH 4/4] Made thread factory have named threads --- .../sleuthkit/autopsy/textextractors/TikaTextExtractor.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java index 5827f85fc3..8c0ced3135 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java @@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.textextractors; import com.google.common.collect.ImmutableList; import com.google.common.io.CharSource; +import com.google.common.util.concurrent.ThreadFactoryBuilder; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -36,6 +37,7 @@ import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; +import java.util.concurrent.ThreadFactory; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.logging.Level; @@ -120,7 +122,9 @@ final class TikaTextExtractor extends TextExtractor { private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS - private final ExecutorService executorService = Executors.newSingleThreadExecutor(); + private final ThreadFactory tikaThreadFactory = + new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build(); + private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory); private static final String SQLITE_MIMETYPE = "application/x-sqlite3"; private final AutoDetectParser parser = new AutoDetectParser();