Did code review suggestions and fixed Codacy stuff

This commit is contained in:
U-BASIS\dsmyda 2018-12-14 08:18:19 -05:00
parent 54bc85b829
commit a965b50b16

View File

@ -120,7 +120,7 @@ final class TikaTextExtractor extends TextExtractor {
private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor(); private final ExecutorService executorService = Executors.newSingleThreadExecutor();
private static final String SQLITE_MIMETYPE = "application/x-sqlite3"; private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
private final AutoDetectParser parser = new AutoDetectParser(); private final AutoDetectParser parser = new AutoDetectParser();
@ -132,6 +132,7 @@ final class TikaTextExtractor extends TextExtractor {
private static final File TESSERACT_PATH = locateTesseractExecutable(); private static final File TESSERACT_PATH = locateTesseractExecutable();
private static final String LANGUAGE_PACKS = getLanguagePacks(); private static final String LANGUAGE_PACKS = getLanguagePacks();
private ProcessTerminator processTerminator; private ProcessTerminator processTerminator;
private static final String TESSERACT_OUTPUT_FILE_NAME = "output";
private static final List<String> TIKA_SUPPORTED_TYPES private static final List<String> TIKA_SUPPORTED_TYPES
= new Tika().getParser().getSupportedTypes(new ParseContext()) = new Tika().getParser().getSupportedTypes(new ParseContext())
@ -143,6 +144,18 @@ final class TikaTextExtractor extends TextExtractor {
this.content = content; this.content = content;
} }
/**
 * Reports whether OCR should be applied during text extraction.
 *
 * OCR is considered enabled only when all of the following hold: a
 * Tesseract executable was located (TESSERACT_PATH is non-null), OCR has
 * been switched on through configuration (tesseractOCREnabled), and the
 * current platform is Windows. OCR can only currently be run on Windows OS.
 *
 * @return true if OCR is set to be used, false otherwise.
 */
private boolean ocrEnabled() {
    // isWindowsOS() already yields a boolean; comparing it to 'true' is redundant.
    return TESSERACT_PATH != null
            && tesseractOCREnabled
            && PlatformUtil.isWindowsOS();
}
/** /**
* Returns a reader that will iterate over the text extracted from Apache * Returns a reader that will iterate over the text extracted from Apache
* Tika. * Tika.
@ -156,12 +169,48 @@ final class TikaTextExtractor extends TextExtractor {
*/ */
@Override @Override
public Reader getReader() throws ExtractionException { public Reader getReader() throws ExtractionException {
InputStream stream = new ReadContentInputStream(content); InputStream stream = null;
Metadata metadata = new Metadata();
ParseContext parseContext = new ParseContext(); ParseContext parseContext = new ParseContext();
parseContext.set(Parser.class, parser); parseContext.set(Parser.class, parser);
if (ocrEnabled() && content instanceof AbstractFile) {
AbstractFile file = ((AbstractFile) content);
//Run OCR on images with Tesseract directly.
if (file.getMIMEType().toLowerCase().startsWith("image/")) {
stream = runOcrAndGetOutputStream(file);
} else {
//Otherwise, go through Tika for PDFs so that it can
//extract images and run Tesseract on them.
PDFParserConfig pdfConfig = new PDFParserConfig();
// Extracting the inline images and letting Tesseract run on each inline image.
// https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
// https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
pdfConfig.setExtractInlineImages(true);
// Multiple pages within a PDF file might refer to the same underlying image.
pdfConfig.setExtractUniqueInlineImagesOnly(true);
parseContext.set(PDFParserConfig.class, pdfConfig);
// Configure Tesseract parser to perform OCR
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
String tesseractFolder = TESSERACT_PATH.getParent();
ocrConfig.setTesseractPath(tesseractFolder);
/*
* Tesseract expects language data packs to be in a
* subdirectory of tesseractFolder, in a folder called
* "tessdata". If they are stored somewhere else, use
* ocrConfig.setTessdataPath(String tessdataPath) to point
* to them
*/
ocrConfig.setLanguage(LANGUAGE_PACKS);
parseContext.set(TesseractOCRConfig.class, ocrConfig);
}
} else {
stream = new ReadContentInputStream(content);
}
Metadata metadata = new Metadata();
// Use the more memory efficient Tika SAX parsers for DOCX and // Use the more memory efficient Tika SAX parsers for DOCX and
// PPTX files (it already uses SAX for XLSX). // PPTX files (it already uses SAX for XLSX).
OfficeParserConfig officeParserConfig = new OfficeParserConfig(); OfficeParserConfig officeParserConfig = new OfficeParserConfig();
@ -169,63 +218,39 @@ final class TikaTextExtractor extends TextExtractor {
officeParserConfig.setUseSAXDocxExtractor(true); officeParserConfig.setUseSAXDocxExtractor(true);
parseContext.set(OfficeParserConfig.class, officeParserConfig); parseContext.set(OfficeParserConfig.class, officeParserConfig);
//If Tesseract has been and installed and is set to be used....
if (TESSERACT_PATH != null && tesseractOCREnabled && PlatformUtil.isWindowsOS() == true) {
if (content instanceof AbstractFile) {
AbstractFile file = ((AbstractFile) content);
//Run OCR on images with Tesseract directly.
//Reassign the stream we will send to Tika to point to the
//output file produced by Tesseract.
if (file.getMIMEType().toLowerCase().contains("image")) {
stream = runOcrAndGetOutputStream(file);
} else {
//Otherwise, go through Tika for PDFs so that it can
//extract images and run Tesseract on them.
PDFParserConfig pdfConfig = new PDFParserConfig();
// Extracting the inline images and letting Tesseract run on each inline image.
// https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
// https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
pdfConfig.setExtractInlineImages(true);
// Multiple pages within a PDF file might refer to the same underlying image.
pdfConfig.setExtractUniqueInlineImagesOnly(true);
parseContext.set(PDFParserConfig.class, pdfConfig);
// Configure Tesseract parser to perform OCR
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
String tesseractFolder = TESSERACT_PATH.getParent();
ocrConfig.setTesseractPath(tesseractFolder);
// Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata".
// If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them
ocrConfig.setLanguage(LANGUAGE_PACKS);
parseContext.set(TesseractOCRConfig.class, ocrConfig);
}
}
}
//Make the creation of a TikaReader a cancellable future in case it takes too long //Make the creation of a TikaReader a cancellable future in case it takes too long
Future<Reader> future = tikaParseExecutor.submit(new GetTikaReader(parser, stream, metadata, parseContext)); Future<Reader> future = executorService.submit(
new GetTikaReader(parser, stream, metadata, parseContext));
try { try {
final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS); final Reader tikaReader = future.get(getTimeout(content.getSize()),
TimeUnit.SECONDS);
//check if the reader is empty //check if the reader is empty
PushbackReader pushbackReader = new PushbackReader(tikaReader); PushbackReader pushbackReader = new PushbackReader(tikaReader);
int read = pushbackReader.read(); int read = pushbackReader.read();
if (read == -1) { if (read == -1) {
throw new ExtractionException("Unable to extract text: Tika returned empty reader for " + content); throw new ExtractionException("Unable to extract text: "
+ "Tika returned empty reader for " + content);
} }
pushbackReader.unread(read); pushbackReader.unread(read);
//concatenate parsed content and meta data into a single reader. //concatenate parsed content and meta data into a single reader.
CharSource metaDataCharSource = getMetaDataCharSource(metadata); CharSource metaDataCharSource = getMetaDataCharSource(metadata);
return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream(); return CharSource.concat(new ReaderCharSource(pushbackReader),
metaDataCharSource).openStream();
} catch (TimeoutException te) { } catch (TimeoutException te) {
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName()); final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
content.getId(), content.getName());
throw new ExtractionException(msg, te); throw new ExtractionException(msg, te);
} catch (ExtractionException ex) { } catch (ExtractionException ex) {
throw ex; throw ex;
} catch (Exception ex) { } catch (Exception ex) {
tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the "
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName()); + "content" + content.getId() + ": " + content.getName(),
ex.getCause()); //NON-NLS
final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
content.getId(), content.getName());
throw new ExtractionException(msg, ex); throw new ExtractionException(msg, ex);
} finally { } finally {
future.cancel(true); future.cancel(true);
@ -246,26 +271,24 @@ final class TikaTextExtractor extends TextExtractor {
File inputFile = null; File inputFile = null;
File outputFile = null; File outputFile = null;
try { try {
//Write file to temp directory //Appending file id makes the name unique
String localDiskPath = Case.getCurrentCaseThrows().getTempDirectory() String tempFileName = file.getId() + file.getName();
+ File.separator + file.getId() + file.getName(); inputFile = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
inputFile = new File(localDiskPath); tempFileName).toFile();
ContentUtils.writeToFile(content, inputFile); ContentUtils.writeToFile(content, inputFile);
String tempOutputName = file.getId() + TESSERACT_OUTPUT_FILE_NAME;
String outputFilePath = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
tempOutputName).toString();
String executeablePath = TESSERACT_PATH.toString();
//Build tesseract commands //Build tesseract commands
ProcessBuilder process = new ProcessBuilder(); ProcessBuilder process = new ProcessBuilder();
String outputFilePath = Case.getCurrentCaseThrows().getTempDirectory()
+ File.separator + file.getId() + "output";
String executeablePath = TESSERACT_PATH.toString();
process.command(executeablePath, process.command(executeablePath,
//Source image path
String.format("\"%s\"", inputFile.getAbsolutePath()), String.format("\"%s\"", inputFile.getAbsolutePath()),
//Output path
String.format("\"%s\"", outputFilePath), String.format("\"%s\"", outputFilePath),
//language pack command flag //language pack command flag
"-l", "-l", LANGUAGE_PACKS);
LANGUAGE_PACKS);
//If the ProcessTerminator was supplied during //If the ProcessTerminator was supplied during
//configuration apply it here. //configuration apply it here.
@ -274,12 +297,9 @@ final class TikaTextExtractor extends TextExtractor {
} else { } else {
ExecUtil.execute(process); ExecUtil.execute(process);
} }
//Open an input stream on the output file to send to tika.
//Tesseract spits out a .txt file
outputFile = new File(outputFilePath + ".txt"); outputFile = new File(outputFilePath + ".txt");
//When CleanUpStream is closed, it automatically //Open a stream of the Tesseract text file and send this to Tika
//deletes the outputFile in the temp directory.
return new CleanUpStream(outputFile); return new CleanUpStream(outputFile);
} catch (NoCurrentCaseException | IOException ex) { } catch (NoCurrentCaseException | IOException ex) {
if (outputFile != null) { if (outputFile != null) {
@ -298,7 +318,6 @@ final class TikaTextExtractor extends TextExtractor {
* cancelled. * cancelled.
*/ */
private class GetTikaReader implements Callable<Reader> { private class GetTikaReader implements Callable<Reader> {
private final AutoDetectParser parser; private final AutoDetectParser parser;
private final InputStream stream; private final InputStream stream;
private final Metadata metadata; private final Metadata metadata;
@ -327,11 +346,22 @@ final class TikaTextExtractor extends TextExtractor {
private File file; private File file;
/**
* Store a reference to file on construction
*
* @param file
* @throws FileNotFoundException
*/
public CleanUpStream(File file) throws FileNotFoundException { public CleanUpStream(File file) throws FileNotFoundException {
super(file); super(file);
this.file = file; this.file = file;
} }
/**
* Delete this underlying file when close is called.
*
* @throws IOException
*/
@Override @Override
public void close() throws IOException { public void close() throws IOException {
try { try {