mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-17 18:17:43 +00:00
Did code review suggestions and fixed Codacy stuff
This commit is contained in:
parent
54bc85b829
commit
a965b50b16
@ -120,7 +120,7 @@ final class TikaTextExtractor extends TextExtractor {
|
|||||||
|
|
||||||
private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
|
private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
|
||||||
|
|
||||||
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
|
private final ExecutorService executorService = Executors.newSingleThreadExecutor();
|
||||||
private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
|
private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
|
||||||
|
|
||||||
private final AutoDetectParser parser = new AutoDetectParser();
|
private final AutoDetectParser parser = new AutoDetectParser();
|
||||||
@ -132,6 +132,7 @@ final class TikaTextExtractor extends TextExtractor {
|
|||||||
private static final File TESSERACT_PATH = locateTesseractExecutable();
|
private static final File TESSERACT_PATH = locateTesseractExecutable();
|
||||||
private static final String LANGUAGE_PACKS = getLanguagePacks();
|
private static final String LANGUAGE_PACKS = getLanguagePacks();
|
||||||
private ProcessTerminator processTerminator;
|
private ProcessTerminator processTerminator;
|
||||||
|
private static final String TESSERACT_OUTPUT_FILE_NAME = "output";
|
||||||
|
|
||||||
private static final List<String> TIKA_SUPPORTED_TYPES
|
private static final List<String> TIKA_SUPPORTED_TYPES
|
||||||
= new Tika().getParser().getSupportedTypes(new ParseContext())
|
= new Tika().getParser().getSupportedTypes(new ParseContext())
|
||||||
@ -143,6 +144,18 @@ final class TikaTextExtractor extends TextExtractor {
|
|||||||
this.content = content;
|
this.content = content;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If Tesseract has been installed and is set to be used through
|
||||||
|
* configuration, then ocr is enabled. OCR can only currently be run on
|
||||||
|
* Windows OS.
|
||||||
|
*
|
||||||
|
* @return Flag indicating if OCR is set to be used.
|
||||||
|
*/
|
||||||
|
private boolean ocrEnabled() {
|
||||||
|
return TESSERACT_PATH != null && tesseractOCREnabled
|
||||||
|
&& PlatformUtil.isWindowsOS() == true;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a reader that will iterate over the text extracted from Apache
|
* Returns a reader that will iterate over the text extracted from Apache
|
||||||
* Tika.
|
* Tika.
|
||||||
@ -156,12 +169,48 @@ final class TikaTextExtractor extends TextExtractor {
|
|||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public Reader getReader() throws ExtractionException {
|
public Reader getReader() throws ExtractionException {
|
||||||
InputStream stream = new ReadContentInputStream(content);
|
InputStream stream = null;
|
||||||
|
|
||||||
Metadata metadata = new Metadata();
|
|
||||||
ParseContext parseContext = new ParseContext();
|
ParseContext parseContext = new ParseContext();
|
||||||
parseContext.set(Parser.class, parser);
|
parseContext.set(Parser.class, parser);
|
||||||
|
|
||||||
|
if (ocrEnabled() && content instanceof AbstractFile) {
|
||||||
|
AbstractFile file = ((AbstractFile) content);
|
||||||
|
//Run OCR on images with Tesseract directly.
|
||||||
|
if (file.getMIMEType().toLowerCase().startsWith("image/")) {
|
||||||
|
stream = runOcrAndGetOutputStream(file);
|
||||||
|
} else {
|
||||||
|
//Otherwise, go through Tika for PDFs so that it can
|
||||||
|
//extract images and run Tesseract on them.
|
||||||
|
PDFParserConfig pdfConfig = new PDFParserConfig();
|
||||||
|
|
||||||
|
// Extracting the inline images and letting Tesseract run on each inline image.
|
||||||
|
// https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
|
||||||
|
// https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
|
||||||
|
pdfConfig.setExtractInlineImages(true);
|
||||||
|
// Multiple pages within a PDF file might refer to the same underlying image.
|
||||||
|
pdfConfig.setExtractUniqueInlineImagesOnly(true);
|
||||||
|
parseContext.set(PDFParserConfig.class, pdfConfig);
|
||||||
|
|
||||||
|
// Configure Tesseract parser to perform OCR
|
||||||
|
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
|
||||||
|
String tesseractFolder = TESSERACT_PATH.getParent();
|
||||||
|
ocrConfig.setTesseractPath(tesseractFolder);
|
||||||
|
/*
|
||||||
|
* Tesseract expects language data packs to be in a
|
||||||
|
* subdirectory of tesseractFolder, in a folder called
|
||||||
|
* "tessdata". If they are stored somewhere else, use
|
||||||
|
* ocrConfig.setTessdataPath(String tessdataPath) to point
|
||||||
|
* to them
|
||||||
|
*/
|
||||||
|
ocrConfig.setLanguage(LANGUAGE_PACKS);
|
||||||
|
parseContext.set(TesseractOCRConfig.class, ocrConfig);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
stream = new ReadContentInputStream(content);
|
||||||
|
}
|
||||||
|
|
||||||
|
Metadata metadata = new Metadata();
|
||||||
// Use the more memory efficient Tika SAX parsers for DOCX and
|
// Use the more memory efficient Tika SAX parsers for DOCX and
|
||||||
// PPTX files (it already uses SAX for XLSX).
|
// PPTX files (it already uses SAX for XLSX).
|
||||||
OfficeParserConfig officeParserConfig = new OfficeParserConfig();
|
OfficeParserConfig officeParserConfig = new OfficeParserConfig();
|
||||||
@ -169,63 +218,39 @@ final class TikaTextExtractor extends TextExtractor {
|
|||||||
officeParserConfig.setUseSAXDocxExtractor(true);
|
officeParserConfig.setUseSAXDocxExtractor(true);
|
||||||
parseContext.set(OfficeParserConfig.class, officeParserConfig);
|
parseContext.set(OfficeParserConfig.class, officeParserConfig);
|
||||||
|
|
||||||
//If Tesseract has been and installed and is set to be used....
|
|
||||||
if (TESSERACT_PATH != null && tesseractOCREnabled && PlatformUtil.isWindowsOS() == true) {
|
|
||||||
if (content instanceof AbstractFile) {
|
|
||||||
AbstractFile file = ((AbstractFile) content);
|
|
||||||
//Run OCR on images with Tesseract directly.
|
|
||||||
//Reassign the stream we will send to Tika to point to the
|
|
||||||
//output file produced by Tesseract.
|
|
||||||
if (file.getMIMEType().toLowerCase().contains("image")) {
|
|
||||||
stream = runOcrAndGetOutputStream(file);
|
|
||||||
} else {
|
|
||||||
//Otherwise, go through Tika for PDFs so that it can
|
|
||||||
//extract images and run Tesseract on them.
|
|
||||||
PDFParserConfig pdfConfig = new PDFParserConfig();
|
|
||||||
|
|
||||||
// Extracting the inline images and letting Tesseract run on each inline image.
|
|
||||||
// https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
|
|
||||||
// https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
|
|
||||||
pdfConfig.setExtractInlineImages(true);
|
|
||||||
// Multiple pages within a PDF file might refer to the same underlying image.
|
|
||||||
pdfConfig.setExtractUniqueInlineImagesOnly(true);
|
|
||||||
parseContext.set(PDFParserConfig.class, pdfConfig);
|
|
||||||
|
|
||||||
// Configure Tesseract parser to perform OCR
|
|
||||||
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
|
|
||||||
String tesseractFolder = TESSERACT_PATH.getParent();
|
|
||||||
ocrConfig.setTesseractPath(tesseractFolder);
|
|
||||||
// Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata".
|
|
||||||
// If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them
|
|
||||||
ocrConfig.setLanguage(LANGUAGE_PACKS);
|
|
||||||
parseContext.set(TesseractOCRConfig.class, ocrConfig);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//Make the creation of a TikaReader a cancellable future in case it takes too long
|
//Make the creation of a TikaReader a cancellable future in case it takes too long
|
||||||
Future<Reader> future = tikaParseExecutor.submit(new GetTikaReader(parser, stream, metadata, parseContext));
|
Future<Reader> future = executorService.submit(
|
||||||
|
new GetTikaReader(parser, stream, metadata, parseContext));
|
||||||
try {
|
try {
|
||||||
final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
|
final Reader tikaReader = future.get(getTimeout(content.getSize()),
|
||||||
|
TimeUnit.SECONDS);
|
||||||
//check if the reader is empty
|
//check if the reader is empty
|
||||||
PushbackReader pushbackReader = new PushbackReader(tikaReader);
|
PushbackReader pushbackReader = new PushbackReader(tikaReader);
|
||||||
int read = pushbackReader.read();
|
int read = pushbackReader.read();
|
||||||
if (read == -1) {
|
if (read == -1) {
|
||||||
throw new ExtractionException("Unable to extract text: Tika returned empty reader for " + content);
|
throw new ExtractionException("Unable to extract text: "
|
||||||
|
+ "Tika returned empty reader for " + content);
|
||||||
}
|
}
|
||||||
pushbackReader.unread(read);
|
pushbackReader.unread(read);
|
||||||
|
|
||||||
//concatenate parsed content and meta data into a single reader.
|
//concatenate parsed content and meta data into a single reader.
|
||||||
CharSource metaDataCharSource = getMetaDataCharSource(metadata);
|
CharSource metaDataCharSource = getMetaDataCharSource(metadata);
|
||||||
return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
|
return CharSource.concat(new ReaderCharSource(pushbackReader),
|
||||||
|
metaDataCharSource).openStream();
|
||||||
} catch (TimeoutException te) {
|
} catch (TimeoutException te) {
|
||||||
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
|
final String msg = NbBundle.getMessage(this.getClass(),
|
||||||
|
"AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
|
||||||
|
content.getId(), content.getName());
|
||||||
throw new ExtractionException(msg, te);
|
throw new ExtractionException(msg, te);
|
||||||
} catch (ExtractionException ex) {
|
} catch (ExtractionException ex) {
|
||||||
throw ex;
|
throw ex;
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
|
tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the "
|
||||||
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
|
+ "content" + content.getId() + ": " + content.getName(),
|
||||||
|
ex.getCause()); //NON-NLS
|
||||||
|
final String msg = NbBundle.getMessage(this.getClass(),
|
||||||
|
"AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
|
||||||
|
content.getId(), content.getName());
|
||||||
throw new ExtractionException(msg, ex);
|
throw new ExtractionException(msg, ex);
|
||||||
} finally {
|
} finally {
|
||||||
future.cancel(true);
|
future.cancel(true);
|
||||||
@ -246,26 +271,24 @@ final class TikaTextExtractor extends TextExtractor {
|
|||||||
File inputFile = null;
|
File inputFile = null;
|
||||||
File outputFile = null;
|
File outputFile = null;
|
||||||
try {
|
try {
|
||||||
//Write file to temp directory
|
//Appending file id makes the name unique
|
||||||
String localDiskPath = Case.getCurrentCaseThrows().getTempDirectory()
|
String tempFileName = file.getId() + file.getName();
|
||||||
+ File.separator + file.getId() + file.getName();
|
inputFile = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
|
||||||
inputFile = new File(localDiskPath);
|
tempFileName).toFile();
|
||||||
ContentUtils.writeToFile(content, inputFile);
|
ContentUtils.writeToFile(content, inputFile);
|
||||||
|
|
||||||
|
String tempOutputName = file.getId() + TESSERACT_OUTPUT_FILE_NAME;
|
||||||
|
String outputFilePath = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
|
||||||
|
tempOutputName).toString();
|
||||||
|
String executeablePath = TESSERACT_PATH.toString();
|
||||||
|
|
||||||
//Build tesseract commands
|
//Build tesseract commands
|
||||||
ProcessBuilder process = new ProcessBuilder();
|
ProcessBuilder process = new ProcessBuilder();
|
||||||
String outputFilePath = Case.getCurrentCaseThrows().getTempDirectory()
|
|
||||||
+ File.separator + file.getId() + "output";
|
|
||||||
|
|
||||||
String executeablePath = TESSERACT_PATH.toString();
|
|
||||||
process.command(executeablePath,
|
process.command(executeablePath,
|
||||||
//Source image path
|
|
||||||
String.format("\"%s\"", inputFile.getAbsolutePath()),
|
String.format("\"%s\"", inputFile.getAbsolutePath()),
|
||||||
//Output path
|
|
||||||
String.format("\"%s\"", outputFilePath),
|
String.format("\"%s\"", outputFilePath),
|
||||||
//language pack command flag
|
//language pack command flag
|
||||||
"-l",
|
"-l", LANGUAGE_PACKS);
|
||||||
LANGUAGE_PACKS);
|
|
||||||
|
|
||||||
//If the ProcessTerminator was supplied during
|
//If the ProcessTerminator was supplied during
|
||||||
//configuration apply it here.
|
//configuration apply it here.
|
||||||
@ -274,12 +297,9 @@ final class TikaTextExtractor extends TextExtractor {
|
|||||||
} else {
|
} else {
|
||||||
ExecUtil.execute(process);
|
ExecUtil.execute(process);
|
||||||
}
|
}
|
||||||
|
|
||||||
//Open an input stream on the output file to send to tika.
|
|
||||||
//Tesseract spits out a .txt file
|
|
||||||
outputFile = new File(outputFilePath + ".txt");
|
outputFile = new File(outputFilePath + ".txt");
|
||||||
//When CleanUpStream is closed, it automatically
|
//Open a stream of the Tesseract text file and send this to Tika
|
||||||
//deletes the outputFile in the temp directory.
|
|
||||||
return new CleanUpStream(outputFile);
|
return new CleanUpStream(outputFile);
|
||||||
} catch (NoCurrentCaseException | IOException ex) {
|
} catch (NoCurrentCaseException | IOException ex) {
|
||||||
if (outputFile != null) {
|
if (outputFile != null) {
|
||||||
@ -298,7 +318,6 @@ final class TikaTextExtractor extends TextExtractor {
|
|||||||
* cancelled.
|
* cancelled.
|
||||||
*/
|
*/
|
||||||
private class GetTikaReader implements Callable<Reader> {
|
private class GetTikaReader implements Callable<Reader> {
|
||||||
|
|
||||||
private final AutoDetectParser parser;
|
private final AutoDetectParser parser;
|
||||||
private final InputStream stream;
|
private final InputStream stream;
|
||||||
private final Metadata metadata;
|
private final Metadata metadata;
|
||||||
@ -327,11 +346,22 @@ final class TikaTextExtractor extends TextExtractor {
|
|||||||
|
|
||||||
private File file;
|
private File file;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Store a reference to file on construction
|
||||||
|
*
|
||||||
|
* @param file
|
||||||
|
* @throws FileNotFoundException
|
||||||
|
*/
|
||||||
public CleanUpStream(File file) throws FileNotFoundException {
|
public CleanUpStream(File file) throws FileNotFoundException {
|
||||||
super(file);
|
super(file);
|
||||||
this.file = file;
|
this.file = file;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delete this underlying file when close is called.
|
||||||
|
*
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
try {
|
try {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user