From 5d46793d45572d81d07ee0c755f7ac6f3b42a27f Mon Sep 17 00:00:00 2001 From: Ethan Roseman Date: Wed, 14 Aug 2019 17:32:35 -0400 Subject: [PATCH] cleanup --- .../KeywordSearchIngestModule.java | 17 ++++++++--------- .../keywordsearch/TextFileExtractor.java | 11 ++++++++--- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java index 119deb8a71..131145943d 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java @@ -628,6 +628,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule { boolean wasTextAdded = false; + Charset decodetectCharset = null; //extract text with one of the extractors, divide into chunks and index with Solr try { //logger.log(Level.INFO, "indexing: " + aFile.getName()); @@ -638,12 +639,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule { extractStringsAndIndex(aFile); return; } - if (fileType.equals(MimeTypes.PLAIN_TEXT)) { - Charset detectedCharset = TextExtractor.getDecodetectCharset(aFile); - if (detectedCharset != null) { - indexTextFile(aFile); - return; - } + decodetectCharset = TextExtractor.getDecodetectCharset(aFile); + if (fileType.equals(MimeTypes.PLAIN_TEXT) && decodetectCharset != null) { + indexTextFile(aFile, decodetectCharset); + return; } if (!extractTextAndIndex(aFile)) { // Text extractor not found for file. Extract string only. @@ -666,7 +665,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule { if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) { //Carved Files should be the only type of unallocated files capable of a txt extension and //should be ignored by the TextFileExtractor because they may contain more than one text encoding - wasTextAdded = indexTextFile(aFile); + wasTextAdded = indexTextFile(aFile, decodetectCharset); } // if it wasn't supported or had an error, default to strings @@ -675,9 +674,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule { } } - private boolean indexTextFile(AbstractFile aFile) { + private boolean indexTextFile(AbstractFile aFile, Charset detectedCharset) { try { - TextFileExtractor textFileExtractor = new TextFileExtractor(); + TextFileExtractor textFileExtractor = new TextFileExtractor(detectedCharset); Reader textReader = textFileExtractor.getReader(aFile); if (textReader == null) { logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName()); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java index 1ea5e4d3ed..139645dab7 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java @@ -42,12 +42,17 @@ final class TextFileExtractor { //files while hopefully working on all files with a valid text encoding static final private int MIN_MATCH_CONFIDENCE = 20; + private final Charset detectedCharset; + + TextFileExtractor(Charset detectedCharset) { + this.detectedCharset = detectedCharset; + } + public Reader getReader(AbstractFile source) throws TextFileExtractorException { String mimeType = source.getMIMEType(); if (mimeType.equals(MimeTypes.PLAIN_TEXT)) { - Charset decodetectCharset = TextExtractor.getDecodetectCharset(source); - if (decodetectCharset != null) { - return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(source)), decodetectCharset); + if (detectedCharset != null) { + return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(source)), detectedCharset); } }