From a914e4b76e0e3981af00d11da182957a18c47e7f Mon Sep 17 00:00:00 2001 From: Ethan Roseman Date: Thu, 21 Nov 2019 10:42:17 +0900 Subject: [PATCH] Further refactoring --- .../autopsy/textextractors/TextExtractor.java | 11 -------- .../textextractors/TextExtractorFactory.java | 1 + .../textextractors/TextFileExtractor.java | 27 +++++++++++++------ .../KeywordSearchIngestModule.java | 6 ++--- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java index 294e6b41af..5e9d99e065 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractor.java @@ -18,21 +18,10 @@ */ package org.sleuthkit.autopsy.textextractors; -import com.ethteck.decodetect.core.Decodetect; -import com.ethteck.decodetect.core.DecodetectResult; -import java.io.BufferedInputStream; -import java.io.IOException; -import java.io.InputStream; import java.io.Reader; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CharsetEncoder; import java.util.Collections; -import java.util.List; import java.util.Map; import org.openide.util.Lookup; -import org.sleuthkit.datamodel.Content; -import org.sleuthkit.datamodel.ReadContentInputStream; /** * Extracts the text out of Content instances and exposes them as a Reader. diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java index 2c8316ba60..ff0ba51dd1 100755 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java @@ -89,6 +89,7 @@ public class TextExtractorFactory { */ private static List getFileExtractors(AbstractFile content, Lookup context) { List fileExtractors = Arrays.asList( + new TextFileExtractor(content), new HtmlTextExtractor(content), new SqliteTextExtractor(content), new TikaTextExtractor(content)); diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java index af24f94165..b0e77eb9ba 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TextFileExtractor.java @@ -35,9 +35,9 @@ import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.ReadContentInputStream; /** - * Extract text from .txt files + * Extract text from text files */ -public final class TextFileExtractor { +public final class TextFileExtractor implements TextExtractor { public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) { @Override public boolean contains(Charset cs) { @@ -55,18 +55,29 @@ public final class TextFileExtractor { } }; - public Reader getReader(AbstractFile source) throws TextFileExtractorException { - Charset encoding = getEncoding(source); + private final AbstractFile file; + + public TextFileExtractor(AbstractFile file) { + this.file = file; + } + + public Reader getReader() { + Charset encoding = getEncoding(file); if (encoding == UNKNOWN_CHARSET) { encoding = StandardCharsets.UTF_8; } - return getReader(source, encoding); + return getReader(encoding); } - public Reader getReader(AbstractFile source, Charset encoding) throws TextFileExtractorException { - return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(source)), encoding); + public Reader getReader(Charset encoding) { + return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding); } - + + @Override + public boolean isSupported() { + return file.getMIMEType().equals("text/plain"); + } + public class TextFileExtractorException extends Exception { public TextFileExtractorException(String msg, Throwable ex) { super(msg, ex); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java index 50788306a2..67e6d5076c 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java @@ -682,8 +682,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule { */ private boolean indexTextFile(AbstractFile aFile) { try { - TextFileExtractor textFileExtractor = new TextFileExtractor(); - Reader textReader = textFileExtractor.getReader(aFile); + TextFileExtractor textFileExtractor = new TextFileExtractor(aFile); + Reader textReader = textFileExtractor.getReader(); if (textReader == null) { logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName()); } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) { @@ -692,8 +692,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule { } } catch (IngesterException ex) { logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex); - } catch (TextFileExtractorException ex) { - logger.log(Level.INFO, "Could not extract text with TextFileExtractor", ex); } return false; }