Further refactoring

This commit is contained in:
Ethan Roseman 2019-11-21 10:42:17 +09:00
parent 5f144cdbc6
commit a914e4b76e
4 changed files with 22 additions and 23 deletions

View File

@ -18,21 +18,10 @@
*/
package org.sleuthkit.autopsy.textextractors;
import com.ethteck.decodetect.core.Decodetect;
import com.ethteck.decodetect.core.DecodetectResult;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.openide.util.Lookup;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ReadContentInputStream;
/**
* Extracts the text out of Content instances and exposes them as a Reader.

View File

@ -89,6 +89,7 @@ public class TextExtractorFactory {
*/
private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
List<TextExtractor> fileExtractors = Arrays.asList(
new TextFileExtractor(content),
new HtmlTextExtractor(content),
new SqliteTextExtractor(content),
new TikaTextExtractor(content));

View File

@ -35,9 +35,9 @@ import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ReadContentInputStream;
/**
* Extract text from .txt files
* Extract text from text files
*/
public final class TextFileExtractor {
public final class TextFileExtractor implements TextExtractor {
public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
@Override
public boolean contains(Charset cs) {
@ -55,18 +55,29 @@ public final class TextFileExtractor {
}
};
public Reader getReader(AbstractFile source) throws TextFileExtractorException {
Charset encoding = getEncoding(source);
private final AbstractFile file;
/**
 * Creates an extractor bound to a single file.
 *
 * @param file the file whose encoding is detected and whose content is
 *             read by this extractor's getReader methods
 */
public TextFileExtractor(AbstractFile file) {
this.file = file;
}
public Reader getReader() {
Charset encoding = getEncoding(file);
if (encoding == UNKNOWN_CHARSET) {
encoding = StandardCharsets.UTF_8;
}
return getReader(source, encoding);
return getReader(encoding);
}
public Reader getReader(AbstractFile source, Charset encoding) throws TextFileExtractorException {
return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(source)), encoding);
public Reader getReader(Charset encoding) {
return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
}
/**
 * Reports whether this extractor can handle the file it was created for.
 *
 * @return true only when the file's MIME type is exactly "text/plain";
 *         false otherwise, including when no MIME type is available
 */
@Override
public boolean isSupported() {
    // Constant-first comparison: if getMIMEType() returns null (presumably
    // when the type has not been detected yet; confirm against the
    // datamodel docs), report "not supported" rather than throwing a
    // NullPointerException out of a capability probe.
    return "text/plain".equals(file.getMIMEType());
}
public class TextFileExtractorException extends Exception {
public TextFileExtractorException(String msg, Throwable ex) {
super(msg, ex);

View File

@ -682,8 +682,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
*/
private boolean indexTextFile(AbstractFile aFile) {
try {
TextFileExtractor textFileExtractor = new TextFileExtractor();
Reader textReader = textFileExtractor.getReader(aFile);
TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
Reader textReader = textFileExtractor.getReader();
if (textReader == null) {
logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
} else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
@ -692,8 +692,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
}
} catch (IngesterException ex) {
logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
} catch (TextFileExtractorException ex) {
logger.log(Level.INFO, "Could not extract text with TextFileExtractor", ex);
}
return false;
}