cleanup

2025-07-17 10:17:41 +00:00 · 2019-08-14 17:32:35 -04:00 · 2019-08-14 17:32:35 -04:00 · 5d46793d45
commit 5d46793d45
parent f8545851e4
2 changed files with 16 additions and 12 deletions
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -628,6 +628,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {

            boolean wasTextAdded = false;

+            Charset decodetectCharset = null;
            //extract text with one of the extractors, divide into chunks and index with Solr
            try {
                //logger.log(Level.INFO, "indexing: " + aFile.getName());
@@ -638,13 +639,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                    extractStringsAndIndex(aFile);
                    return;
                }
-                if (fileType.equals(MimeTypes.PLAIN_TEXT)) {
-                    Charset detectedCharset = TextExtractor.getDecodetectCharset(aFile);
-                    if (detectedCharset != null) {
-                        indexTextFile(aFile);
+                decodetectCharset = TextExtractor.getDecodetectCharset(aFile);
+                if (fileType.equals(MimeTypes.PLAIN_TEXT) && decodetectCharset != null) {
+                    indexTextFile(aFile, decodetectCharset);
                    return;
                }
-                }
                if (!extractTextAndIndex(aFile)) {
                    // Text extractor not found for file. Extract string only.
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
@@ -666,7 +665,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
            if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
                //Carved Files should be the only type of unallocated files capable of a txt extension and 
                //should be ignored by the TextFileExtractor because they may contain more than one text encoding
-                wasTextAdded = indexTextFile(aFile);
+                wasTextAdded = indexTextFile(aFile, decodetectCharset);
            }

            // if it wasn't supported or had an error, default to strings
@@ -675,9 +674,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
            }
        }

-        private boolean indexTextFile(AbstractFile aFile) {
+        private boolean indexTextFile(AbstractFile aFile, Charset detectedCharset) {
            try {
-                TextFileExtractor textFileExtractor = new TextFileExtractor();
+                TextFileExtractor textFileExtractor = new TextFileExtractor(detectedCharset);
                Reader textReader = textFileExtractor.getReader(aFile);
                if (textReader == null) {
                    logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
@@ -42,12 +42,17 @@ final class TextFileExtractor {
    //files while hopefully working on all files with a valid text encoding
    static final private int MIN_MATCH_CONFIDENCE = 20;

+    private final Charset detectedCharset;
+
+    TextFileExtractor(Charset detectedCharset) {
+        this.detectedCharset = detectedCharset;
+    }
+
    public Reader getReader(AbstractFile source) throws TextFileExtractorException {
        String mimeType = source.getMIMEType();
        if (mimeType.equals(MimeTypes.PLAIN_TEXT)) {
-            Charset decodetectCharset = TextExtractor.getDecodetectCharset(source);
-            if (decodetectCharset != null) {
-                return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(source)), decodetectCharset);
+            if (detectedCharset != null) {
+                return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(source)), detectedCharset);
            }
        }