Merge pull request #4631 from wschaeferB/4865-FixNpeInTextFileExtractor

4865 Fix NPE in TextFileExtractor
2025-07-06 21:00:22 +00:00 · 2019-03-21 09:02:55 -04:00 · 2019-03-21 09:02:55 -04:00 · 50595543dc
commit 50595543dc
parent 5d24cec81a 962bfc2467
2 changed files with 24 additions and 18 deletions
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@ -1,7 +1,7 @@
 /*
 * Autopsy Forensic Browser
 *
- * Copyright 2011-2018 Basis Technology Corp.
+ * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@ -25,7 +25,6 @@ import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.logging.Level;
-import org.openide.util.Exceptions;
 import org.openide.util.Lookup;
 import org.openide.util.NbBundle;
 import org.openide.util.NbBundle.Messages;
@ -71,9 +70,11 @@ import org.sleuthkit.datamodel.TskData.FileKnown;
    "CannotRunFileTypeDetection=Unable to run file type detection."
 })
 public final class KeywordSearchIngestModule implements FileIngestModule {
-    
-    /** generally text extractors should ignore archives and let unpacking
-     * modules take care of them */
+
+    /**
+     * generally text extractors should ignore archives and let unpacking
+     * modules take care of them
+     */
    private static final List<String> ARCHIVE_MIME_TYPES
            = ImmutableList.of(
                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
@ -108,7 +109,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                    "application/x-lzop", //NON-NLS
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS
-    
+
    /**
     * Options for this extractor
     */
@ -117,7 +118,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
        EXTRACT_UTF8, ///< extract UTF8 text, true/false
    };

-
    enum UpdateFrequency {

        FAST(20),
@ -290,15 +290,15 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                }
            }
        }
-        
+
        StringsConfig stringsConfig = new StringsConfig();
        Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
        stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
        stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
        stringsConfig.setLanguageScripts(KeywordSearchSettings.getStringExtractScripts());
-        
+
        stringsExtractionContext = Lookups.fixed(stringsConfig);
-        
+
        indexer = new Indexer();
        initialized = true;
    }
@ -482,12 +482,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
            imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
            ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
            Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
-            
+
            try {
-                TextExtractor extractor = TextExtractorFactory.getExtractor(aFile,extractionContext);
+                TextExtractor extractor = TextExtractorFactory.getExtractor(aFile, extractionContext);
                Reader extractedTextReader = extractor.getReader();
                //divide into chunks and index
-                return Ingester.getDefault().indexText(extractedTextReader,aFile.getId(),aFile.getName(), aFile, context);
+                return Ingester.getDefault().indexText(extractedTextReader, aFile.getId(), aFile.getName(), aFile, context);
            } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
                //No text extractor found... run the default instead
                return false;
@ -509,7 +509,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                }
                TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(aFile, stringsExtractionContext);
                Reader extractedTextReader = stringsExtractor.getReader();
-                if (Ingester.getDefault().indexText(extractedTextReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
+                if (Ingester.getDefault().indexText(extractedTextReader, aFile.getId(), aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
                    return true;
                } else {
@ -619,12 +619,16 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                try {
                    TextFileExtractor textFileExtractor = new TextFileExtractor();
                    Reader textReader = textFileExtractor.getReader(aFile);
-                    if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
+                    if (textReader == null) {
+                        logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
+                    } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
                        putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                        wasTextAdded = true;
                    }
-                } catch (IngesterException | TextFileExtractorException ex) {
+                } catch (IngesterException ex) {
                    logger.log(Level.WARNING, "Unable to index as unicode", ex);
+                } catch (TextFileExtractorException ex) {
+                    logger.log(Level.INFO, "Could not extract text with TextFileExtractor", ex);
                }
            }

--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
@ -1,7 +1,7 @@
 /*
 * Autopsy Forensic Browser
 *
- * Copyright 2018 Basis Technology Corp.
+ * Copyright 2018-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@ -48,7 +48,9 @@ final class TextFileExtractor {
            throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
        }
        CharsetMatch match = detector.detect();
-        if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
+        if (match == null) {
+            throw new TextFileExtractorException("Unable to detect any matches using TextFileExtractor");
+        } else if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
            throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
        }