Merge pull request #4031 from sleuthkit/release-4.8.0

Merge in updated release 4.8.0
2025-07-06 21:00:22 +00:00 · 2018-08-08 14:49:25 -04:00 · 2018-08-08 14:49:25 -04:00 · 7983b872e3
commit 7983b872e3
parent f672bf7740 3372c38afc
3 changed files with 29 additions and 29 deletions
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@ -570,7 +570,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
            }

-            if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt"))) {
+            if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
+                //Carved Files should be the only type of unallocated files capable of a txt extension and 
+                //should be ignored by the TextFileExtractor because they may contain more than one text encoding
                try {
                    if (Ingester.getDefault().indexText(txtFileExtractor, aFile, context)) {
                        putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextFileExtractor.java
@ -17,8 +17,9 @@
 * limitations under the License.
 */
 package org.sleuthkit.autopsy.keywordsearch;
-
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.BufferedInputStream;
 import java.io.Reader;
 import java.util.logging.Level;
 import org.apache.tika.parser.txt.CharsetDetector;
@ -53,15 +54,16 @@ final class TextFileExtractor extends ContentTextExtractor {
    @Override
    public Reader getReader(Content source) throws TextExtractorException {
        CharsetDetector detector = new CharsetDetector();
-        ReadContentInputStream stream = new ReadContentInputStream(source);
+        //wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
+        InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
        try {
            detector.setText(stream);
        } catch (IOException ex) {
-            throw new TextExtractorException("Unable to get string from detected text in UnicodeTextExtractor", ex);
+            throw new TextExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
        }
        CharsetMatch match = detector.detect();
        if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
-            throw new TextExtractorException("Text does not match any character set with a high enough confidence for UnicodeTextExtractor");
+            throw new TextExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
        }

        return match.getReader();
--- a/NEWS.txt
+++ b/NEWS.txt
@ -1,29 +1,25 @@
 ---------------- VERSION 4.8.0  --------------
 New Features:
- The case tree view can now be grouped by data source.
- Added a common files search tool that finds all instances of a file in a case.
- Text extraction optionally includes optical character recognition (OCR).
- Data source(s) filter added to ad hoc keyword search and file search by 
-attributes.
- SQLite tables can be now be exported to CSV files.
- User defined tags now appear first in tagging menus.
- Eliminated one tagging sub menu layer for faster tagging.
- Added Replace Tag item to tagging menus (shortcut for delete tag, add tag).
- The Other Occurrences content viewer now shows matches in the current case.
- A listing of cases in the central repository is displayed by the 
-central repository options panel.
- An interesting file artifact is now created when a "zip bomb" is detected.
- Text and queries sent to Solr are now normalized to handle diacritics, 
-ligatures, narrow and wide width Japanese characters, etc.
- An object detection ingest module that uses OpenCV and user-supplied
-classifiers has been added to the "experimental" Net Beans Module (NBM).
- A data source processor that runs Volatility on a memory image has been
-added to the "experimental" NBM.
- Comments can be added to all files (file correlation properties) recorded 
-in the central repository using a results view context menu item.
- Comments can be added to all correlation properties recorded 
-in the central repository using an Other Occurrences results content viewer 
-context menu item.
+- Data Source Grouping:
+-- The case tree view can now be grouped by data source.
+-- Keyword and file search can now be restricted to a data source.
+- Central Repository / Correlation:
+-- New common files search feature that finds files that exist in multiple devices in the same case. 
+-- The Other Occurrences content viewer now shows matches in the current case (in addition to central repository).
+-- Central repository options panel now shows the cases that are in the repository.
+- A comment about a file can be created and saved in the central repository so that future cases can see it.
+- Keyword Search:
+-- Can enable OCR text extraction of PDF and JPG files using Tesseract. 
+-- Keyword search module normalizes Unicode text. 
+-- Keyword search module uses ICU to convert text files that do not have a BOM.
+- Tagging: 
+-- Tagging menu now lists user-defined tags at the top, and "quick tag" removes one level of menus.
+-- New "Replace Tag" feature to change the tag on an item.
+- Other:
+-- SQLite tables can now be exported to CSV files.
+-- An interesting file artifact is now created when a "zip bomb" is detected.
+-- An object detection ingest module was added to the Experimental module. It requires an OpenCV trained model. 
+

 Bug Fixes:
 - Expanding the case tree is more efficient.