commit f47aac4056
Jeff Wallace 2013-10-30 10:08:33 -04:00
6 changed files with 96 additions and 53 deletions

View File

@ -11,8 +11,7 @@ correct C libraries.
STEPS:
1) Get Java Setup
1a) Download and install 32-bit version of JDK version 1.7 (32-bit is currently
needed even if you have a 64-bit system).
1a) Download and install JDK version 1.7. You can now use either 32-bit or 64-bit, but extra work is needed to compile The Sleuth Kit as 64-bit, so 32-bit is easier. (A quick way to check which JVM you installed is sketched below.)
Autopsy has been used and tested with Oracle JavaSE and the included JavaFX support
(http://www.oracle.com/technetwork/java/javase/downloads/index.html).
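As a quick sanity check, the minimal sketch below prints the JVM's architecture properties so you can confirm whether you are running 32-bit or 64-bit Java. It is a hypothetical helper, not part of the Autopsy tree, and the sun.arch.data.model property is HotSpot-specific, so treat its output as a best-effort hint.

    // JvmBitnessCheck.java -- hypothetical helper, not part of the Autopsy source tree.
    public class JvmBitnessCheck {
        public static void main(String[] args) {
            // "x86" usually indicates a 32-bit JVM; "amd64" or "x86_64" indicates 64-bit.
            System.out.println("os.arch: " + System.getProperty("os.arch"));
            // HotSpot-specific property reporting "32" or "64"; may be absent on other JVMs.
            System.out.println("data model: " + System.getProperty("sun.arch.data.model", "unknown"));
        }
    }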
@ -26,7 +25,8 @@ Note: Netbeans IDE is not required to build and run Autopsy,
but it is a recommended IDE to use for development of Autopsy modules.
1d) (optional) If you are going to package Autopsy, then you'll also
need to set JRE_HOME to the root JRE directory.
need to set JRE_HOME_32 to the root 32-bit JRE directory and/or JRE_HOME_64
to the root 64-bit JRE directory.
1e) (optional) For some Autopsy features to be functional, you need to add the java executable to the system PATH.
@ -37,6 +37,9 @@ need to set JRE_HOME to the root JRE directory.
later). All you need is the dll file. Note that you will get a
launching error if you use libewf 1.
- http://sourceforge.net/projects/libewf/
If you want to build the 64-bit version of The Sleuth Kit, download
our 64-bit version of libewf:
- https://github.com/sleuthkit/libewf_64bit
2b) Set LIBEWF_HOME environment variable to root directory of LIBEWF
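Missing or misspelled environment variables are a common source of build failures. The sketch below is a hypothetical helper (not part of the repository) that simply reports which of the variables named in steps 1d and 2b are set; extend the list with any other variables your setup uses.

    // EnvCheck.java -- hypothetical helper for sanity-checking the build environment.
    public class EnvCheck {
        public static void main(String[] args) {
            String[] vars = {"JRE_HOME_32", "JRE_HOME_64", "LIBEWF_HOME"};
            for (String name : vars) {
                String value = System.getenv(name);
                System.out.println(name + " = " + (value == null ? "<not set>" : value));
            }
        }
    }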
@ -97,13 +100,13 @@ BACKGROUND:
Here are some notes to shed some light on what is going on during
the build process.
- NetBeans uses ant to build Autopsy. The build target locates TSK
(and LIBEWF) based on the environment variables and copies the
needed JAR and library files into the DataModel module in the Autopsy
project (see build-unix.xml and build-windows.xml in the root
directory for details). If you want to use the debug version of
the TSK dll, then edit the copy line in the build-windows.xml file
to copy from the Debug folder.
- The Sleuth Kit Java datamodel JAR file has native libraries
that are copied into it.
- NetBeans uses ant to build Autopsy. The build target copies the
TSK datamodel JAR file into the project. If you want to use the
debug version of the TSK dll, then there is a different ant target
in TSK to copy the debug versions of the dlls.
- On a Windows system, the ant target copies all needed libraries
to the autopsy folder. On a Unix system, the ant target copies only

View File

@ -510,12 +510,13 @@
<!-- use image_id to easily search a specific image only -->
<field name="image_id" type="string" indexed="true" stored="false" required="true" />
<!-- Autopsy pushes text to the content field and gets the text to display from it. It is copied to other places -->
<!-- Autopsy pushes text to this field and gets the text to display from it. It is copied to other places -->
<field name="content" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
<!-- The strings field holds strings extracted from files that SolrCell doesn't support -->
<!--<field name="strings" type="text_general" indexed="true" stored="true"/>-->
<!-- NOTE: file_name gets copied later to other fields for searching -->
<field name="file_name" type="text_general" indexed="false" stored="true"/>
<field name="ctime" type="tdate" indexed="false" stored="false"/>
<field name="atime" type="tdate" indexed="false" stored="false"/>
@ -555,7 +556,7 @@
<!-- field with white-space tokenized words for TermsComponent regex search (useful for fast search of IP addresses, URLs, and certain phone numbers);
it can also be useful for Lucene-based queries containing special characters -->
<!-- populated via copyField -->
<field name="content_ws" type="text_ws" indexed="true" stored="false" />
<field name="content_ws" type="text_ws" indexed="true" stored="false" multiValued="true" />
<!-- Uncommenting the following will create a "timestamp" field using
a default value of "NOW" to indicate when each document was indexed.
@ -628,7 +629,10 @@
<copyField source="file_name" dest="text"/>
<copyField source="meta" dest="text"/>
<!--<copyField source="strings" dest="text"/>-->
<copyField source="content" dest="content_ws"/>
<copyField source="file_name" dest="content_ws"/>
<copyField source="meta" dest="content_ws"/>
<!-- Above, multiple source fields are copied to the [text] field.
Another way to map multiple source fields to the same
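For context on how the content_ws copy targets above are consumed, here is a minimal SolrJ sketch of a TermsComponent regex lookup against content_ws, in the spirit of the TermComponentQuery change later in this commit. The server URL, core path, and regex are illustrative assumptions, not values taken from Autopsy.

    // Hypothetical SolrJ sketch; assumes a Solr core reachable at the URL below.
    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.SolrServerException;
    import org.apache.solr.client.solrj.impl.HttpSolrServer;
    import org.apache.solr.client.solrj.response.QueryResponse;
    import org.apache.solr.client.solrj.response.TermsResponse;

    public class ContentWsRegexSearch {
        public static void main(String[] args) throws SolrServerException {
            HttpSolrServer server = new HttpSolrServer("http://localhost:8983/solr"); // assumed URL
            SolrQuery q = new SolrQuery();
            q.setRequestHandler("/terms");                 // TermsComponent request handler
            q.setTerms(true);
            q.addTermsField("content_ws");                 // whitespace-tokenized field defined above
            q.setTermsRegex("\\d{1,3}(\\.\\d{1,3}){3}");   // example: IPv4-looking terms
            q.setTermsLimit(50);
            QueryResponse resp = server.query(q);
            for (TermsResponse.Term t : resp.getTermsResponse().getTerms("content_ws")) {
                System.out.println(t.getTerm() + " (" + t.getFrequency() + ")");
            }
        }
    }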

View File

@ -183,6 +183,9 @@ public class Ingester {
return fsc.accept(getContentFieldsV);
}
/**
* Visitor used to create param list to send to SOLR index.
*/
private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
private SleuthkitCase curCase = null;

View File

@ -27,7 +27,6 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -44,9 +43,9 @@ import org.netbeans.api.progress.aggregate.AggregateProgressFactory;
import org.netbeans.api.progress.aggregate.AggregateProgressHandle;
import org.netbeans.api.progress.aggregate.ProgressContributor;
import org.openide.util.Cancellable;
import org.openide.util.Exceptions;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.coreutils.StopWatch;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.PipelineContext;
@ -61,8 +60,6 @@ import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.Image;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;
@ -137,8 +134,10 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
private enum IngestStatus {
TEXT_INGESTED, /// Text was extracted by knowing file type and text_ingested
STRINGS_INGESTED, ///< Strings were extracted from file
SKIPPED, ///< File was skipped for whatever reason
METADATA_INGESTED ///< No content, so we just text_ingested metadata
METADATA_INGESTED, ///< No content, so we just text_ingested metadata
SKIPPED_ERROR_INDEXING, ///< File was skipped because index engine had problems
SKIPPED_ERROR_TEXTEXTRACT, ///< File was skipped because of text extraction issues
SKIPPED_ERROR_IO ///< File was skipped because of IO issues reading it
};
private Map<Long, IngestStatus> ingestStatus;
@ -164,7 +163,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
if (initialized == false) //error initializing indexing/Solr
{
logger.log(Level.WARNING, "Skipping processing, module not initialized, file: " + abstractFile.getName());
ingestStatus.put(abstractFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
return ProcessResult.OK;
}
try {
@ -176,24 +175,25 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
logger.log(Level.SEVERE, "Error getting image id of file processed by keyword search: " + abstractFile.getName(), ex);
}
if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
//skip indexing of virtual dirs (no content, no real name) - will index children files
return ProcessResult.OK;
}
//check whether we should index meta-data only: 1) the file is known, or 2) the HashDb module errored on it
IngestModuleAbstractFile.ProcessResult hashDBResult = services.getAbstractFileModuleResult(hashDBModuleName);
//logger.log(Level.INFO, "hashdb result: " + hashDBResult + "file: " + AbstractFile.getName());
if (hashDBResult == IngestModuleAbstractFile.ProcessResult.ERROR) {
//index meta-data only
if (services.getAbstractFileModuleResult(hashDBModuleName) == IngestModuleAbstractFile.ProcessResult.ERROR) {
indexer.indexFile(abstractFile, false);
//notify any depending module that keyword search encountered (or would have encountered) an error for this file
ingestStatus.put(abstractFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(abstractFile.getId(), IngestStatus.SKIPPED_ERROR_IO);
return ProcessResult.ERROR;
} else if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
}
else if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
//index meta-data only
indexer.indexFile(abstractFile, false);
return ProcessResult.OK;
}
if (processedFiles == false) {
processedFiles = true;
}
processedFiles = true;
//check if it's time to commit after previous processing
checkRunCommitSearch();
@ -201,7 +201,6 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
//index the file and content (if the content is supported)
indexer.indexFile(abstractFile, true);
return ProcessResult.OK;
}
@ -501,7 +500,9 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
int text_ingested = 0;
int metadata_ingested = 0;
int strings_ingested = 0;
int skipped = 0;
int error_text = 0;
int error_index = 0;
int error_io = 0;
for (IngestStatus s : ingestStatus.values()) {
switch (s) {
case TEXT_INGESTED:
@ -513,8 +514,14 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
case STRINGS_INGESTED:
++strings_ingested;
break;
case SKIPPED:
++skipped;
case SKIPPED_ERROR_TEXTEXTRACT:
error_text++;
break;
case SKIPPED_ERROR_INDEXING:
error_index++;
break;
case SKIPPED_ERROR_IO:
error_io++;
break;
default:
;
@ -525,11 +532,19 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
msg.append("<table border=0><tr><td>Files with known types</td><td>").append(text_ingested).append("</td></tr>");
msg.append("<tr><td>Files with general strings extracted</td><td>").append(strings_ingested).append("</td></tr>");
msg.append("<tr><td>Metadata only was indexed</td><td>").append(metadata_ingested).append("</td></tr>");
msg.append("<tr><td>Skipped files</td><td>").append(skipped).append("</td></tr>");
msg.append("<tr><td>Error (indexer)</td><td>").append(error_index).append("</td></tr>");
msg.append("<tr><td>Error (text extraction)</td><td>").append(error_text).append("</td></tr>");
msg.append("<tr><td>Error (I/O)</td><td>").append(error_io).append("</td></tr>");
msg.append("</table>");
String indexStats = msg.toString();
logger.log(Level.INFO, "Keyword Indexing Completed: " + indexStats);
services.postMessage(IngestMessage.createMessage(++messageID, MessageType.INFO, this, "Keyword Indexing Results", indexStats));
if (error_index > 0) {
MessageNotifyUtil.Notify.error("Keyword Indexing Errors", "Keyword index service had errors ingesting " + error_index + " files.");
}
else if (error_io + error_text > 0) {
MessageNotifyUtil.Notify.warn("Keyword Indexing Warning", "Keyword index service had errors reading files and extracting text. Could have been from corrupt media or files.");
}
}
/**
@ -707,12 +722,12 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
return true;
} else {
logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").");
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
return false;
}
} catch (IngesterException ex) {
logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
return false;
}
}
@ -746,13 +761,10 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
private void indexFile(AbstractFile aFile, boolean indexContent) {
//logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());
//check its database file type
TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
if (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
//skip indexing of virtual dirs (no content, no real name) - will index children files
return;
} // unallocated and unused blocks can only have strings extracted from them.
else if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
// unallocated and unused blocks can only have strings extracted from them.
if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
extractStringsAndIndex(aFile);
}
@ -762,8 +774,9 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
try {
ingester.ingest(aFile, false); //meta-data only
ingestStatus.put(aFile.getId(), IngestStatus.METADATA_INGESTED);
} catch (IngesterException ex) {
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
}
catch (IngesterException ex) {
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex);
}
return;
@ -775,10 +788,11 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
try {
is = new ReadContentInputStream(aFile);
detectedFormat = tikaFormatDetector.detect(is, aFile.getName());
} catch (Exception e) {
}
catch (Exception e) {
logger.log(Level.WARNING, "Could not detect format using tika for file: " + aFile, e);
} finally {
}
finally {
if (is != null) {
try {
is.close();
@ -788,24 +802,33 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
}
}
}
// @@@ Add file type signature to blackboard here
//logger.log(Level.INFO, "Detected format: " + aFile.getName() + " " + detectedFormat);
// we skip archive formats that are opened by the archive module.
// @@@ We could have a check here to see if the archive module was enabled though...
if (AbstractFileExtract.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
try {
ingester.ingest(aFile, false); //meta-data only
ingestStatus.put(aFile.getId(), IngestStatus.METADATA_INGESTED);
}
catch (IngesterException ex) {
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex);
}
return;
}
boolean extractTextSupported = isTextExtractSupported(aFile, detectedFormat);
boolean wasTextAdded = false;
if (extractTextSupported) {
if (isTextExtractSupported(aFile, detectedFormat)) {
//extract text with one of the extractors, divide into chunks and index with Solr
try {
//logger.log(Level.INFO, "indexing: " + aFile.getName());
if (!extractTextAndIndex(aFile, detectedFormat)) {
logger.log(Level.WARNING, "Failed to extract text and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").");
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
} else {
ingestStatus.put(aFile.getId(), IngestStatus.TEXT_INGESTED);
wasTextAdded = true;
@ -814,11 +837,11 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
} catch (IngesterException e) {
logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", "
+ aFile.getName(), e);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
} catch (Exception e) {
logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", "
+ aFile.getName(), e);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
}
}

View File

@ -227,6 +227,7 @@ public class TermComponentQuery implements KeywordSearchQuery {
final SolrQuery q = createQuery();
q.setShowDebugInfo(DEBUG);
q.setTermsLimit(MAX_TERMS_RESULTS);
logger.log(Level.INFO, "Query: " + q.toString());
terms = executeQuery(q);
int resultSize = 0;

View File

@ -1,3 +1,12 @@
---------------- VERSION 3.0.9 --------------
Bug Fixes:
- Regular expression keyword search works on file names.
Improvements:
- Enhanced reporting on keyword search module errors
---------------- VERSION 3.0.8 --------------
Bug Fixes:
- Fixed installer bug on Windows. No other code changes.