commit f47aac4056
Jeff Wallace 2013-10-30 10:08:33 -04:00
6 changed files with 96 additions and 53 deletions

View File

@ -11,8 +11,7 @@ correct C libraries.
STEPS:
1) Get Java Setup
1a) Download and install 32-bit version of JDK version 1.7 (32-bit is currently
needed even if you have a 64-bit system).
1a) Download and install JDK version 1.7. You can now use either 32-bit or 64-bit, but extra work is needed to compile The Sleuth Kit as 64-bit, so 32-bit is easier. (A quick way to check which JVM you installed is sketched below.)
Autopsy has been used and tested with Oracle JavaSE and the included JavaFX support
(http://www.oracle.com/technetwork/java/javase/downloads/index.html).
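As a quick sanity check, the minimal sketch below prints the JVM's architecture properties so you can confirm whether you are running 32-bit or 64-bit Java. It is a hypothetical helper, not part of the Autopsy tree, and the sun.arch.data.model property is HotSpot-specific, so treat its output as a best-effort hint.

    // JvmBitnessCheck.java -- hypothetical helper, not part of the Autopsy source tree.
    public class JvmBitnessCheck {
        public static void main(String[] args) {
            // "x86" usually indicates a 32-bit JVM; "amd64" or "x86_64" indicates 64-bit.
            System.out.println("os.arch: " + System.getProperty("os.arch"));
            // HotSpot-specific property reporting "32" or "64"; may be absent on other JVMs.
            System.out.println("data model: " + System.getProperty("sun.arch.data.model", "unknown"));
        }
    }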
@ -26,7 +25,8 @@ Note: Netbeans IDE is not required to build and run Autopsy,
but it is a recommended IDE to use for development of Autopsy modules.
1d) (optional) If you are going to package Autopsy, then you'll also
need to set JRE_HOME to the root JRE directory.
need to set JRE_HOME_32 to the root 32-bit JRE directory and/or JRE_HOME_64
to the root 64-bit JRE directory.
1e) (optional) For some Autopsy features to be functional, you need to add the java executable to the system PATH.
@ -37,6 +37,9 @@ need to set JRE_HOME to the root JRE directory.
later). All you need is the dll file. Note that you will get a
launching error if you use libewf 1.
- http://sourceforge.net/projects/libewf/
If you want to build the 64-bit version of The Sleuth Kit, download
our 64-bit version of libewf:
- https://github.com/sleuthkit/libewf_64bit
2b) Set LIBEWF_HOME environment variable to root directory of LIBEWF
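Missing or misspelled environment variables are a common source of build failures. The sketch below is a hypothetical helper (not part of the repository) that simply reports which of the variables named in steps 1d and 2b are set; extend the list with any other variables your setup uses.

    // EnvCheck.java -- hypothetical helper for sanity-checking the build environment.
    public class EnvCheck {
        public static void main(String[] args) {
            String[] vars = {"JRE_HOME_32", "JRE_HOME_64", "LIBEWF_HOME"};
            for (String name : vars) {
                String value = System.getenv(name);
                System.out.println(name + " = " + (value == null ? "<not set>" : value));
            }
        }
    }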
@ -97,13 +100,13 @@ BACKGROUND:
Here are some notes to shed some light on what is going on during
the build process.
- NetBeans uses ant to build Autopsy. The build target locates TSK
(and LIBEWF) based on the environment variables and copies the
needed JAR and library files into the DataModel module in the Autopsy
project (see build-unix.xml and build-windows.xml in the root
directory for details). If you want to use the debug version of
the TSK dll, then edit the copy line in the build-windows.xml file
to copy from the Debug folder.
- The Sleuth Kit Java datamodel JAR file has native libraries
that are copied into it.
- NetBeans uses ant to build Autopsy. The build target copies the
TSK datamodel JAR file into the project. If you want to use the
debug version of the TSK dll, then there is a different ant target
in TSK to copy the debug versions of the dlls.
- On a Windows system, the ant target copies all needed libraries
to the autopsy folder. On a Unix system, the ant target copies only

View File

@ -510,12 +510,13 @@
<!-- use image_id to easily search a specific image only -->
<field name="image_id" type="string" indexed="true" stored="false" required="true" />
<!-- Autopsy pushes text to the content field and gets the text to display from it. It is copied to other places -->
<!-- Autopsy pushes text to this field and gets the text to display from it. It is copied to other places -->
<field name="content" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
<!-- The strings field holds strings extracted from files that SolrCell doesn't support -->
<!--<field name="strings" type="text_general" indexed="true" stored="true"/>-->
<!-- NOTE: file_name gets copied later to other fields for searching -->
<field name="file_name" type="text_general" indexed="false" stored="true"/>
<field name="ctime" type="tdate" indexed="false" stored="false"/>
<field name="atime" type="tdate" indexed="false" stored="false"/>
@ -555,7 +556,7 @@
<!-- field with white-space tokenized words for TermsComponent regex search (useful for fast search of IP addresses, URLs, and certain phone numbers);
it can also be useful for Lucene-based queries containing special characters -->
<!-- populated via copyField -->
<field name="content_ws" type="text_ws" indexed="true" stored="false" />
<field name="content_ws" type="text_ws" indexed="true" stored="false" multiValued="true" />
<!-- Uncommenting the following will create a "timestamp" field using
a default value of "NOW" to indicate when each document was indexed.
@ -628,7 +629,10 @@
<copyField source="file_name" dest="text"/>
<copyField source="meta" dest="text"/>
<!--<copyField source="strings" dest="text"/>-->
<copyField source="content" dest="content_ws"/>
<copyField source="file_name" dest="content_ws"/>
<copyField source="meta" dest="content_ws"/>
<!-- Above, multiple source fields are copied to the [text] field.
Another way to map multiple source fields to the same
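For context on how the content_ws copy targets above are consumed, here is a minimal SolrJ sketch of a TermsComponent regex lookup against content_ws, in the spirit of the TermComponentQuery change later in this commit. The server URL, core path, and regex are illustrative assumptions, not values taken from Autopsy.

    // Hypothetical SolrJ sketch; assumes a Solr core reachable at the URL below.
    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.SolrServerException;
    import org.apache.solr.client.solrj.impl.HttpSolrServer;
    import org.apache.solr.client.solrj.response.QueryResponse;
    import org.apache.solr.client.solrj.response.TermsResponse;

    public class ContentWsRegexSearch {
        public static void main(String[] args) throws SolrServerException {
            HttpSolrServer server = new HttpSolrServer("http://localhost:8983/solr"); // assumed URL
            SolrQuery q = new SolrQuery();
            q.setRequestHandler("/terms");                 // TermsComponent request handler
            q.setTerms(true);
            q.addTermsField("content_ws");                 // whitespace-tokenized field defined above
            q.setTermsRegex("\\d{1,3}(\\.\\d{1,3}){3}");   // example: IPv4-looking terms
            q.setTermsLimit(50);
            QueryResponse resp = server.query(q);
            for (TermsResponse.Term t : resp.getTermsResponse().getTerms("content_ws")) {
                System.out.println(t.getTerm() + " (" + t.getFrequency() + ")");
            }
        }
    }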

View File

@ -183,6 +183,9 @@ public class Ingester {
return fsc.accept(getContentFieldsV);
}
/**
* Visitor used to create param list to send to SOLR index.
*/
private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
private SleuthkitCase curCase = null;

View File

@ -27,7 +27,6 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -44,9 +43,9 @@ import org.netbeans.api.progress.aggregate.AggregateProgressFactory;
import org.netbeans.api.progress.aggregate.AggregateProgressHandle;
import org.netbeans.api.progress.aggregate.ProgressContributor;
import org.openide.util.Cancellable;
import org.openide.util.Exceptions;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.coreutils.StopWatch;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.PipelineContext;
@ -61,8 +60,6 @@ import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.Image;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;
@ -137,8 +134,10 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
private enum IngestStatus {
TEXT_INGESTED, /// Text was extracted by knowing file type and text_ingested
STRINGS_INGESTED, ///< Strings were extracted from file
SKIPPED, ///< File was skipped for whatever reason
METADATA_INGESTED ///< No content, so we just text_ingested metadata
METADATA_INGESTED, ///< No content, so we just text_ingested metadata
SKIPPED_ERROR_INDEXING, ///< File was skipped because index engine had problems
SKIPPED_ERROR_TEXTEXTRACT, ///< File was skipped because of text extraction issues
SKIPPED_ERROR_IO ///< File was skipped because of IO issues reading it
};
private Map<Long, IngestStatus> ingestStatus;
@ -164,7 +163,7 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
if (initialized == false) //error initializing indexing/Solr
{
logger.log(Level.WARNING, "Skipping processing, module not initialized, file: " + abstractFile.getName());
ingestStatus.put(abstractFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
return ProcessResult.OK;
}
try {
@ -176,24 +175,25 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
logger.log(Level.SEVERE, "Error getting image id of file processed by keyword search: " + abstractFile.getName(), ex);
}
if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
//skip indexing of virtual dirs (no content, no real name) - will index children files
return ProcessResult.OK;
}
//check whether we should index meta-data only: 1) the file is known, or 2) the HashDb module errored on it
IngestModuleAbstractFile.ProcessResult hashDBResult = services.getAbstractFileModuleResult(hashDBModuleName);
//logger.log(Level.INFO, "hashdb result: " + hashDBResult + "file: " + AbstractFile.getName());
if (hashDBResult == IngestModuleAbstractFile.ProcessResult.ERROR) {
//index meta-data only
if (services.getAbstractFileModuleResult(hashDBModuleName) == IngestModuleAbstractFile.ProcessResult.ERROR) {
indexer.indexFile(abstractFile, false);
//notify any depending module that keyword search encountered (or would have encountered) an error for this file
ingestStatus.put(abstractFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(abstractFile.getId(), IngestStatus.SKIPPED_ERROR_IO);
return ProcessResult.ERROR;
} else if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
}
else if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
//index meta-data only
indexer.indexFile(abstractFile, false);
return ProcessResult.OK;
}
if (processedFiles == false) {
processedFiles = true;
}
processedFiles = true;
//check if it's time to commit after previous processing
checkRunCommitSearch();
@ -201,7 +201,6 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
//index the file and content (if the content is supported)
indexer.indexFile(abstractFile, true);
return ProcessResult.OK;
}
@ -501,7 +500,9 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
int text_ingested = 0;
int metadata_ingested = 0;
int strings_ingested = 0;
int skipped = 0;
int error_text = 0;
int error_index = 0;
int error_io = 0;
for (IngestStatus s : ingestStatus.values()) {
switch (s) {
case TEXT_INGESTED:
@ -513,8 +514,14 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
case STRINGS_INGESTED:
++strings_ingested;
break;
case SKIPPED:
++skipped;
case SKIPPED_ERROR_TEXTEXTRACT:
error_text++;
break;
case SKIPPED_ERROR_INDEXING:
error_index++;
break;
case SKIPPED_ERROR_IO:
error_io++;
break;
default:
;
@ -525,11 +532,19 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
msg.append("<table border=0><tr><td>Files with known types</td><td>").append(text_ingested).append("</td></tr>");
msg.append("<tr><td>Files with general strings extracted</td><td>").append(strings_ingested).append("</td></tr>");
msg.append("<tr><td>Metadata only was indexed</td><td>").append(metadata_ingested).append("</td></tr>");
msg.append("<tr><td>Skipped files</td><td>").append(skipped).append("</td></tr>");
msg.append("<tr><td>Error (indexer)</td><td>").append(error_index).append("</td></tr>");
msg.append("<tr><td>Error (text extraction)</td><td>").append(error_text).append("</td></tr>");
msg.append("<tr><td>Error (I/O)</td><td>").append(error_io).append("</td></tr>");
msg.append("</table>");
String indexStats = msg.toString();
logger.log(Level.INFO, "Keyword Indexing Completed: " + indexStats);
services.postMessage(IngestMessage.createMessage(++messageID, MessageType.INFO, this, "Keyword Indexing Results", indexStats));
if (error_index > 0) {
MessageNotifyUtil.Notify.error("Keyword Indexing Errors", "Keyword index service had errors ingesting " + error_index + " files.");
}
else if (error_io + error_text > 0) {
MessageNotifyUtil.Notify.warn("Keyword Indexing Warning", "Keyword index service had errors reading files and extracting text. Could have been from corrupt media or files.");
}
}
/**
@ -707,12 +722,12 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
return true;
} else {
logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").");
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
return false;
}
} catch (IngesterException ex) {
logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
return false;
}
}
@ -746,13 +761,10 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
private void indexFile(AbstractFile aFile, boolean indexContent) {
//logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());
//check its database file type
TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
if (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
//skip indexing of virtual dirs (no content, no real name) - will index children files
return;
} // unallocated and unused blocks can only have strings extracted from them.
else if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
// unallocated and unused blocks can only have strings extracted from them.
if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
extractStringsAndIndex(aFile);
}
@ -762,8 +774,9 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
try {
ingester.ingest(aFile, false); //meta-data only
ingestStatus.put(aFile.getId(), IngestStatus.METADATA_INGESTED);
} catch (IngesterException ex) {
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
}
catch (IngesterException ex) {
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex);
}
return;
@ -775,10 +788,11 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
try {
is = new ReadContentInputStream(aFile);
detectedFormat = tikaFormatDetector.detect(is, aFile.getName());
} catch (Exception e) {
}
catch (Exception e) {
logger.log(Level.WARNING, "Could not detect format using tika for file: " + aFile, e);
} finally {
}
finally {
if (is != null) {
try {
is.close();
@ -788,24 +802,33 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
}
}
}
// @@@ Add file type signature to blackboard here
//logger.log(Level.INFO, "Detected format: " + aFile.getName() + " " + detectedFormat);
// we skip archive formats that are opened by the archive module.
// @@@ We could have a check here to see if the archive module was enabled though...
if (AbstractFileExtract.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
try {
ingester.ingest(aFile, false); //meta-data only
ingestStatus.put(aFile.getId(), IngestStatus.METADATA_INGESTED);
}
catch (IngesterException ex) {
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex);
}
return;
}
boolean extractTextSupported = isTextExtractSupported(aFile, detectedFormat);
boolean wasTextAdded = false;
if (extractTextSupported) {
if (isTextExtractSupported(aFile, detectedFormat)) {
//extract text with one of the extractors, divide into chunks and index with Solr
try {
//logger.log(Level.INFO, "indexing: " + aFile.getName());
if (!extractTextAndIndex(aFile, detectedFormat)) {
logger.log(Level.WARNING, "Failed to extract text and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").");
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
} else {
ingestStatus.put(aFile.getId(), IngestStatus.TEXT_INGESTED);
wasTextAdded = true;
@ -814,11 +837,11 @@ public final class KeywordSearchIngestModule extends IngestModuleAbstractFile {
} catch (IngesterException e) {
logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", "
+ aFile.getName(), e);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
} catch (Exception e) {
logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", "
+ aFile.getName(), e);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
}
}

View File

@ -227,6 +227,7 @@ public class TermComponentQuery implements KeywordSearchQuery {
final SolrQuery q = createQuery();
q.setShowDebugInfo(DEBUG);
q.setTermsLimit(MAX_TERMS_RESULTS);
logger.log(Level.INFO, "Query: " + q.toString());
terms = executeQuery(q);
int resultSize = 0;

View File

@ -1,3 +1,12 @@
---------------- VERSION 3.0.9 --------------
Bug Fixes:
- Regular expression keyword search works on file names.
Improvements:
- Enhanced reporting on keyword search module errors
---------------- VERSION 3.0.8 --------------
Bug Fixes:
- Fixed installer bug on Windows. No other code changes.