Merge pull request #1266 from sidheshenator/file_type_identification

File type identification
This commit is contained in:
Richard Cordovano 2015-05-27 18:09:35 -04:00
commit e0614436a7
8 changed files with 91 additions and 64 deletions

View File

@ -6,3 +6,4 @@ OpenIDE-Module-Name=ExifParser
OpenIDE-Module-Short-Description=Exif metadata ingest module OpenIDE-Module-Short-Description=Exif metadata ingest module
ExifParserFileIngestModule.moduleName.text=Exif Parser ExifParserFileIngestModule.moduleName.text=Exif Parser
ExifParserFileIngestModule.getDesc.text=Ingests JPEG files and retrieves their EXIF metadata. ExifParserFileIngestModule.getDesc.text=Ingests JPEG files and retrieves their EXIF metadata.
ExifParserFileIngestModule.startUp.fileTypeDetectorInitializationException.msg=Error initializing the File Type Detector.

View File

@ -34,13 +34,14 @@ import java.util.Collection;
import java.util.Date; import java.util.Date;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level; import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.ImageUtils; import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.ingest.FileIngestModule; import org.sleuthkit.autopsy.ingest.FileIngestModule;
import org.sleuthkit.autopsy.ingest.IngestJobContext; import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestServices; import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.ingest.ModuleDataEvent; import org.sleuthkit.autopsy.ingest.ModuleDataEvent;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter; import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute; import org.sleuthkit.datamodel.BlackboardAttribute;
@ -63,6 +64,7 @@ public final class ExifParserFileIngestModule implements FileIngestModule {
private volatile boolean filesToFire = false; private volatile boolean filesToFire = false;
private long jobId; private long jobId;
private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter(); private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
private FileTypeDetector fileTypeDetector;
ExifParserFileIngestModule() { ExifParserFileIngestModule() {
} }
@ -71,6 +73,12 @@ public final class ExifParserFileIngestModule implements FileIngestModule {
public void startUp(IngestJobContext context) throws IngestModuleException { public void startUp(IngestJobContext context) throws IngestModuleException {
jobId = context.getJobId(); jobId = context.getJobId();
refCounter.incrementAndGet(jobId); refCounter.incrementAndGet(jobId);
try {
fileTypeDetector = new FileTypeDetector();
} catch (FileTypeDetector.FileTypeDetectorInitException ex) {
logger.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "ExifParserFileIngestModule.startUp.fileTypeDetectorInitializationException.msg"), ex);
throw new IngestModuleException(NbBundle.getMessage(this.getClass(), "ExifParserFileIngestModule.startUp.fileTypeDetectorInitializationException.msg"));
}
} }
@ -197,7 +205,7 @@ public final class ExifParserFileIngestModule implements FileIngestModule {
* @return true if to be processed * @return true if to be processed
*/ */
private boolean parsableFormat(AbstractFile f) { private boolean parsableFormat(AbstractFile f) {
return ImageUtils.isJpegFileHeader(f); return fileTypeDetector.getFileType(f).equals("image/jpeg");
} }
@Override @Override

View File

@ -18,15 +18,19 @@
*/ */
package org.sleuthkit.autopsy.modules.filetypeid; package org.sleuthkit.autopsy.modules.filetypeid;
import java.util.ArrayList;
import java.util.Map; import java.util.Map;
import java.util.SortedSet; import java.util.SortedSet;
import java.util.logging.Level;
import org.apache.tika.Tika; import org.apache.tika.Tika;
import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeTypes; import org.apache.tika.mime.MimeTypes;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute; import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException; import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskData;
/** /**
* Detects the type of a file by an inspection of its contents. * Detects the type of a file by an inspection of its contents.
@ -37,6 +41,7 @@ public class FileTypeDetector {
private static final int BUFFER_SIZE = 64 * 1024; private static final int BUFFER_SIZE = 64 * 1024;
private final byte buffer[] = new byte[BUFFER_SIZE]; private final byte buffer[] = new byte[BUFFER_SIZE];
private final Map<String, FileType> userDefinedFileTypes; private final Map<String, FileType> userDefinedFileTypes;
private static final Logger logger = Logger.getLogger(FileTypeDetector.class.getName());
/** /**
* Constructs an object that detects the type of a file by an inspection of * Constructs an object that detects the type of a file by an inspection of
@ -93,17 +98,59 @@ public class FileTypeDetector {
return false; return false;
} }
/**
 * Gets the MIME type of a file as a string, preferring a previously
 * recorded result over fresh detection.
 *
 * The file's general info attributes of type TSK_FILE_TYPE_SIG are consulted
 * first; if the first such attribute holds a non-empty value, that value is
 * returned. Otherwise the type is obtained from
 * detectAndPostToBlackboard(), which also posts the result to the
 * blackboard. Errors from either step are logged as warnings and are never
 * propagated to the caller.
 *
 * @param abstractFile the file whose MIME type is to be determined.
 * @return the MIME type of the file, or an empty string if it could not be
 *         determined or an error occurred. This method does not return
 *         null.
 */
public String getFileType(AbstractFile abstractFile) {
    // Step 1: reuse a type that has already been recorded on the blackboard.
    try {
        ArrayList<BlackboardAttribute> typeAttributes = abstractFile.getGenInfoAttributes(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_FILE_TYPE_SIG);
        if (!typeAttributes.isEmpty()) {
            // Only the first recorded attribute is considered.
            String recordedType = typeAttributes.get(0).getValueString();
            if (recordedType != null && !recordedType.isEmpty()) {
                return recordedType;
            }
        }
    } catch (TskCoreException ex) {
        // A failed lookup is not fatal; fall through to detection.
        logger.log(Level.WARNING, "Error performing mimetype blackboard-lookup for " + abstractFile.getName(), ex);
    }

    // Step 2: nothing usable was recorded, so run detection (which posts
    // its result to the blackboard).
    String detectedType;
    try {
        detectedType = detectAndPostToBlackboard(abstractFile);
    } catch (TskCoreException ex) {
        logger.log(Level.WARNING, "Error determining the mimetype for " + abstractFile.getName(), ex); // NON-NLS
        return ""; // NON-NLS
    }

    if (detectedType == null || detectedType.isEmpty()) {
        logger.log(Level.WARNING, "Unable to determine the mimetype for {0}", abstractFile.getName()); // NON-NLS
        return ""; // NON-NLS
    }
    return detectedType;
}
/** /**
* Detect the MIME type of a file, posting it to the blackboard if detection * Detect the MIME type of a file, posting it to the blackboard if detection
* succeeds. * succeeds.
* *
* @param file The file to test. * @param file The file to test.
* @param moduleName The name of the module posting to the blackboard.
* @return The MIME type name if detection was successful, null otherwise. * @return The MIME type name if detection was successful, null otherwise.
* @throws TskCoreException if there is an error posting to the blackboard. * @throws TskCoreException if there is an error posting to the blackboard.
*/ */
public synchronized String detectAndPostToBlackboard(AbstractFile file) throws TskCoreException { public String detectAndPostToBlackboard(AbstractFile file) throws TskCoreException {
String mimeType = detect(file);
String mimeType;
mimeType = detect(file);
if (null != mimeType) { if (null != mimeType) {
/** /**
* Add the file type attribute to the general info artifact. Note * Add the file type attribute to the general info artifact. Note
@ -125,6 +172,13 @@ public class FileTypeDetector {
* @return The MIME type name if detection was successful, null otherwise. * @return The MIME type name if detection was successful, null otherwise.
*/ */
public String detect(AbstractFile file) throws TskCoreException { public String detect(AbstractFile file) throws TskCoreException {
// Consistently mark unallocated and unused space as file type application/octet-stream
if ((file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
|| (file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS)
|| (file.isFile() == false)) {
return MimeTypes.OCTET_STREAM;
}
String fileType = detectUserDefinedType(file); String fileType = detectUserDefinedType(file);
if (null == fileType) { if (null == fileType) {
try { try {

View File

@ -27,7 +27,6 @@ import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestMessage; import org.sleuthkit.autopsy.ingest.IngestMessage;
import org.sleuthkit.autopsy.ingest.IngestServices; import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown; import org.sleuthkit.datamodel.TskData.FileKnown;
import org.sleuthkit.autopsy.ingest.IngestModule.ProcessResult; import org.sleuthkit.autopsy.ingest.IngestModule.ProcessResult;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter; import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
@ -95,15 +94,6 @@ public class FileTypeIdIngestModule implements FileIngestModule {
@Override @Override
public ProcessResult process(AbstractFile file) { public ProcessResult process(AbstractFile file) {
/**
* Skip unallocated space and unused blocks files.
*/
if ((file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
|| (file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS)
|| (file.isFile() == false)) {
return ProcessResult.OK;
}
/** /**
* Skip known files if configured to do so. * Skip known files if configured to do so.
*/ */

View File

@ -29,3 +29,4 @@ SevenZipIngestModule.unpack.encrFileDetected.msg=Encrypted files in archive dete
SevenZipIngestModule.unpack.encrFileDetected.details=Some files in archive\: {0} are encrypted. {1} extractor was unable to extract all files from this archive. SevenZipIngestModule.unpack.encrFileDetected.details=Some files in archive\: {0} are encrypted. {1} extractor was unable to extract all files from this archive.
SevenZipIngestModule.UnpackStream.write.exception.msg=Error writing unpacked file to\: {0} SevenZipIngestModule.UnpackStream.write.exception.msg=Error writing unpacked file to\: {0}
SevenZipIngestModule.UnpackedTree.exception.msg=Error adding a derived file to db\:{0} SevenZipIngestModule.UnpackedTree.exception.msg=Error adding a derived file to db\:{0}
SevenZipIngestModule.startUp.fileTypeDetectorInitializationException.msg=Error initializing the File Type Detector.

View File

@ -24,7 +24,6 @@ import java.io.FileNotFoundException;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStream; import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.Date; import java.util.Date;
@ -62,6 +61,7 @@ import org.sleuthkit.autopsy.ingest.ModuleDataEvent;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter; import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import net.sf.sevenzipjbinding.ArchiveFormat; import net.sf.sevenzipjbinding.ArchiveFormat;
import static net.sf.sevenzipjbinding.ArchiveFormat.RAR; import static net.sf.sevenzipjbinding.ArchiveFormat.RAR;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
/** /**
* 7Zip ingest module extracts supported archives, adds extracted DerivedFiles, * 7Zip ingest module extracts supported archives, adds extracted DerivedFiles,
@ -87,13 +87,10 @@ public final class SevenZipIngestModule implements FileIngestModule {
private static final long MIN_FREE_DISK_SPACE = 1 * 1000 * 1000000L; //1GB private static final long MIN_FREE_DISK_SPACE = 1 * 1000 * 1000000L; //1GB
//counts archive depth //counts archive depth
private ArchiveDepthCountTree archiveDepthCountTree; private ArchiveDepthCountTree archiveDepthCountTree;
//buffer for checking file headers and signatures
private static final int readHeaderSize = 4;
private final byte[] fileHeaderBuffer = new byte[readHeaderSize];
private static final int ZIP_SIGNATURE_BE = 0x504B0304;
private IngestJobContext context; private IngestJobContext context;
private long jobId; private long jobId;
private final static IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter(); private final static IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
private FileTypeDetector fileTypeDetector;
SevenZipIngestModule() { SevenZipIngestModule() {
} }
@ -103,6 +100,13 @@ public final class SevenZipIngestModule implements FileIngestModule {
this.context = context; this.context = context;
jobId = context.getJobId(); jobId = context.getJobId();
try {
fileTypeDetector = new FileTypeDetector();
} catch (FileTypeDetector.FileTypeDetectorInitException ex) {
logger.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "SevenZipIngestModule.startUp.fileTypeDetectorInitializationException.msg"), ex);
throw new IngestModuleException(NbBundle.getMessage(this.getClass(), "SevenZipIngestModule.startUp.fileTypeDetectorInitializationException.msg"));
}
final Case currentCase = Case.getCurrentCase(); final Case currentCase = Case.getCurrentCase();
moduleDirRelative = Case.getModulesOutputDirRelPath() + File.separator + ArchiveFileExtractorModuleFactory.getModuleName(); moduleDirRelative = Case.getModulesOutputDirRelPath() + File.separator + ArchiveFileExtractorModuleFactory.getModuleName();
@ -657,24 +661,7 @@ public final class SevenZipIngestModule implements FileIngestModule {
* @return true if zip file, false otherwise * @return true if zip file, false otherwise
*/ */
private boolean isZipFileHeader(AbstractFile file) { private boolean isZipFileHeader(AbstractFile file) {
if (file.getSize() < readHeaderSize) { return fileTypeDetector.getFileType(file).equals("application/zip"); //NON-NLS
return false;
}
try {
int bytesRead = file.read(fileHeaderBuffer, 0, readHeaderSize);
if (bytesRead != readHeaderSize) {
return false;
}
} catch (TskCoreException ex) {
//ignore if can't read the first few bytes, not a ZIP
return false;
}
ByteBuffer bytes = ByteBuffer.wrap(fileHeaderBuffer);
int signature = bytes.getInt();
return signature == ZIP_SIGNATURE_BE;
} }
/** /**

View File

@ -283,3 +283,4 @@ KeywordSearchModuleFactory.createFileIngestModule.exception.msg=Expected setting
SearchRunner.Searcher.done.err.msg=Error performing keyword search SearchRunner.Searcher.done.err.msg=Error performing keyword search
KeywordSearchGlobalSearchSettingsPanel.timeRadioButton5.toolTipText=Fastest overall, but no results until the end KeywordSearchGlobalSearchSettingsPanel.timeRadioButton5.toolTipText=Fastest overall, but no results until the end
KeywordSearchGlobalSearchSettingsPanel.timeRadioButton5.text=No periodic searches KeywordSearchGlobalSearchSettingsPanel.timeRadioButton5.text=No periodic searches
KeywordSearchIngestModule.startUp.fileTypeDetectorInitializationException.msg=Error initializing the File Type Detector.

View File

@ -37,8 +37,6 @@ import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector; import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskData; import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown; import org.sleuthkit.datamodel.TskData.FileKnown;
@ -74,7 +72,8 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
private final IngestServices services = IngestServices.getInstance(); private final IngestServices services = IngestServices.getInstance();
private Ingester ingester = null; private Ingester ingester = null;
private Indexer indexer; private Indexer indexer;
//only search images from current ingest, not images previously ingested/indexed private FileTypeDetector fileTypeDetector;
//only search images from current ingest, not images previously ingested/indexed
//accessed read-only by searcher thread //accessed read-only by searcher thread
private boolean startedSearching = false; private boolean startedSearching = false;
@ -130,6 +129,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
jobId = context.getJobId(); jobId = context.getJobId();
dataSourceId = context.getDataSource().getId(); dataSourceId = context.getDataSource().getId();
try {
fileTypeDetector = new FileTypeDetector();
} catch (FileTypeDetector.FileTypeDetectorInitException ex) {
logger.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.startUp.fileTypeDetectorInitializationException.msg"), ex);
throw new IngestModuleException(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.startUp.fileTypeDetectorInitializationException.msg"));
}
ingester = Server.getIngester(); ingester = Server.getIngester();
this.context = context; this.context = context;
@ -470,30 +475,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
return; return;
} }
String detectedFormat = fileTypeDetector.getFileType(aFile);
// try to get the file type from the BB
String detectedFormat = null;
try {
ArrayList<BlackboardAttribute> attributes = aFile.getGenInfoAttributes(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_FILE_TYPE_SIG);
for (BlackboardAttribute attribute : attributes) {
detectedFormat = attribute.getValueString();
break;
}
} catch (TskCoreException ex) {
}
// else, use FileType module to detect the format
if (detectedFormat == null) { if (detectedFormat == null) {
try { logger.log(Level.WARNING, "Could not detect format using fileTypeDetector for file: {0}", aFile); //NON-NLS
detectedFormat = new FileTypeDetector().detectAndPostToBlackboard(aFile); return;
} catch (FileTypeDetector.FileTypeDetectorInitException | TskCoreException ex) {
logger.log(Level.WARNING, "Could not detect format using file type detector for file: {0}", aFile); //NON-NLS
return;
}
if (detectedFormat == null) {
logger.log(Level.WARNING, "Could not detect format using file type detector for file: {0}", aFile); //NON-NLS
return;
}
} }
// we skip archive formats that are opened by the archive module. // we skip archive formats that are opened by the archive module.