Implemented the second-pass algorithm to improve MIME type detection accuracy

This commit is contained in:
U-BASIS\dsmyda 2019-05-07 12:54:31 -04:00
parent 616ac03da4
commit aaa22f80d8

View File

@ -58,7 +58,7 @@ public class FileTypeDetector {
* @return A list of all detectable file types. * @return A list of all detectable file types.
* *
* @throws FileTypeDetectorInitException If an error occurs while assembling * @throws FileTypeDetectorInitException If an error occurs while assembling
* the list of types * the list of types
*/ */
public static synchronized SortedSet<String> getDetectedTypes() throws FileTypeDetectorInitException { public static synchronized SortedSet<String> getDetectedTypes() throws FileTypeDetectorInitException {
TreeSet<String> detectedTypes = new TreeSet<>((String string1, String string2) -> { TreeSet<String> detectedTypes = new TreeSet<>((String string1, String string2) -> {
@ -109,7 +109,9 @@ public class FileTypeDetector {
* Tika, and Autopsy file type definitions take precedence over Tika. * Tika, and Autopsy file type definitions take precedence over Tika.
* *
* @throws FileTypeDetectorInitException If an initialization error occurs, * @throws FileTypeDetectorInitException If an initialization error occurs,
* e.g., user-defined file type definitions exist but cannot be loaded. * e.g., user-defined file type
* definitions exist but cannot be
* loaded.
*/ */
public FileTypeDetector() throws FileTypeDetectorInitException { public FileTypeDetector() throws FileTypeDetectorInitException {
try { try {
@ -139,7 +141,7 @@ public class FileTypeDetector {
* user-defined MIME type by this detector. * user-defined MIME type by this detector.
* *
* @param customTypes * @param customTypes
* @param mimeType The MIME type name (e.g., "text/html"). * @param mimeType The MIME type name (e.g., "text/html").
* *
* @return True or false. * @return True or false.
*/ */
@ -170,7 +172,7 @@ public class FileTypeDetector {
* @param file The file to test. * @param file The file to test.
* *
* @return A MIME type name. If file type could not be detected, or results * @return A MIME type name. If file type could not be detected, or results
* were uncertain, octet-stream is returned. * were uncertain, octet-stream is returned.
* *
* *
*/ */
@ -223,7 +225,7 @@ public class FileTypeDetector {
ReadContentInputStream stream = new ReadContentInputStream(file); ReadContentInputStream stream = new ReadContentInputStream(file);
try (TikaInputStream tikaInputStream = TikaInputStream.get(stream)) { try (TikaInputStream tikaInputStream = TikaInputStream.get(stream)) {
String tikaType = tika.detect(tikaInputStream, file.getName()); String tikaType = tika.detect(tikaInputStream);
/* /*
* Remove the Tika suffix from the MIME type name. * Remove the Tika suffix from the MIME type name.
@ -233,6 +235,23 @@ public class FileTypeDetector {
* Remove the optional parameter from the MIME type. * Remove the optional parameter from the MIME type.
*/ */
mimeType = removeOptionalParameter(mimeType); mimeType = removeOptionalParameter(mimeType);
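/*
* If Tika recognizes the file signature (i.e., the content-only
* detection did not fall back to octet-stream), then run a second
* detection pass that also uses the file name to refine the type.
* In short, this is to exclude the MIME types that would be
* determined solely by file extension. More details in JIRA-4871.
*/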
if (!mimeType.equals(MimeTypes.OCTET_STREAM)) {
ReadContentInputStream secondPassStream = new ReadContentInputStream(file);
try (TikaInputStream secondPassTikaStream = TikaInputStream.get(secondPassStream)) {
tikaType = tika.detect(secondPassTikaStream, file.getName());
mimeType = tikaType.replace("tika-", ""); //NON-NLS
mimeType = removeOptionalParameter(mimeType);
} catch (Exception ex) {
throw ex;
}
}
/** /**
* We cannot trust Tika's audio/mpeg MIME type. Let's verify the * We cannot trust Tika's audio/mpeg MIME type. Let's verify the
@ -275,6 +294,7 @@ public class FileTypeDetector {
* first 4 bits. * first 4 bits.
* *
* @param x byte * @param x byte
*
* @return Flag indicating the byte is 0xFF * @return Flag indicating the byte is 0xFF
*/ */
private boolean byteIs0xFF(byte x) { private boolean byteIs0xFF(byte x) {
@ -284,9 +304,10 @@ public class FileTypeDetector {
/** /**
* Retrieves the first N bytes from a file. * Retrieves the first N bytes from a file.
* *
* @param file Abstract file to read * @param file Abstract file to read
* @param offset Offset to begin reading * @param offset Offset to begin reading
* @param n Number of bytes to read * @param n Number of bytes to read
*
* @return Byte array of size n * @return Byte array of size n
* *
* @throws TskCoreException * @throws TskCoreException
@ -371,7 +392,7 @@ public class FileTypeDetector {
* Constructs an exception to throw if an initialization error occurs, * Constructs an exception to throw if an initialization error occurs,
* e.g., user-defined file type definitions exist but cannot be loaded. * e.g., user-defined file type definitions exist but cannot be loaded.
* *
* @param message The exception message, * @param message The exception message,
* @param throwable The underlying cause of the exception. * @param throwable The underlying cause of the exception.
*/ */
FileTypeDetectorInitException(String message, Throwable throwable) { FileTypeDetectorInitException(String message, Throwable throwable) {
@ -409,7 +430,7 @@ public class FileTypeDetector {
* @return A MIME type name. * @return A MIME type name.
* *
* @throws TskCoreException if detection is required and there is a problem * @throws TskCoreException if detection is required and there is a problem
* writing the result to the case database. * writing the result to the case database.
* @deprecated Use getMIMEType instead, and call AbstractFile.setMIMEType * @deprecated Use getMIMEType instead, and call AbstractFile.setMIMEType
* and AbstractFile.save to save the result to the file object and the * and AbstractFile.save to save the result to the file object and the
* database. * database.
@ -429,10 +450,10 @@ public class FileTypeDetector {
* @param file The file. * @param file The file.
* *
* @return A MIME type name. If file type could not be detected or results * @return A MIME type name. If file type could not be detected or results
* were uncertain, octet-stream is returned. * were uncertain, octet-stream is returned.
* *
* @throws TskCoreException if detection is required and there is a problem * @throws TskCoreException if detection is required and there is a problem
* writing the result to the case database. * writing the result to the case database.
* *
* @deprecated Use getMIMEType instead, and call AbstractFile.setMIMEType * @deprecated Use getMIMEType instead, and call AbstractFile.setMIMEType
* and AbstractFile.save to save the result to the file object and the * and AbstractFile.save to save the result to the file object and the
@ -453,7 +474,7 @@ public class FileTypeDetector {
* @param file The file to test. * @param file The file to test.
* *
* @return A MIME type name. If file type could not be detected or results * @return A MIME type name. If file type could not be detected or results
* were uncertain, octet-stream is returned. * were uncertain, octet-stream is returned.
* *
* @throws TskCoreException * @throws TskCoreException
* @deprecated Use getMIMEType instead. * @deprecated Use getMIMEType instead.