Updated parser with better error handling.

This commit is contained in:
Jeff Wallace 2013-11-14 14:12:57 -05:00
parent b236754ab4
commit 2fbea9cd8d
8 changed files with 204 additions and 67 deletions

View File

@ -1,3 +1,8 @@
file.reference.apache-mime4j-core-0.8.0-SNAPSHOT-sources.jar=release/modules/ext/apache-mime4j-core-0.8.0-SNAPSHOT-sources.jar
file.reference.apache-mime4j-core-0.8.0-SNAPSHOT.jar=release/modules/ext/apache-mime4j-core-0.8.0-SNAPSHOT.jar
file.reference.apache-mime4j-dom-0.8.0-SNAPSHOT-sources.jar=release/modules/ext/apache-mime4j-dom-0.8.0-SNAPSHOT-sources.jar
file.reference.apache-mime4j-dom-0.8.0-SNAPSHOT.jar=release/modules/ext/apache-mime4j-dom-0.8.0-SNAPSHOT.jar
file.reference.apache-mime4j-project-0.8.0-SNAPSHOT.jar=release/modules/ext/apache-mime4j-project-0.8.0-SNAPSHOT.jar
file.reference.java-libpst-1.0-SNAPSHOT.jar=release/modules/ext/java-libpst-1.0-SNAPSHOT.jar file.reference.java-libpst-1.0-SNAPSHOT.jar=release/modules/ext/java-libpst-1.0-SNAPSHOT.jar
javac.source=1.7 javac.source=1.7
javac.compilerargs=-Xlint -Xlint:-serial javac.compilerargs=-Xlint -Xlint:-serial

View File

@ -58,10 +58,6 @@
<runtime-relative-path>ext/apache-mime4j-mbox-iterator-0.8.0-SNAPSHOT-sources.jar</runtime-relative-path> <runtime-relative-path>ext/apache-mime4j-mbox-iterator-0.8.0-SNAPSHOT-sources.jar</runtime-relative-path>
<binary-origin>release/modules/ext/apache-mime4j-mbox-iterator-0.8.0-SNAPSHOT-sources.jar</binary-origin> <binary-origin>release/modules/ext/apache-mime4j-mbox-iterator-0.8.0-SNAPSHOT-sources.jar</binary-origin>
</class-path-extension> </class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/apache-mime4j-project-0.8.0-SNAPSHOT-tests.jar</runtime-relative-path>
<binary-origin>release/modules/ext/apache-mime4j-project-0.8.0-SNAPSHOT-tests.jar</binary-origin>
</class-path-extension>
</data> </data>
</configuration> </configuration>
</project> </project>

View File

@ -53,7 +53,9 @@ public class EmailMessage {
} }
void setRecipients(String recipients) { void setRecipients(String recipients) {
this.recipients = recipients; if (recipients != null) {
this.recipients = recipients;
}
} }
String getSender() { String getSender() {
@ -61,7 +63,9 @@ public class EmailMessage {
} }
void setSender(String sender) { void setSender(String sender) {
this.sender = sender; if (sender != null) {
this.sender = sender;
}
} }
String getSubject() { String getSubject() {
@ -69,7 +73,9 @@ public class EmailMessage {
} }
void setSubject(String subject) { void setSubject(String subject) {
this.subject = subject; if (subject != null) {
this.subject = subject;
}
} }
String getTextBody() { String getTextBody() {
@ -77,7 +83,9 @@ public class EmailMessage {
} }
void setTextBody(String textBody) { void setTextBody(String textBody) {
this.textBody = textBody; if (textBody != null) {
this.textBody = textBody;
}
} }
String getHtmlBody() { String getHtmlBody() {
@ -85,7 +93,9 @@ public class EmailMessage {
} }
void setHtmlBody(String htmlBody) { void setHtmlBody(String htmlBody) {
this.htmlBody = htmlBody; if (htmlBody != null) {
this.htmlBody = htmlBody;
}
} }
String getRtfBody() { String getRtfBody() {
@ -93,7 +103,9 @@ public class EmailMessage {
} }
void setRtfBody(String rtfBody) { void setRtfBody(String rtfBody) {
this.rtfBody = rtfBody; if (rtfBody != null) {
this.rtfBody = rtfBody;
}
} }
long getSentDate() { long getSentDate() {
@ -101,7 +113,9 @@ public class EmailMessage {
} }
void setSentDate(Date sentDate) { void setSentDate(Date sentDate) {
this.sentDate = sentDate.getTime() / 1000; if (sentDate != null) {
this.sentDate = sentDate.getTime() / 1000;
}
} }
void setSentDate(long sentDate) { void setSentDate(long sentDate) {
@ -113,7 +127,9 @@ public class EmailMessage {
} }
void setBcc(String bcc) { void setBcc(String bcc) {
this.bcc = bcc; if (bcc != null) {
this.bcc = bcc;
}
} }
String getCc() { String getCc() {
@ -121,7 +137,9 @@ public class EmailMessage {
} }
void setCc(String cc) { void setCc(String cc) {
this.cc = cc; if (cc != null) {
this.cc = cc;
}
} }
void addAttachment(Attachment a) { void addAttachment(Attachment a) {
@ -146,7 +164,9 @@ public class EmailMessage {
} }
void setLocalPath(String localPath) { void setLocalPath(String localPath) {
this.localPath = localPath; if (localPath != null) {
this.localPath = localPath;
}
} }
} }
@ -170,7 +190,9 @@ class Attachment {
} }
void setName(String name) { void setName(String name) {
this.name = name; if (name != null) {
this.name = name;
}
} }
String getLocalPath() { String getLocalPath() {
@ -178,7 +200,9 @@ class Attachment {
} }
void setLocalPath(String localPath) { void setLocalPath(String localPath) {
this.localPath = localPath; if (localPath != null) {
this.localPath = localPath;
}
} }
long getSize() { long getSize() {
@ -198,7 +222,9 @@ class Attachment {
} }
void setCrTime(Date crTime) { void setCrTime(Date crTime) {
this.crTime = crTime.getTime() / 1000; if (crTime != null) {
this.crTime = crTime.getTime() / 1000;
}
} }
long getcTime() { long getcTime() {
@ -210,7 +236,9 @@ class Attachment {
} }
void setcTime(Date cTime) { void setcTime(Date cTime) {
this.cTime = cTime.getTime() / 1000; if (cTime != null) {
this.cTime = cTime.getTime() / 1000;
}
} }
long getaTime() { long getaTime() {
@ -222,7 +250,9 @@ class Attachment {
} }
void setaTime(Date aTime) { void setaTime(Date aTime) {
this.aTime = aTime.getTime() / 1000; if (aTime != null) {
this.aTime = aTime.getTime() / 1000;
}
} }
long getmTime() { long getmTime() {
@ -234,6 +264,8 @@ class Attachment {
} }
void setmTime(Date mTime) { void setmTime(Date mTime) {
this.mTime = mTime.getTime() / 1000; if (mTime != null) {
this.mTime = mTime.getTime() / 1000;
}
} }
} }

View File

@ -18,22 +18,29 @@
*/ */
package org.sleuthkit.autopsy.thunderbirdparser; package org.sleuthkit.autopsy.thunderbirdparser;
import java.io.BufferedInputStream;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.CharConversionException;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder; import java.nio.charset.CharsetEncoder;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.logging.Logger; import java.util.logging.Logger;
import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.dom.BinaryBody; import org.apache.james.mime4j.dom.BinaryBody;
import org.apache.james.mime4j.dom.Body;
import org.apache.james.mime4j.dom.Entity; import org.apache.james.mime4j.dom.Entity;
import org.apache.james.mime4j.dom.Message; import org.apache.james.mime4j.dom.Message;
import org.apache.james.mime4j.dom.MessageBuilder;
import org.apache.james.mime4j.dom.Multipart; import org.apache.james.mime4j.dom.Multipart;
import org.apache.james.mime4j.dom.TextBody; import org.apache.james.mime4j.dom.TextBody;
import org.apache.james.mime4j.dom.address.AddressList; import org.apache.james.mime4j.dom.address.AddressList;
@ -44,6 +51,10 @@ import org.apache.james.mime4j.dom.field.ContentTypeField;
import org.apache.james.mime4j.mboxiterator.CharBufferWrapper; import org.apache.james.mime4j.mboxiterator.CharBufferWrapper;
import org.apache.james.mime4j.mboxiterator.MboxIterator; import org.apache.james.mime4j.mboxiterator.MboxIterator;
import org.apache.james.mime4j.message.DefaultMessageBuilder; import org.apache.james.mime4j.message.DefaultMessageBuilder;
import org.apache.james.mime4j.stream.MimeConfig;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.sleuthkit.autopsy.ingest.IngestServices;
/** /**
* A parser that extracts information about email messages and attachments from * A parser that extracts information about email messages and attachments from
@ -53,7 +64,9 @@ import org.apache.james.mime4j.message.DefaultMessageBuilder;
*/ */
public class MboxParser { public class MboxParser {
private static final Logger logger = Logger.getLogger(MboxParser.class.getName()); private static final Logger logger = Logger.getLogger(MboxParser.class.getName());
private MessageBuilder messageBuilder; private DefaultMessageBuilder messageBuilder;
private IngestServices services;
/** /**
* The mime type string for html text. * The mime type string for html text.
*/ */
@ -64,9 +77,13 @@ public class MboxParser {
*/ */
private String localPath; private String localPath;
MboxParser(String localPath) { MboxParser(IngestServices services, String localPath) {
this.services = services;
this.localPath = localPath; this.localPath = localPath;
messageBuilder = new DefaultMessageBuilder(); messageBuilder = new DefaultMessageBuilder();
MimeConfig config = MimeConfig.custom().setMaxLineLen(-1).build();
// disable line length checks.
messageBuilder.setMimeEntityConfig(config);
} }
static boolean isValidMimeTypeMbox(byte[] buffer) { static boolean isValidMimeTypeMbox(byte[] buffer) {
@ -79,24 +96,50 @@ public class MboxParser {
* @return a list of the email messages in the mbox file. * @return a list of the email messages in the mbox file.
*/ */
List<EmailMessage> parse(File mboxFile) { List<EmailMessage> parse(File mboxFile) {
//JWTODO: detect charset // Detect possible charsets
CharsetEncoder encoder = StandardCharsets.ISO_8859_1.newEncoder(); List<CharsetEncoder> encoders = getPossibleEncoders(mboxFile);
List<EmailMessage> emails = new ArrayList<>();
try { CharsetEncoder theEncoder = null;
for (CharBufferWrapper message : MboxIterator.fromFile(mboxFile).charset(encoder.charset()).build()) { Iterable<CharBufferWrapper> mboxIterator = null;
try { // Loop through the possible encoders and find the first one that works.
Message msg = messageBuilder.parseMessage(message.asInputStream(encoder.charset())); // That will usually be one of the first ones.
emails.add(extractEmail(msg)); for (CharsetEncoder encoder : encoders) {
} catch (MimeException ex) { try {
logger.log(Level.WARNING, "Failed to get message from mbox.", ex); mboxIterator = MboxIterator.fromFile(mboxFile).charset(encoder.charset()).build();
} theEncoder = encoder;
break;
} catch (CharConversionException | UnsupportedCharsetException ex) {
// Not the right encoder
} catch (IllegalArgumentException ex) {
// Not the right encoder
} catch (IOException ex) {
logger.log(Level.WARNING, "couldn't find mbox file.", ex);
//JWTODO: post inbox message
return Collections.EMPTY_LIST;
} }
} catch (FileNotFoundException ex) {
logger.log(Level.WARNING, "couldn't find mbox file.", ex);
} catch (IOException ex) {
logger.log(Level.WARNING, "Error getting messsages from mbox file.");
} }
// If no encoders work, post an error message and return.
if (mboxIterator == null || theEncoder == null) {
//JWTODO: post inbox message
return Collections.EMPTY_LIST;
}
List<EmailMessage> emails = new ArrayList<>();
long failCount = 0;
// Parse each message and extract an EmailMessage structure
for (CharBufferWrapper message : mboxIterator) {
try {
Message msg = messageBuilder.parseMessage(message.asInputStream(theEncoder.charset()));
emails.add(extractEmail(msg));
} catch (IOException ex) {
logger.log(Level.WARNING, "Failed to get message from mbox: " + ex.getMessage());
failCount++;
}
}
//JWTODO: post inbox message w/ fail count
return emails; return emails;
} }
@ -133,6 +176,7 @@ public class MboxParser {
* Recursively calls handleMultipart if one of the body parts is another * Recursively calls handleMultipart if one of the body parts is another
* multipart. Otherwise, calls the correct method to extract information out * multipart. Otherwise, calls the correct method to extract information out
* of each part of the body. * of each part of the body.
*
* @param email * @param email
* @param multi * @param multi
*/ */
@ -147,7 +191,7 @@ public class MboxParser {
e.getMimeType().equals(ContentTypeField.TYPE_TEXT_PLAIN)) { e.getMimeType().equals(ContentTypeField.TYPE_TEXT_PLAIN)) {
handleTextBody(email, (TextBody) e.getBody(), e.getMimeType()); handleTextBody(email, (TextBody) e.getBody(), e.getMimeType());
} else { } else {
logger.log(Level.INFO, "Found unrecognized entity: " + e); // Ignore other types.
} }
} }
} }
@ -179,7 +223,8 @@ public class MboxParser {
email.setHtmlBody(bodyString.toString()); email.setHtmlBody(bodyString.toString());
break; break;
default: default:
logger.log(Level.INFO, "Found unrecognized mime type: " + type); // Not interested in other text types.
break;
} }
} catch (IOException ex) { } catch (IOException ex) {
logger.log(Level.WARNING, "Error getting text body of mbox message", ex); logger.log(Level.WARNING, "Error getting text body of mbox message", ex);
@ -195,21 +240,30 @@ public class MboxParser {
private void handleAttachment(EmailMessage email, Entity e) { private void handleAttachment(EmailMessage email, Entity e) {
String outputDirPath = ThunderbirdMboxFileIngestModule.getModuleOutputPath() + File.separator; String outputDirPath = ThunderbirdMboxFileIngestModule.getModuleOutputPath() + File.separator;
String filename = e.getFilename(); String filename = e.getFilename();
String outPath = outputDirPath + filename; String uniqueFilename = filename + "-" + email.getSentDate();
String outPath = outputDirPath + uniqueFilename;
FileOutputStream fos; FileOutputStream fos;
BinaryBody bb; BinaryBody bb;
try { try {
fos = new FileOutputStream(outPath); fos = new FileOutputStream(outPath);
} catch (FileNotFoundException ex) { } catch (FileNotFoundException ex) {
logger.log(Level.INFO, "", ex); //JWTODO: post ingest message
logger.log(Level.INFO, "Failed to create file output stream for: " + outPath, ex);
return; return;
} }
try { try {
bb = (BinaryBody) e.getBody(); Body b = e.getBody();
bb.writeTo(fos); if (b instanceof BinaryBody) {
bb = (BinaryBody) b;
bb.writeTo(fos);
} else {
// This could potentially be other types. Only seen this once.
}
} catch (IOException ex) { } catch (IOException ex) {
logger.log(Level.INFO, "", ex); logger.log(Level.INFO, "Failed to write mbox email attachment to disk.", ex);
//JWTODO: post ingest message.
return; return;
} finally { } finally {
try { try {
@ -222,15 +276,8 @@ public class MboxParser {
Attachment attach = new Attachment(); Attachment attach = new Attachment();
attach.setName(filename); attach.setName(filename);
attach.setLocalPath(ThunderbirdMboxFileIngestModule.getRelModuleOutputPath() attach.setLocalPath(ThunderbirdMboxFileIngestModule.getRelModuleOutputPath()
+ File.separator + filename); + File.separator + uniqueFilename);
// JWTODO: find appropriate constant or make one. attach.setSize(new File(outPath).length());
// ContentDispositionField disposition = (ContentDispositionField) e.getHeader().getField("Content-Disposition");
// if (disposition != null) {
// attach.setSize(disposition.getSize());
// attach.setCrTime(disposition.getCreationDate());
// attach.setmTime(disposition.getModificationDate());
// attach.setaTime(disposition.getReadDate());
// }
email.addAttachment(attach); email.addAttachment(attach);
} }
@ -260,4 +307,52 @@ public class MboxParser {
private String getAddresses(AddressList addressList) { private String getAddresses(AddressList addressList) {
return (addressList == null) ? "" : getAddresses(addressList.flatten()); return (addressList == null) ? "" : getAddresses(addressList.flatten());
} }
/**
 * Builds the list of charset encoders to try when reading the given mbox
 * file.
 *
 * The list always starts with encoders for the standard built-in charsets
 * (ISO-8859-1, US-ASCII, UTF-16, UTF-16BE, UTF-16LE, UTF-8, in that order).
 * Charsets reported by Tika's CharsetDetector for the file content are
 * appended after them, so a caller that picks the first working encoder
 * tries the standard charsets before any detected one.
 *
 * If the file cannot be opened or read, or detection fails, a warning is
 * logged and only the encoders collected so far are returned.
 *
 * @param mboxFile the mbox file whose content is sampled for detection
 * @return the candidate encoders; contains at least the six standard ones
 */
private List<CharsetEncoder> getPossibleEncoders(File mboxFile) {
    List<CharsetEncoder> possibleEncoders = new ArrayList<>();

    possibleEncoders.add(StandardCharsets.ISO_8859_1.newEncoder());
    possibleEncoders.add(StandardCharsets.US_ASCII.newEncoder());
    possibleEncoders.add(StandardCharsets.UTF_16.newEncoder());
    possibleEncoders.add(StandardCharsets.UTF_16BE.newEncoder());
    possibleEncoders.add(StandardCharsets.UTF_16LE.newEncoder());
    possibleEncoders.add(StandardCharsets.UTF_8.newEncoder());

    // try-with-resources closes the stream on every exit path. The stream is
    // buffered before being handed to the detector, as in the original code.
    try (InputStream is = new BufferedInputStream(new FileInputStream(mboxFile))) {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(is);

        for (CharsetMatch match : detector.detectAll()) {
            try {
                Charset detected = Charset.forName(match.getName());
                // Some charsets are decode-only; newEncoder() would throw
                // UnsupportedOperationException for them, so skip those.
                if (detected.canEncode()) {
                    possibleEncoders.add(detected.newEncoder());
                }
            } catch (UnsupportedCharsetException | IllegalCharsetNameException ex) {
                // Don't add unsupported charsets to the list
            }
        }
    } catch (FileNotFoundException ex) {
        logger.log(Level.WARNING, "Failed to find mbox file while detecting charset", ex);
    } catch (IOException | IllegalArgumentException ex) {
        logger.log(Level.WARNING, "Failed to detect charset of mbox file.", ex);
    }

    return possibleEncoders;
}
} }

View File

@ -45,14 +45,17 @@ import org.sleuthkit.datamodel.TskCoreException;
*/ */
public class PstParser { public class PstParser {
private static final Logger logger = Logger.getLogger(PstParser.class.getName()); private static final Logger logger = Logger.getLogger(PstParser.class.getName());
/**
* First four bytes of a pst file.
*/
private static int PST_HEADER = 0x2142444E; private static int PST_HEADER = 0x2142444E;
private IngestServices services;
/** /**
* A map of PSTMessages to their Local path within the file's internal * A map of PSTMessages to their Local path within the file's internal
* directory structure. * directory structure.
*/ */
private List<EmailMessage> results; private List<EmailMessage> results;
private IngestServices services;
PstParser(IngestServices services) { PstParser(IngestServices services) {
results = new ArrayList<>(); results = new ArrayList<>();
this.services = services; this.services = services;
@ -70,9 +73,11 @@ public class PstParser {
*/ */
ParseResult parse(File file) { ParseResult parse(File file) {
PSTFile pstFile; PSTFile pstFile;
long failures = 0L;
try { try {
pstFile = new PSTFile(file); pstFile = new PSTFile(file);
processFolder(pstFile.getRootFolder(), "\\", true); failures = processFolder(pstFile.getRootFolder(), "\\", true);
//JWTODO: post ingest message if failures.
return ParseResult.OK; return ParseResult.OK;
} catch (PSTException | IOException ex) { } catch (PSTException | IOException ex) {
String msg = file.getName() + ": Failed to create internal java-libpst PST file to parse:\n" + ex.getMessage(); String msg = file.getName() + ": Failed to create internal java-libpst PST file to parse:\n" + ex.getMessage();
@ -102,9 +107,9 @@ public class PstParser {
* @throws PSTException * @throws PSTException
* @throws IOException * @throws IOException
*/ */
private void processFolder(PSTFolder folder, String path, boolean root) { private long processFolder(PSTFolder folder, String path, boolean root) {
String newPath = (root ? path : path + "\\" + folder.getDisplayName()); String newPath = (root ? path : path + "\\" + folder.getDisplayName());
long failCount = 0L; // Number of emails that failed
if (folder.hasSubfolders()) { if (folder.hasSubfolders()) {
List<PSTFolder> subFolders; List<PSTFolder> subFolders;
try { try {
@ -115,7 +120,7 @@ public class PstParser {
} }
for (PSTFolder f : subFolders) { for (PSTFolder f : subFolders) {
processFolder(f, newPath, false); failCount += processFolder(f, newPath, false);
} }
} }
@ -127,9 +132,12 @@ public class PstParser {
results.add(extractEmailMessage(email, newPath)); results.add(extractEmailMessage(email, newPath));
} }
} catch (PSTException | IOException ex) { } catch (PSTException | IOException ex) {
failCount++;
logger.log(Level.INFO, "java-libpst exception while getting emails from a folder: " + ex.getMessage()); logger.log(Level.INFO, "java-libpst exception while getting emails from a folder: " + ex.getMessage());
} }
} }
return failCount;
} }
/** /**
@ -186,15 +194,15 @@ public class PstParser {
if (filename.isEmpty()) { if (filename.isEmpty()) {
filename = attach.getFilename(); filename = attach.getFilename();
} }
filename = msg.getDescriptorNodeId() + "-" + filename; String uniqueFilename = msg.getDescriptorNodeId() + "-" + filename;
String outPath = outputDirPath + filename; String outPath = outputDirPath + uniqueFilename;
saveAttachmentToDisk(attach, outPath); saveAttachmentToDisk(attach, outPath);
Attachment attachment = new Attachment(); Attachment attachment = new Attachment();
long crTime = attach.getCreationTime().getTime() / 1000; long crTime = attach.getCreationTime().getTime() / 1000;
long mTime = attach.getModificationTime().getTime() / 1000; long mTime = attach.getModificationTime().getTime() / 1000;
String relPath = getRelModuleOutputPath() + File.separator + filename; String relPath = getRelModuleOutputPath() + File.separator + uniqueFilename;
attachment.setName(filename); attachment.setName(filename);
attachment.setCrTime(crTime); attachment.setCrTime(crTime);
attachment.setmTime(mTime); attachment.setmTime(mTime);
@ -202,7 +210,8 @@ public class PstParser {
attachment.setSize(attach.getFilesize()); attachment.setSize(attach.getFilesize());
email.addAttachment(attachment); email.addAttachment(attachment);
} catch (PSTException | IOException ex) { } catch (PSTException | IOException ex) {
logger.log(Level.WARNING, "Failed to extract attachment.", ex); //JWTODO post ingest message
logger.log(Level.WARNING, "Failed to extract attachment from pst file.", ex);
} }
} }
} }
@ -268,7 +277,7 @@ public class PstParser {
ByteBuffer bb = ByteBuffer.wrap(buffer); ByteBuffer bb = ByteBuffer.wrap(buffer);
return bb.getInt() == PST_HEADER; return bb.getInt() == PST_HEADER;
} catch (TskCoreException ex) { } catch (TskCoreException ex) {
System.out.println("Exception"); logger.log(Level.WARNING, "Exception while detecting if a file is a pst file.");
return false; return false;
} }
} }

View File

@ -215,7 +215,7 @@ public class ThunderbirdMboxFileIngestModule extends IngestModuleAbstractFile {
return ProcessResult.OK; return ProcessResult.OK;
} }
MboxParser parser = new MboxParser(emailFolder); MboxParser parser = new MboxParser(services, emailFolder);
List<EmailMessage> emails = parser.parse(file); List<EmailMessage> emails = parser.parse(file);
processEmails(emails, abstractFile, ingestContext); processEmails(emails, abstractFile, ingestContext);