Updated parser with better error handling.

This commit is contained in:
Jeff Wallace 2013-11-14 14:12:57 -05:00
parent b236754ab4
commit 2fbea9cd8d
8 changed files with 204 additions and 67 deletions

View File

@ -1,3 +1,8 @@
file.reference.apache-mime4j-core-0.8.0-SNAPSHOT-sources.jar=release/modules/ext/apache-mime4j-core-0.8.0-SNAPSHOT-sources.jar
file.reference.apache-mime4j-core-0.8.0-SNAPSHOT.jar=release/modules/ext/apache-mime4j-core-0.8.0-SNAPSHOT.jar
file.reference.apache-mime4j-dom-0.8.0-SNAPSHOT-sources.jar=release/modules/ext/apache-mime4j-dom-0.8.0-SNAPSHOT-sources.jar
file.reference.apache-mime4j-dom-0.8.0-SNAPSHOT.jar=release/modules/ext/apache-mime4j-dom-0.8.0-SNAPSHOT.jar
file.reference.apache-mime4j-project-0.8.0-SNAPSHOT.jar=release/modules/ext/apache-mime4j-project-0.8.0-SNAPSHOT.jar
file.reference.java-libpst-1.0-SNAPSHOT.jar=release/modules/ext/java-libpst-1.0-SNAPSHOT.jar
javac.source=1.7
javac.compilerargs=-Xlint -Xlint:-serial

View File

@ -58,10 +58,6 @@
<runtime-relative-path>ext/apache-mime4j-mbox-iterator-0.8.0-SNAPSHOT-sources.jar</runtime-relative-path>
<binary-origin>release/modules/ext/apache-mime4j-mbox-iterator-0.8.0-SNAPSHOT-sources.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/apache-mime4j-project-0.8.0-SNAPSHOT-tests.jar</runtime-relative-path>
<binary-origin>release/modules/ext/apache-mime4j-project-0.8.0-SNAPSHOT-tests.jar</binary-origin>
</class-path-extension>
</data>
</configuration>
</project>

View File

@ -53,56 +53,70 @@ public class EmailMessage {
}
/**
 * Stores the recipients string. A null argument is ignored so the
 * previously stored value is kept.
 *
 * @param recipients the new recipients value; ignored when null
 */
void setRecipients(String recipients) {
    if (recipients == null) {
        return;
    }
    this.recipients = recipients;
}
/**
 * @return the stored sender string
 */
String getSender() {
    return this.sender;
}
/**
 * Stores the sender string. A null argument is ignored so the previously
 * stored value is kept.
 *
 * @param sender the new sender value; ignored when null
 */
void setSender(String sender) {
    if (sender == null) {
        return;
    }
    this.sender = sender;
}
/**
 * @return the stored subject string
 */
String getSubject() {
    return this.subject;
}
/**
 * Stores the subject string. A null argument is ignored so the previously
 * stored value is kept.
 *
 * @param subject the new subject value; ignored when null
 */
void setSubject(String subject) {
    if (subject == null) {
        return;
    }
    this.subject = subject;
}
/**
 * @return the stored plain-text body
 */
String getTextBody() {
    return this.textBody;
}
/**
 * Stores the text body. A null argument is ignored so the previously
 * stored value is kept.
 *
 * @param textBody the new text body; ignored when null
 */
void setTextBody(String textBody) {
    if (textBody == null) {
        return;
    }
    this.textBody = textBody;
}
/**
 * @return the stored HTML body
 */
String getHtmlBody() {
    return this.htmlBody;
}
/**
 * Stores the HTML body. A null argument is ignored so the previously
 * stored value is kept.
 *
 * @param htmlBody the new HTML body; ignored when null
 */
void setHtmlBody(String htmlBody) {
    if (htmlBody == null) {
        return;
    }
    this.htmlBody = htmlBody;
}
/**
 * @return the stored RTF body
 */
String getRtfBody() {
    return this.rtfBody;
}
/**
 * Stores the RTF body. A null argument is ignored so the previously
 * stored value is kept.
 *
 * @param rtfBody the new RTF body; ignored when null
 */
void setRtfBody(String rtfBody) {
    if (rtfBody == null) {
        return;
    }
    this.rtfBody = rtfBody;
}
/**
 * @return the sent date exactly as stored by the setSentDate overloads
 */
long getSentDate() {
    return this.sentDate;
}
/**
 * Stores the sent date as whole seconds, obtained by dividing the Date's
 * millisecond timestamp by 1000. A null argument is ignored so the
 * previously stored value is kept.
 *
 * @param sentDate the sent date; ignored when null
 */
void setSentDate(Date sentDate) {
    if (sentDate == null) {
        return;
    }
    long millis = sentDate.getTime();
    this.sentDate = millis / 1000;
}
void setSentDate(long sentDate) {
this.sentDate = sentDate;
@ -113,16 +127,20 @@ public class EmailMessage {
}
/**
 * Stores the bcc string. A null argument is ignored so the previously
 * stored value is kept.
 *
 * @param bcc the new bcc value; ignored when null
 */
void setBcc(String bcc) {
    if (bcc == null) {
        return;
    }
    this.bcc = bcc;
}
/**
 * @return the stored cc string
 */
String getCc() {
    return this.cc;
}
/**
 * Stores the cc string. A null argument is ignored so the previously
 * stored value is kept.
 *
 * @param cc the new cc value; ignored when null
 */
void setCc(String cc) {
    if (cc == null) {
        return;
    }
    this.cc = cc;
}
void addAttachment(Attachment a) {
attachments.add(a);
@ -146,9 +164,11 @@ public class EmailMessage {
}
/**
 * Stores the local path. A null argument is ignored so the previously
 * stored value is kept.
 *
 * @param localPath the new local path; ignored when null
 */
void setLocalPath(String localPath) {
    if (localPath == null) {
        return;
    }
    this.localPath = localPath;
}
}
/**
* A Record to hold generic information about attachments.
@ -170,16 +190,20 @@ class Attachment {
}
/**
 * Stores the attachment name. A null argument is ignored so the
 * previously stored value is kept.
 *
 * @param name the new name; ignored when null
 */
void setName(String name) {
    if (name == null) {
        return;
    }
    this.name = name;
}
/**
 * @return the stored local path of this attachment
 */
String getLocalPath() {
    return this.localPath;
}
/**
 * Stores the attachment's local path. A null argument is ignored so the
 * previously stored value is kept.
 *
 * @param localPath the new local path; ignored when null
 */
void setLocalPath(String localPath) {
    if (localPath == null) {
        return;
    }
    this.localPath = localPath;
}
long getSize() {
return size;
@ -198,8 +222,10 @@ class Attachment {
}
/**
 * Stores crTime as whole seconds, obtained by dividing the Date's
 * millisecond timestamp by 1000. A null argument is ignored so the
 * previously stored value is kept.
 *
 * @param crTime the time to store; ignored when null
 */
void setCrTime(Date crTime) {
    if (crTime == null) {
        return;
    }
    long millis = crTime.getTime();
    this.crTime = millis / 1000;
}
long getcTime() {
return cTime;
@ -210,8 +236,10 @@ class Attachment {
}
/**
 * Stores cTime as whole seconds, obtained by dividing the Date's
 * millisecond timestamp by 1000. A null argument is ignored so the
 * previously stored value is kept.
 *
 * @param cTime the time to store; ignored when null
 */
void setcTime(Date cTime) {
    if (cTime == null) {
        return;
    }
    long millis = cTime.getTime();
    this.cTime = millis / 1000;
}
long getaTime() {
return aTime;
@ -222,8 +250,10 @@ class Attachment {
}
/**
 * Stores aTime as whole seconds, obtained by dividing the Date's
 * millisecond timestamp by 1000. A null argument is ignored so the
 * previously stored value is kept.
 *
 * @param aTime the time to store; ignored when null
 */
void setaTime(Date aTime) {
    if (aTime == null) {
        return;
    }
    long millis = aTime.getTime();
    this.aTime = millis / 1000;
}
long getmTime() {
return mTime;
@ -234,6 +264,8 @@ class Attachment {
}
/**
 * Stores mTime as whole seconds, obtained by dividing the Date's
 * millisecond timestamp by 1000. A null argument is ignored so the
 * previously stored value is kept.
 *
 * @param mTime the time to store; ignored when null
 */
void setmTime(Date mTime) {
    if (mTime == null) {
        return;
    }
    long millis = mTime.getTime();
    this.mTime = millis / 1000;
}
}

View File

@ -18,22 +18,29 @@
*/
package org.sleuthkit.autopsy.thunderbirdparser;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.CharConversionException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.dom.BinaryBody;
import org.apache.james.mime4j.dom.Body;
import org.apache.james.mime4j.dom.Entity;
import org.apache.james.mime4j.dom.Message;
import org.apache.james.mime4j.dom.MessageBuilder;
import org.apache.james.mime4j.dom.Multipart;
import org.apache.james.mime4j.dom.TextBody;
import org.apache.james.mime4j.dom.address.AddressList;
@ -44,6 +51,10 @@ import org.apache.james.mime4j.dom.field.ContentTypeField;
import org.apache.james.mime4j.mboxiterator.CharBufferWrapper;
import org.apache.james.mime4j.mboxiterator.MboxIterator;
import org.apache.james.mime4j.message.DefaultMessageBuilder;
import org.apache.james.mime4j.stream.MimeConfig;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.sleuthkit.autopsy.ingest.IngestServices;
/**
* A parser that extracts information about email messages and attachments from
@ -53,7 +64,9 @@ import org.apache.james.mime4j.message.DefaultMessageBuilder;
*/
public class MboxParser {
private static final Logger logger = Logger.getLogger(MboxParser.class.getName());
private MessageBuilder messageBuilder;
private DefaultMessageBuilder messageBuilder;
private IngestServices services;
/**
* The mime type string for html text.
*/
@ -64,9 +77,13 @@ public class MboxParser {
*/
private String localPath;
/**
 * Creates a parser that keeps the given ingest services and local path and
 * builds messages with a DefaultMessageBuilder whose maximum line length is
 * set to -1.
 *
 * @param services the ingest services handle kept by this parser
 * @param localPath the local path kept by this parser
 */
MboxParser(IngestServices services, String localPath) {
    this.services = services;
    this.localPath = localPath;
    messageBuilder = new DefaultMessageBuilder();
    // disable line length checks.
    MimeConfig config = MimeConfig.custom().setMaxLineLen(-1).build();
    messageBuilder.setMimeEntityConfig(config);
}
static boolean isValidMimeTypeMbox(byte[] buffer) {
@ -79,24 +96,50 @@ public class MboxParser {
* @return a list of the email messages in the mbox file.
*/
List<EmailMessage> parse(File mboxFile) {
    // Detect possible charsets
    List<CharsetEncoder> encoders = getPossibleEncoders(mboxFile);

    CharsetEncoder theEncoder = null;
    Iterable<CharBufferWrapper> mboxIterator = null;
    // Loop through the possible encoders and find the first one that works.
    // That will usually be one of the first ones.
    for (CharsetEncoder encoder : encoders) {
        try {
            mboxIterator = MboxIterator.fromFile(mboxFile).charset(encoder.charset()).build();
            theEncoder = encoder;
            break;
        } catch (CharConversionException | UnsupportedCharsetException ex) {
            // Not the right encoder
        } catch (IllegalArgumentException ex) {
            // Not the right encoder
        } catch (IOException ex) {
            logger.log(Level.WARNING, "couldn't find mbox file.", ex);
            //JWTODO: post inbox message
            // Typed empty list instead of the raw Collections.EMPTY_LIST.
            return Collections.emptyList();
        }
    }

    // If no encoders work, post an error message and return.
    if (mboxIterator == null || theEncoder == null) {
        //JWTODO: post inbox message
        return Collections.emptyList();
    }

    List<EmailMessage> emails = new ArrayList<>();
    long failCount = 0;

    // Parse each message and extract an EmailMessage structure.
    // A message that fails to parse is counted and skipped so the rest of
    // the mbox file is still processed.
    for (CharBufferWrapper message : mboxIterator) {
        try {
            Message msg = messageBuilder.parseMessage(message.asInputStream(theEncoder.charset()));
            emails.add(extractEmail(msg));
        } catch (IOException ex) {
            logger.log(Level.WARNING, "Failed to get message from mbox: " + ex.getMessage());
            failCount++;
        }
    }

    //JWTODO: post inbox message w/ fail count
    if (failCount > 0) {
        logger.log(Level.WARNING, "Failed to parse " + failCount + " message(s) from mbox file.");
    }

    return emails;
}
@ -133,6 +176,7 @@ public class MboxParser {
* Recursively calls handleMultipart if one of the body parts is another
* multipart. Otherwise, calls the correct method to extract information out
* of each part of the body.
*
* @param email
* @param multi
*/
@ -147,7 +191,7 @@ public class MboxParser {
e.getMimeType().equals(ContentTypeField.TYPE_TEXT_PLAIN)) {
handleTextBody(email, (TextBody) e.getBody(), e.getMimeType());
} else {
logger.log(Level.INFO, "Found unrecognized entity: " + e);
// Ignore other types.
}
}
}
@ -179,7 +223,8 @@ public class MboxParser {
email.setHtmlBody(bodyString.toString());
break;
default:
logger.log(Level.INFO, "Found unrecognized mime type: " + type);
// Not interested in other text types.
break;
}
} catch (IOException ex) {
logger.log(Level.WARNING, "Error getting text body of mbox message", ex);
@ -195,21 +240,30 @@ public class MboxParser {
private void handleAttachment(EmailMessage email, Entity e) {
String outputDirPath = ThunderbirdMboxFileIngestModule.getModuleOutputPath() + File.separator;
String filename = e.getFilename();
String outPath = outputDirPath + filename;
String uniqueFilename = filename + "-" + email.getSentDate();
String outPath = outputDirPath + uniqueFilename;
FileOutputStream fos;
BinaryBody bb;
try {
fos = new FileOutputStream(outPath);
} catch (FileNotFoundException ex) {
logger.log(Level.INFO, "", ex);
//JWTODO: post ingest message
logger.log(Level.INFO, "Failed to create file output stream for: " + outPath, ex);
return;
}
try {
bb = (BinaryBody) e.getBody();
Body b = e.getBody();
if (b instanceof BinaryBody) {
bb = (BinaryBody) b;
bb.writeTo(fos);
} else {
// This could potentially be other types. Only seen this once.
}
} catch (IOException ex) {
logger.log(Level.INFO, "", ex);
logger.log(Level.INFO, "Failed to write mbox email attachment to disk.", ex);
//JWTODO: post ingest message.
return;
} finally {
try {
@ -222,15 +276,8 @@ public class MboxParser {
Attachment attach = new Attachment();
attach.setName(filename);
attach.setLocalPath(ThunderbirdMboxFileIngestModule.getRelModuleOutputPath()
+ File.separator + filename);
// JWTODO: find appropriate constant or make one.
// ContentDispositionField disposition = (ContentDispositionField) e.getHeader().getField("Content-Disposition");
// if (disposition != null) {
// attach.setSize(disposition.getSize());
// attach.setCrTime(disposition.getCreationDate());
// attach.setmTime(disposition.getModificationDate());
// attach.setaTime(disposition.getReadDate());
// }
+ File.separator + uniqueFilename);
attach.setSize(new File(outPath).length());
email.addAttachment(attach);
}
@ -260,4 +307,52 @@ public class MboxParser {
private String getAddresses(AddressList addressList) {
    // A missing address list is rendered as the empty string.
    if (addressList == null) {
        return "";
    }
    // Otherwise delegate to the overload that takes the flattened list.
    return getAddresses(addressList.flatten());
}
/**
 * Get a list of the possible encoders for the given mboxFile using Tika's
 * CharsetDetector. At a minimum, returns the standard built in charsets.
 *
 * The standard charsets always come first, followed by an encoder for each
 * charset the detector reported, in the order the detector returned them.
 * Detected charsets that this JVM does not support, or that cannot encode,
 * are left out.
 *
 * @param mboxFile the mbox file whose content is handed to the detector
 * @return a non-null list of candidate encoders; if the file cannot be
 *         opened or detection fails, the list holds the standard charsets
 *         plus any detected ones added before the failure
 */
private List<CharsetEncoder> getPossibleEncoders(File mboxFile) {
    List<CharsetEncoder> possibleEncoders = new ArrayList<>();

    possibleEncoders.add(StandardCharsets.ISO_8859_1.newEncoder());
    possibleEncoders.add(StandardCharsets.US_ASCII.newEncoder());
    possibleEncoders.add(StandardCharsets.UTF_16.newEncoder());
    possibleEncoders.add(StandardCharsets.UTF_16BE.newEncoder());
    possibleEncoders.add(StandardCharsets.UTF_16LE.newEncoder());
    possibleEncoders.add(StandardCharsets.UTF_8.newEncoder());

    // try-with-resources closes the stream on every path; a failure while
    // closing is reported through the IOException handler below.
    try (InputStream is = new BufferedInputStream(new FileInputStream(mboxFile))) {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(is);
        for (CharsetMatch match : detector.detectAll()) {
            try {
                Charset detected = Charset.forName(match.getName());
                // Decode-only charsets make newEncoder() throw
                // UnsupportedOperationException, so skip them up front.
                if (detected.canEncode()) {
                    possibleEncoders.add(detected.newEncoder());
                }
            } catch (UnsupportedCharsetException | IllegalCharsetNameException ex) {
                // Don't add unsupported charsets to the list
            }
        }
    } catch (FileNotFoundException ex) {
        logger.log(Level.WARNING, "Failed to find mbox file while detecting charset");
    } catch (IOException | IllegalArgumentException ex) {
        logger.log(Level.WARNING, "Failed to detect charset of mbox file.", ex);
    }

    return possibleEncoders;
}
}

View File

@ -45,14 +45,17 @@ import org.sleuthkit.datamodel.TskCoreException;
*/
public class PstParser {
private static final Logger logger = Logger.getLogger(PstParser.class.getName());
/**
* First four bytes of a pst file.
*/
private static int PST_HEADER = 0x2142444E;
private IngestServices services;
/**
* A map of PSTMessages to their Local path within the file's internal
* directory structure.
*/
private List<EmailMessage> results;
private IngestServices services;
PstParser(IngestServices services) {
results = new ArrayList<>();
this.services = services;
@ -70,9 +73,11 @@ public class PstParser {
*/
ParseResult parse(File file) {
PSTFile pstFile;
long failures = 0L;
try {
pstFile = new PSTFile(file);
processFolder(pstFile.getRootFolder(), "\\", true);
failures = processFolder(pstFile.getRootFolder(), "\\", true);
//JWTODO: post ingest message if failures.
return ParseResult.OK;
} catch (PSTException | IOException ex) {
String msg = file.getName() + ": Failed to create internal java-libpst PST file to parse:\n" + ex.getMessage();
@ -102,9 +107,9 @@ public class PstParser {
* @throws PSTException
* @throws IOException
*/
private void processFolder(PSTFolder folder, String path, boolean root) {
private long processFolder(PSTFolder folder, String path, boolean root) {
String newPath = (root ? path : path + "\\" + folder.getDisplayName());
long failCount = 0L; // Number of emails that failed
if (folder.hasSubfolders()) {
List<PSTFolder> subFolders;
try {
@ -115,7 +120,7 @@ public class PstParser {
}
for (PSTFolder f : subFolders) {
processFolder(f, newPath, false);
failCount += processFolder(f, newPath, false);
}
}
@ -127,9 +132,12 @@ public class PstParser {
results.add(extractEmailMessage(email, newPath));
}
} catch (PSTException | IOException ex) {
failCount++;
logger.log(Level.INFO, "java-libpst exception while getting emails from a folder: " + ex.getMessage());
}
}
return failCount;
}
/**
@ -186,15 +194,15 @@ public class PstParser {
if (filename.isEmpty()) {
filename = attach.getFilename();
}
filename = msg.getDescriptorNodeId() + "-" + filename;
String outPath = outputDirPath + filename;
String uniqueFilename = msg.getDescriptorNodeId() + "-" + filename;
String outPath = outputDirPath + uniqueFilename;
saveAttachmentToDisk(attach, outPath);
Attachment attachment = new Attachment();
long crTime = attach.getCreationTime().getTime() / 1000;
long mTime = attach.getModificationTime().getTime() / 1000;
String relPath = getRelModuleOutputPath() + File.separator + filename;
String relPath = getRelModuleOutputPath() + File.separator + uniqueFilename;
attachment.setName(filename);
attachment.setCrTime(crTime);
attachment.setmTime(mTime);
@ -202,7 +210,8 @@ public class PstParser {
attachment.setSize(attach.getFilesize());
email.addAttachment(attachment);
} catch (PSTException | IOException ex) {
logger.log(Level.WARNING, "Failed to extract attachment.", ex);
//JWTODO post ingest message
logger.log(Level.WARNING, "Failed to extract attachment from pst file.", ex);
}
}
}
@ -268,7 +277,7 @@ public class PstParser {
ByteBuffer bb = ByteBuffer.wrap(buffer);
return bb.getInt() == PST_HEADER;
} catch (TskCoreException ex) {
System.out.println("Exception");
logger.log(Level.WARNING, "Exception while detecting if a file is a pst file.");
return false;
}
}

View File

@ -215,7 +215,7 @@ public class ThunderbirdMboxFileIngestModule extends IngestModuleAbstractFile {
return ProcessResult.OK;
}
MboxParser parser = new MboxParser(emailFolder);
MboxParser parser = new MboxParser(services, emailFolder);
List<EmailMessage> emails = parser.parse(file);
processEmails(emails, abstractFile, ingestContext);