2012-06-08 14:22:06 -07:00

169 lines
4.6 KiB
Java

package org.sleuthkit.autopsy.mboxparser;
import java.io.*;
import java.util.ArrayList;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.mbox.MboxParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Convenience wrapper around Apache Tika's {@link MboxParser}.
 *
 * Typical use: construct the parser with (or without) an input stream, call
 * one of the {@code parse} methods, then read the extracted body text via
 * {@link #getContent()} and individual header values via the metadata
 * accessors. The metadata and content accessors read state that is only
 * created by a {@code parse} call, so they must be called after parsing.
 *
 * Each {@code parse} call replaces the metadata and content of any previous
 * call on the same instance.
 */
public class MboxEmailParser
{
    /** Content type value that {@link #isValidMboxType()} looks for in the parsed metadata. */
    private static final String MBOX_MIME_TYPE = "application/mbox";

    /** MIME types that {@link #isValidMimeTypeMbox(byte[])} accepts as a possible mbox file. */
    private static final ArrayList<String> tikaMimeTypes = new ArrayList<String>();

    static
    {
        tikaMimeTypes.add(MimeTypes.OCTET_STREAM);
        tikaMimeTypes.add(MimeTypes.PLAIN_TEXT);
        tikaMimeTypes.add(MimeTypes.XML);
    }

    /** Stream supplied at construction time; may be null. Used by {@link #parse()}. */
    private InputStream stream;
    //Tika object
    private final Tika tika;
    private Metadata metadata;
    private ContentHandler contentHandler;
    private String mimeType;
    private Parser parser;
    private ParseContext context;

    /**
     * Creates a parser with no stream attached. Use {@link #parse(InputStream)}
     * to parse, or the detection helpers which do not need a stream.
     */
    public MboxEmailParser()
    {
        this.tika = new Tika();
    }

    /**
     * Creates a parser that will read from the given stream when
     * {@link #parse()} is called.
     *
     * @param inStream the mbox data to parse
     */
    public MboxEmailParser(InputStream inStream)
    {
        this.tika = new Tika();
        this.stream = inStream;
    }

    /**
     * Creates a parser whose stream is obtained with
     * {@link Class#getResourceAsStream(String)}.
     *
     * NOTE(review): the argument is resolved as a class-path resource, not as
     * a file-system path; if the resource is not found the stream is null and
     * {@link #parse()} will fail. Confirm this is what callers expect.
     *
     * @param filepath resource name handed to {@code getResourceAsStream}
     */
    public MboxEmailParser(String filepath)
    {
        this.tika = new Tika();
        this.stream = this.getClass().getResourceAsStream(filepath);
    }

    /**
     * Resets the per-parse state and records the detected MIME type of
     * {@code source} in the fresh metadata object.
     *
     * @param source the stream that is about to be parsed; must support
     *               mark/reset so detection does not consume its content
     * @throws IOException if the stream cannot be read during detection
     */
    private void init(InputStream source) throws IOException
    {
        this.metadata = new Metadata();
        //Set MIME Type
        this.mimeType = tika.detect(source);
        this.parser = new MboxParser();
        this.context = new ParseContext();
        this.contentHandler = new BodyContentHandler();
        //Seems like setting this causes the metadata not to output all of it.
        this.metadata.set(Metadata.CONTENT_TYPE, this.mimeType);
    }

    /**
     * Shared implementation behind both public {@code parse} methods.
     *
     * @param source the stream to detect and parse
     */
    private void parseStream(InputStream source) throws IOException, SAXException, TikaException
    {
        if (source == null)
        {
            // Previously this surfaced as an unexplained NPE from inside Tika.
            throw new NullPointerException("No input stream available to parse");
        }
        // Detection reads the first bytes of the stream. Without mark/reset
        // support those bytes would be lost to the parser, so detection and
        // parsing must share one mark-capable stream.
        InputStream markable = source.markSupported() ? source : new BufferedInputStream(source);
        init(markable);
        parser.parse(markable, this.contentHandler, this.metadata, context);
    }

    /**
     * Parses the stream supplied at construction time.
     *
     * @throws NullPointerException if this instance has no stream
     * @throws IOException   if the stream cannot be read
     * @throws SAXException  if the content handler reports an error
     * @throws TikaException if Tika fails to parse the content
     */
    public void parse() throws FileNotFoundException, IOException, SAXException, TikaException
    {
        parseStream(this.stream);
    }

    /**
     * Parses the given stream. The MIME type recorded in the metadata is
     * detected from this same stream.
     *
     * @param inStream the mbox data to parse
     * @throws NullPointerException if {@code inStream} is null
     * @throws IOException   if the stream cannot be read
     * @throws SAXException  if the content handler reports an error
     * @throws TikaException if Tika fails to parse the content
     */
    public void parse(InputStream inStream) throws FileNotFoundException, IOException, SAXException, TikaException
    {
        parseStream(inStream);
    }

    /**
     * @return the metadata object populated by the last parse, or null if
     *         nothing has been parsed yet
     */
    public Metadata getMetadata()
    {
        return this.metadata;
    }

    /**
     * Returns message content, i.e. plain text or html.
     * Must be called after a {@code parse} call.
     *
     * @return the text collected by the body content handler
     */
    public String getContent()
    {
        return this.contentHandler.toString();
    }

    /**
     * Asks Tika to detect a media type for the given string via
     * {@code Tika.detect(String)}.
     *
     * @param filepath value passed to Tika for detection
     * @return the detected media type
     * @throws IOException declared for caller compatibility
     */
    public String detectEmailFileFormat(String filepath) throws IOException
    {
        return this.tika.detect(filepath);
    }

    /**
     * Detects the mime type from the first few bytes of the document.
     *
     * @param firstFewBytes leading bytes of the document
     * @param inDocName     document name passed to Tika alongside the bytes
     * @return the detected media type
     */
    public String detectMediaTypeFromBytes(byte[] firstFewBytes, String inDocName)
    {
        return this.tika.detect(firstFewBytes, inDocName);
    }

    /**
     * Checks whether the media type detected from {@code buffer} is one of
     * the types this class accepts as a candidate mbox file (octet-stream,
     * plain text or XML).
     *
     * @param buffer leading bytes of the document
     * @return true if the detected type is in the accepted list
     */
    public boolean isValidMimeTypeMbox(byte[] buffer)
    {
        return tikaMimeTypes.contains(this.tika.detect(buffer));
    }

    /**
     * Reports whether the content type recorded in the metadata is
     * {@code application/mbox}.
     *
     * This assumes the file/stream was parsed since we are looking at the
     * metadata; before any parse, or when no content type is present, the
     * result is false.
     *
     * @return true if the metadata content type is {@code application/mbox}
     */
    public boolean isValidMboxType()
    {
        // Constant-first equals: a missing content type yields false, not an NPE.
        return this.metadata != null
                && MBOX_MIME_TYPE.equals(this.metadata.get(Metadata.CONTENT_TYPE));
    }

    /** @return the email subject from the parsed metadata, or null if absent */
    public String getSubject()
    {
        return this.metadata.get(Metadata.SUBJECT);
    }

    /** @return the title from the parsed metadata, or null if absent */
    public String getTitle()
    {
        return this.metadata.get(Metadata.TITLE);
    }

    /** @return the creation date from the parsed metadata, or null if absent */
    public String getDateCreated()
    {
        return this.metadata.get(Metadata.DATE_CREATED);
    }

    /** @return the content type from the parsed metadata, or null if absent */
    public String getContentType()
    {
        return this.metadata.get(Metadata.CONTENT_TYPE);
    }

    /**
     * Misspelled original name, kept so existing callers keep compiling.
     * Prefer {@link #getContentType()}.
     *
     * @return the content type from the parsed metadata, or null if absent
     */
    public String getContenType()
    {
        return getContentType();
    }

    /** @return the content encoding from the parsed metadata, or null if absent */
    public String getContentEncoding()
    {
        return this.metadata.get(Metadata.CONTENT_ENCODING);
    }

    /**
     * Misspelled original name, kept so existing callers keep compiling.
     * Prefer {@link #getContentEncoding()}.
     *
     * @return the content encoding from the parsed metadata, or null if absent
     */
    public String getContenEncoding()
    {
        return getContentEncoding();
    }

    /** @return the sender from the parsed metadata, or null if absent */
    public String getFrom()
    {
        return this.metadata.get(Metadata.MESSAGE_FROM);
    }

    /** @return the "to" recipient from the parsed metadata, or null if absent */
    public String getTo()
    {
        return this.metadata.get(Metadata.MESSAGE_TO);
    }

    /** @return the "cc" recipient from the parsed metadata, or null if absent */
    public String getCC()
    {
        return this.metadata.get(Metadata.MESSAGE_CC);
    }

    /** @return the "bcc" recipient from the parsed metadata, or null if absent */
    public String getBCC()
    {
        return this.metadata.get(Metadata.MESSAGE_BCC);
    }

    /** @return the recipient address from the parsed metadata, or null if absent */
    public String getRecipientAddress()
    {
        return this.metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS);
    }
}