commenting

This commit is contained in:
Greg DiCristofaro 2020-12-03 12:37:44 -05:00
parent b7baa5e2c8
commit 03a23f636b

View File

@ -39,11 +39,13 @@ import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.lang3.tuple.Pair;
import org.openide.util.Exceptions;
import org.openide.util.NbBundle.Messages; import org.openide.util.NbBundle.Messages;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.NetworkUtils; import org.sleuthkit.autopsy.coreutils.NetworkUtils;
import org.sleuthkit.autopsy.ingest.DataSourceIngestModuleProgress; import org.sleuthkit.autopsy.ingest.DataSourceIngestModuleProgress;
import org.sleuthkit.autopsy.ingest.IngestJobContext; import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestModule;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE; import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
@ -52,6 +54,11 @@ import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException; import org.sleuthkit.datamodel.TskCoreException;
/**
* Analyzes a URL to determine if the url host is one that handles messages
* (i.e. webmail, disposable mail). If found, a domain category type artifact is
* created.
*/
@Messages({ @Messages({
"MessageURLAnalyzer_moduleName_text=MessageURLAnalyzer", "MessageURLAnalyzer_moduleName_text=MessageURLAnalyzer",
"MessageURLAnalyzer_Progress_Message_Find_Message_URLs=Finding Messaging Domains", "MessageURLAnalyzer_Progress_Message_Find_Message_URLs=Finding Messaging Domains",
@ -59,6 +66,9 @@ import org.sleuthkit.datamodel.TskCoreException;
}) })
class MessageURLAnalyzer extends Extract { class MessageURLAnalyzer extends Extract {
/**
* The message service type (i.e. webmail, disposable mail).
*/
@Messages({ @Messages({
"MessageType_disposableMail_displayName=Disposable Email", "MessageType_disposableMail_displayName=Disposable Email",
"MessageType_webmail_displayName=Web Email" "MessageType_webmail_displayName=Web Email"
@ -70,25 +80,51 @@ class MessageURLAnalyzer extends Extract {
private final String csvId; private final String csvId;
private final String attrDisplayName; private final String attrDisplayName;
/**
* Main constructor.
*
* @param csvId The identifier within the csv for this type.
* @param attrDisplayName The display name in the artifact for this
* domain category.
*/
private MessageType(String csvId, String attrDisplayName) { private MessageType(String csvId, String attrDisplayName) {
this.csvId = csvId; this.csvId = csvId;
this.attrDisplayName = attrDisplayName; this.attrDisplayName = attrDisplayName;
} }
public String getCsvId() { /**
* @return The identifier within the csv for this type.
*/
String getCsvId() {
return csvId; return csvId;
} }
public String getAttrDisplayName() { /**
* @return The display name in the artifact for this domain category.
*/
String getAttrDisplayName() {
return attrDisplayName; return attrDisplayName;
} }
} }
/**
* A node in the trie indicating a domain suffix token. For instance, the
* csv entry: "hotmail.com,webmail" would get parsed to a node, "com" having
* a child of "hotmail". That child node, as a leaf, would have a webmail
* message type.
*/
private static class MessageDomainTrieNode { private static class MessageDomainTrieNode {
private final Map<String, MessageDomainTrieNode> children = new HashMap<>(); private final Map<String, MessageDomainTrieNode> children = new HashMap<>();
private MessageType messageType = null; private MessageType messageType = null;
/**
* Retrieves the child node of the given key. If that child key does not
* exist, a child node of that key is created and returned.
*
* @param childKey The key for the child (i.e. "com").
* @return The retrieved or newly created child node.
*/
MessageDomainTrieNode getOrAddChild(String childKey) { MessageDomainTrieNode getOrAddChild(String childKey) {
MessageDomainTrieNode child = children.get(childKey); MessageDomainTrieNode child = children.get(childKey);
if (child == null) { if (child == null) {
@ -99,19 +135,40 @@ class MessageURLAnalyzer extends Extract {
return child; return child;
} }
/**
* Retrieves the child node of the given key or returns null if child
* does not exist.
*
* @param childKey The key for the child node (i.e. "com").
* @return The child node or null if it does not exist.
*/
MessageDomainTrieNode getChild(String childKey) { MessageDomainTrieNode getChild(String childKey) {
return children.get(childKey); return children.get(childKey);
} }
/**
* @return If this is a leaf node, the type of message for this node.
*/
MessageType getMessageType() { MessageType getMessageType() {
return messageType; return messageType;
} }
/**
* If this is a leaf node, this sets the message type for this node.
*
* @param messageType The message type for this leaf node.
*/
void setMessageType(MessageType messageType) { void setMessageType(MessageType messageType) {
this.messageType = messageType; this.messageType = messageType;
} }
} }
/**
* Loads the trie of suffixes from the csv resource file.
*
* @return The root trie node.
* @throws IOException
*/
private static MessageDomainTrieNode loadTrie() throws IOException { private static MessageDomainTrieNode loadTrie() throws IOException {
try (InputStream is = MessageURLAnalyzer.class.getResourceAsStream(MESSAGE_TYPE_CSV); try (InputStream is = MessageURLAnalyzer.class.getResourceAsStream(MESSAGE_TYPE_CSV);
InputStreamReader isReader = new InputStreamReader(is, StandardCharsets.UTF_8); InputStreamReader isReader = new InputStreamReader(is, StandardCharsets.UTF_8);
@ -131,13 +188,27 @@ class MessageURLAnalyzer extends Extract {
} }
} }
/**
* Adds a trie node based on the csv line.
*
* @param trie The root trie node.
* @param line The line to be parsed.
* @param lineNumber The line number of this csv line.
*/
private static void addItem(MessageDomainTrieNode trie, String line, int lineNumber) { private static void addItem(MessageDomainTrieNode trie, String line, int lineNumber) {
// make sure this isn't a blank line.
if (StringUtils.isBlank(line)) {
return;
}
String[] csvItems = line.split(CSV_DELIMITER); String[] csvItems = line.split(CSV_DELIMITER);
// line should be a key value pair
if (csvItems.length < 2) { if (csvItems.length < 2) {
logger.log(Level.WARNING, String.format("Unable to properly parse line of \"%s\" at line %d", line, lineNumber)); logger.log(Level.WARNING, String.format("Unable to properly parse line of \"%s\" at line %d", line, lineNumber));
return; return;
} }
// determine the message type from the value, and return if can't be determined.
String messageTypeStr = csvItems[1].trim(); String messageTypeStr = csvItems[1].trim();
MessageType messageType = (StringUtils.isNotBlank(messageTypeStr)) MessageType messageType = (StringUtils.isNotBlank(messageTypeStr))
@ -152,6 +223,7 @@ class MessageURLAnalyzer extends Extract {
return; return;
} }
// gather the domainSuffix and parse into domain trie tokens
String domainSuffix = csvItems[0]; String domainSuffix = csvItems[0];
if (StringUtils.isBlank(domainSuffix)) { if (StringUtils.isBlank(domainSuffix)) {
logger.log(Level.WARNING, String.format("Could not determine domain suffix for this line: \"%s\" at line %d", line, lineNumber)); logger.log(Level.WARNING, String.format("Could not determine domain suffix for this line: \"%s\" at line %d", line, lineNumber));
@ -160,6 +232,7 @@ class MessageURLAnalyzer extends Extract {
String[] domainTokens = domainSuffix.trim().toLowerCase().split(DELIMITER); String[] domainTokens = domainSuffix.trim().toLowerCase().split(DELIMITER);
// add into the trie
MessageDomainTrieNode node = trie; MessageDomainTrieNode node = trie;
for (int i = domainTokens.length - 1; i >= 0; i--) { for (int i = domainTokens.length - 1; i >= 0; i--) {
String token = domainTokens[i]; String token = domainTokens[i];
@ -174,23 +247,6 @@ class MessageURLAnalyzer extends Extract {
} }
// Shared root node of the message domain trie. Starts as null and is only
// assigned inside getTrie() when loadTrie() succeeds.
private static MessageDomainTrieNode trieSingleton = null;
// Monitor object guarding the null-check and load of trieSingleton.
private static final Object trieLock = new Object();
/**
 * Returns the shared message domain trie, loading it with loadTrie() the
 * first time it is requested. If loading throws an IOException, the error
 * is logged at SEVERE level and the singleton is left null, so this method
 * returns null and a later call will attempt the load again.
 *
 * @return The root trie node, or null if the trie could not be loaded.
 */
private static MessageDomainTrieNode getTrie() {
// the check and the load happen under the lock so they are not interleaved
// between threads.
synchronized (trieLock) {
if (trieSingleton == null) {
try {
trieSingleton = loadTrie();
} catch (IOException ex) {
// failure is logged rather than rethrown; callers must handle a null trie.
logger.log(Level.SEVERE, "Unable to load message domain csv", ex);
}
}
}
// note: this read of the field happens after the synchronized block has exited.
return trieSingleton;
}
// Character for joining domain segments. // Character for joining domain segments.
private static final String JOINER = "."; private static final String JOINER = ".";
// delimiter when used with regex for domains // delimiter when used with regex for domains
@ -218,19 +274,30 @@ class MessageURLAnalyzer extends Extract {
private static final Logger logger = Logger.getLogger(MessageURLAnalyzer.class.getName()); private static final Logger logger = Logger.getLogger(MessageURLAnalyzer.class.getName());
private final MessageDomainTrieNode rootTrie; // the root node for the trie containing suffixes for domain categories.
private MessageDomainTrieNode rootTrie = null;
private Content dataSource; private Content dataSource;
private IngestJobContext context; private IngestJobContext context;
/**
* Main constructor.
*/
MessageURLAnalyzer() { MessageURLAnalyzer() {
moduleName = null; moduleName = null;
rootTrie = getTrie();
} }
/**
* Attempts to determine the host from the url string. If none can be
* determined, returns null.
*
* @param urlString The url string.
* @return The host or null if cannot be determined.
*/
private String getHost(String urlString) { private String getHost(String urlString) {
String host = null; String host = null;
try { try {
// try first using the built-in url class to determine the host.
URL url = new URL(urlString); URL url = new URL(urlString);
if (url != null) { if (url != null) {
host = url.getHost(); host = url.getHost();
@ -239,6 +306,7 @@ class MessageURLAnalyzer extends Extract {
// ignore this and go to fallback regex // ignore this and go to fallback regex
} }
// if the built-in url parsing doesn't work, then use more flexible regex.
if (StringUtils.isBlank(host)) { if (StringUtils.isBlank(host)) {
Matcher m = URL_REGEX.matcher(urlString); Matcher m = URL_REGEX.matcher(urlString);
if (m.find()) { if (m.find()) {
@ -249,23 +317,37 @@ class MessageURLAnalyzer extends Extract {
return host; return host;
} }
/**
* Determines if the host is a message type domain. If so, returns the
* portion of the host suffix that signifies the message domain (i.e.
* "hotmail.com" or "mail.google.com") and the message type.
*
* @param host The host.
* @return A pair of the host suffix and message type for that suffix if
* found. Otherwise, returns null.
*/
private Pair<String, MessageType> findHostSuffix(String host) { private Pair<String, MessageType> findHostSuffix(String host) {
// if there is no host, return null.
if (StringUtils.isBlank(host)) { if (StringUtils.isBlank(host)) {
return null; return null;
} }
// parse the tokens splitting on delimiter
List<String> tokens = Stream.of(host.toLowerCase().split(DELIMITER)) List<String> tokens = Stream.of(host.toLowerCase().split(DELIMITER))
.filter(StringUtils::isNotBlank) .filter(StringUtils::isNotBlank)
.collect(Collectors.toList()); .collect(Collectors.toList());
MessageDomainTrieNode node = rootTrie; MessageDomainTrieNode node = rootTrie;
// the root node is null indicating we won't be able to do a lookup.
if (node == null) { if (node == null) {
return null; return null;
} }
// iterate through tokens in reverse order
int idx = tokens.size() - 1; int idx = tokens.size() - 1;
for (; idx >= 0; idx--) { for (; idx >= 0; idx--) {
node = node.getChild(tokens.get(idx)); node = node.getChild(tokens.get(idx));
// if we hit a leaf node or we have no matching child node, stop searching.
if (node == null || node.getMessageType() != null) { if (node == null || node.getMessageType() != null) {
break; break;
} }
@ -276,7 +358,8 @@ class MessageURLAnalyzer extends Extract {
if (messageType == null) { if (messageType == null) {
return null; return null;
} else { } else {
// the index to be included should be one higher than last index (that was // if there is a message type, we have a result. Concatenate the
// appropriate domain tokens and return.
int minIndex = Math.max(0, idx); int minIndex = Math.max(0, idx);
List<String> subList = tokens.subList(minIndex, tokens.size()); List<String> subList = tokens.subList(minIndex, tokens.size());
String hostSuffix = String.join(JOINER, subList); String hostSuffix = String.join(JOINER, subList);
@ -284,6 +367,11 @@ class MessageURLAnalyzer extends Extract {
} }
} }
/**
* Goes through web history artifacts and attempts to determine any hosts of
* a message type. If any are found, a TSK_DOMAIN_CATEGORY artifact is
* created (at most one per host suffix).
*/
private void findMessageDomains() { private void findMessageDomains() {
if (this.rootTrie == null) { if (this.rootTrie == null) {
logger.log(Level.SEVERE, "Not analyzing message domain. No root trie loaded."); logger.log(Level.SEVERE, "Not analyzing message domain. No root trie loaded.");
@ -292,44 +380,54 @@ class MessageURLAnalyzer extends Extract {
int artifactsAnalyzed = 0; int artifactsAnalyzed = 0;
int messageDomainInstancesFound = 0; int messageDomainInstancesFound = 0;
// at most one artifact per host suffix is created per ingest, so this tracks the suffixes already seen.
Set<String> domainSuffixesSeen = new HashSet<>(); Set<String> domainSuffixesSeen = new HashSet<>();
try { try {
//from blackboard_artifacts
Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts( Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
Arrays.asList(new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_HISTORY)), Arrays.asList(new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_HISTORY)),
Arrays.asList(dataSource.getId())); Arrays.asList(dataSource.getId()));
logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
for (BlackboardArtifact artifact : listArtifacts) { for (BlackboardArtifact artifact : listArtifacts) {
// make sure we haven't cancelled
if (context.dataSourceIngestIsCancelled()) { if (context.dataSourceIngestIsCancelled()) {
break; //User cancelled the process. break; //User cancelled the process.
} }
// make sure there is an attached file
AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID()); AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
if (file == null) { if (file == null) {
continue; continue;
} }
// get the url string from the artifact
BlackboardAttribute urlAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL)); BlackboardAttribute urlAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
if (urlAttr == null) { if (urlAttr == null) {
continue; continue;
} }
String urlString = urlAttr.getValueString(); String urlString = urlAttr.getValueString();
// attempt to get the host from the url provided.
String host = getHost(urlString); String host = getHost(urlString);
if (StringUtils.isBlank(host)) { if (StringUtils.isBlank(host)) {
continue; continue;
} }
// if we reached this point, we are at least analyzing this item
artifactsAnalyzed++; artifactsAnalyzed++;
// attempt to get the message type for the host using the suffix trie
Pair<String, MessageType> messageEntryFound = findHostSuffix(host); Pair<String, MessageType> messageEntryFound = findHostSuffix(host);
if (messageEntryFound == null) { if (messageEntryFound == null) {
continue; continue;
} }
// if we got this far, we found a message domain, but it may not be unique
messageDomainInstancesFound++; messageDomainInstancesFound++;
String hostSuffix = messageEntryFound.getLeft(); String hostSuffix = messageEntryFound.getLeft();
@ -338,6 +436,8 @@ class MessageURLAnalyzer extends Extract {
continue; continue;
} }
// if we got this far, this is a unique suffix. Add it to the set so that we don't
// create multiple artifacts for the same suffix, and then add an artifact.
domainSuffixesSeen.add(hostSuffix); domainSuffixesSeen.add(hostSuffix);
String moduleName = Bundle.MessageURLAnalyzer_parentModuleName(); String moduleName = Bundle.MessageURLAnalyzer_parentModuleName();
@ -370,6 +470,15 @@ class MessageURLAnalyzer extends Extract {
this.findMessageDomains(); this.findMessageDomains();
} }
/**
 * Loads the message domain suffix trie and stores its root node in this
 * extractor for use during analysis.
 *
 * @throws IngestModule.IngestModuleException If loading the trie fails with
 *                                            an IOException; the original
 *                                            exception is kept as the cause.
 */
@Override
void configExtractor() throws IngestModule.IngestModuleException {
    try {
        rootTrie = loadTrie();
    } catch (IOException ioException) {
        // surface the failure to the ingest framework instead of continuing without a trie.
        final String failureMessage = "Unable to load message type csv for domain category analysis";
        throw new IngestModule.IngestModuleException(failureMessage, ioException);
    }
}
@Override @Override
public void complete() { public void complete() {
logger.info("Search Engine URL Query Analyzer has completed."); //NON-NLS logger.info("Search Engine URL Query Analyzer has completed."); //NON-NLS