diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/MessageURLAnalyzer.java b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/MessageURLAnalyzer.java index 94e678aaa1..e30a468c1d 100644 --- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/MessageURLAnalyzer.java +++ b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/MessageURLAnalyzer.java @@ -39,11 +39,13 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang3.tuple.Pair; +import org.openide.util.Exceptions; import org.openide.util.NbBundle.Messages; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.NetworkUtils; import org.sleuthkit.autopsy.ingest.DataSourceIngestModuleProgress; import org.sleuthkit.autopsy.ingest.IngestJobContext; +import org.sleuthkit.autopsy.ingest.IngestModule; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE; @@ -52,6 +54,11 @@ import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE; import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.TskCoreException; +/** + * Analyzes a URL to determine if the url host is one that handles messages + * (i.e. webmail, disposable mail). If found, a domain category type artifact is + * created. + */ @Messages({ "MessageURLAnalyzer_moduleName_text=MessageURLAnalyzer", "MessageURLAnalyzer_Progress_Message_Find_Message_URLs=Finding Messaging Domains", @@ -59,6 +66,9 @@ import org.sleuthkit.datamodel.TskCoreException; }) class MessageURLAnalyzer extends Extract { + /** + * The message service type (i.e. webmail, disposable mail). + */ @Messages({ "MessageType_disposableMail_displayName=Disposable Email", "MessageType_webmail_displayName=Web Email" @@ -70,25 +80,51 @@ class MessageURLAnalyzer extends Extract { private final String csvId; private final String attrDisplayName; + /** + * Main constructor. + * + * @param csvId The identifier within the csv for this type. + * @param attrDisplayName The display name in the artifact for this + * domain category. + */ private MessageType(String csvId, String attrDisplayName) { this.csvId = csvId; this.attrDisplayName = attrDisplayName; } - public String getCsvId() { + /** + * @return The identifier within the csv for this type. + */ + String getCsvId() { return csvId; } - public String getAttrDisplayName() { + /** + * @return The display name in the artifact for this domain category. + */ + String getAttrDisplayName() { return attrDisplayName; } } + /** + * A node in the trie indicating a domain suffix token. For instance, the + * csv entry: "hotmail.com,webmail" would get parsed to a node, "com" having + * a child of "hotmail". That child node, as a leaf, would have a webmail + * message type. + */ private static class MessageDomainTrieNode { private final Map children = new HashMap<>(); private MessageType messageType = null; + /** + * Retrieves the child node of the given key. If that child key does not + * exist, a child node of that key is created and returned. + * + * @param childKey The key for the child (i.e. "com"). + * @return The retrieved or newly created child node. + */ MessageDomainTrieNode getOrAddChild(String childKey) { MessageDomainTrieNode child = children.get(childKey); if (child == null) { @@ -99,19 +135,40 @@ class MessageURLAnalyzer extends Extract { return child; } + /** + * Retrieves the child node of the given key or returns null if child + * does not exist. + * + * @param childKey The key for the child node (i.e. "com"). + * @return The child node or null if it does not exist. + */ MessageDomainTrieNode getChild(String childKey) { return children.get(childKey); } + /** + * @return If this is a leaf node, the type of message for this node. + */ MessageType getMessageType() { return messageType; } + /** + * If this is a leaf node, this sets the message type for this node. + * + * @param messageType The message type for this leaf node. + */ void setMessageType(MessageType messageType) { this.messageType = messageType; } } + /** + * Loads the trie of suffixes from the csv resource file. + * + * @return The root trie node. + * @throws IOException + */ private static MessageDomainTrieNode loadTrie() throws IOException { try (InputStream is = MessageURLAnalyzer.class.getResourceAsStream(MESSAGE_TYPE_CSV); InputStreamReader isReader = new InputStreamReader(is, StandardCharsets.UTF_8); @@ -131,13 +188,27 @@ class MessageURLAnalyzer extends Extract { } } + /** + * Adds a trie node based on the csv line. + * + * @param trie The root trie node. + * @param line The line to be parsed. + * @param lineNumber The line number of this csv line. + */ private static void addItem(MessageDomainTrieNode trie, String line, int lineNumber) { + // make sure this isn't a blank line. + if (StringUtils.isBlank(line)) { + return; + } + String[] csvItems = line.split(CSV_DELIMITER); + // line should be a key value pair if (csvItems.length < 2) { logger.log(Level.WARNING, String.format("Unable to properly parse line of \"%s\" at line %d", line, lineNumber)); return; } + // determine the message type from the value, and return if can't be determined. String messageTypeStr = csvItems[1].trim(); MessageType messageType = (StringUtils.isNotBlank(messageTypeStr)) @@ -152,6 +223,7 @@ class MessageURLAnalyzer extends Extract { return; } + // gather the domainSuffix and parse into domain trie tokens String domainSuffix = csvItems[0]; if (StringUtils.isBlank(domainSuffix)) { logger.log(Level.WARNING, String.format("Could not determine domain suffix for this line: \"%s\" at line %d", line, lineNumber)); @@ -160,6 +232,7 @@ class MessageURLAnalyzer extends Extract { String[] domainTokens = domainSuffix.trim().toLowerCase().split(DELIMITER); + // add into the trie MessageDomainTrieNode node = trie; for (int i = domainTokens.length - 1; i >= 0; i--) { String token = domainTokens[i]; @@ -174,23 +247,6 @@ class MessageURLAnalyzer extends Extract { } - private static MessageDomainTrieNode trieSingleton = null; - private static final Object trieLock = new Object(); - - private static MessageDomainTrieNode getTrie() { - synchronized (trieLock) { - if (trieSingleton == null) { - try { - trieSingleton = loadTrie(); - } catch (IOException ex) { - logger.log(Level.SEVERE, "Unable to load message domain csv", ex); - } - } - } - - return trieSingleton; - } - // Character for joining domain segments. private static final String JOINER = "."; // delimiter when used with regex for domains @@ -218,19 +274,30 @@ class MessageURLAnalyzer extends Extract { private static final Logger logger = Logger.getLogger(MessageURLAnalyzer.class.getName()); - private final MessageDomainTrieNode rootTrie; + // the root node for the trie containing suffixes for domain categories. + private MessageDomainTrieNode rootTrie = null; private Content dataSource; private IngestJobContext context; + /** + * Main constructor. + */ MessageURLAnalyzer() { moduleName = null; - rootTrie = getTrie(); } + /** + * Attempts to determine the host from the url string. If none can be + * determined, returns null. + * + * @param urlString The url string. + * @return The host or null if cannot be determined. + */ private String getHost(String urlString) { String host = null; try { + // try first using the built-in url class to determine the host. URL url = new URL(urlString); if (url != null) { host = url.getHost(); @@ -239,6 +306,7 @@ class MessageURLAnalyzer extends Extract { // ignore this and go to fallback regex } + // if the built-in url parsing doesn't work, then use more flexible regex. if (StringUtils.isBlank(host)) { Matcher m = URL_REGEX.matcher(urlString); if (m.find()) { @@ -249,23 +317,37 @@ class MessageURLAnalyzer extends Extract { return host; } + /** + * Determines if the host is a message type domain. If so, returns the + * portion of the host suffix that signifies the message domain (i.e. + * "hotmail.com" or "mail.google.com") and the message type. + * + * @param host The host. + * @return A pair of the host suffix and message type for that suffix if + * found. Otherwise, returns null. + */ private Pair findHostSuffix(String host) { + // if no host, return none. if (StringUtils.isBlank(host)) { return null; } + // parse the tokens splitting on delimiter List tokens = Stream.of(host.toLowerCase().split(DELIMITER)) .filter(StringUtils::isNotBlank) .collect(Collectors.toList()); MessageDomainTrieNode node = rootTrie; + // the root node is null indicating we won't be able to do a lookup. if (node == null) { return null; } + // iterate through tokens in reverse order int idx = tokens.size() - 1; for (; idx >= 0; idx--) { node = node.getChild(tokens.get(idx)); + // if we hit a leaf node or we have no matching child node, continue. if (node == null || node.getMessageType() != null) { break; } @@ -276,7 +358,8 @@ class MessageURLAnalyzer extends Extract { if (messageType == null) { return null; } else { - // the index to be included should be one higher than last index (that was + // if there is a message type, we have a result. Concatenate the + // appropriate domain tokens and return. int minIndex = Math.max(0, idx); List subList = tokens.subList(minIndex, tokens.size()); String hostSuffix = String.join(JOINER, subList); @@ -284,6 +367,11 @@ class MessageURLAnalyzer extends Extract { } } + /** + * Goes through web history artifacts and attempts to determine any hosts of + * a message type. If any are found, a TSK_DOMAIN_CATEGORY artifact is + * created (at most one per host suffix). + */ private void findMessageDomains() { if (this.rootTrie == null) { logger.log(Level.SEVERE, "Not analyzing message domain. No root trie loaded."); @@ -292,44 +380,54 @@ class MessageURLAnalyzer extends Extract { int artifactsAnalyzed = 0; int messageDomainInstancesFound = 0; + + // only one suffix per ingest is captured so this tracks the suffixes seen. Set domainSuffixesSeen = new HashSet<>(); try { - //from blackboard_artifacts Collection listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts( Arrays.asList(new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_HISTORY)), Arrays.asList(dataSource.getId())); logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS + for (BlackboardArtifact artifact : listArtifacts) { + // make sure we haven't cancelled if (context.dataSourceIngestIsCancelled()) { break; //User cancelled the process. } + // make sure there is attached file AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID()); if (file == null) { continue; } + // get the url string from the artifact BlackboardAttribute urlAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL)); if (urlAttr == null) { continue; } String urlString = urlAttr.getValueString(); + + // atempt to get the host from the url provided. String host = getHost(urlString); if (StringUtils.isBlank(host)) { continue; } + // if we reached this point, we are at least analyzing this item artifactsAnalyzed++; + // attempt to get the message type for the host using the suffix trie Pair messageEntryFound = findHostSuffix(host); if (messageEntryFound == null) { continue; } + // if we got this far, we found a message domain, but it may not be unique messageDomainInstancesFound++; String hostSuffix = messageEntryFound.getLeft(); @@ -338,6 +436,8 @@ class MessageURLAnalyzer extends Extract { continue; } + // if we got this far, this is a unique suffix. Add to the set, so we don't create + // multiple of same suffix and add an artifact. domainSuffixesSeen.add(hostSuffix); String moduleName = Bundle.MessageURLAnalyzer_parentModuleName(); @@ -370,6 +470,15 @@ class MessageURLAnalyzer extends Extract { this.findMessageDomains(); } + @Override + void configExtractor() throws IngestModule.IngestModuleException { + try { + this.rootTrie = loadTrie(); + } catch (IOException ex) { + throw new IngestModule.IngestModuleException("Unable to load message type csv for domain category analysis", ex); + } + } + @Override public void complete() { logger.info("Search Engine URL Query Analyzer has completed."); //NON-NLS