commenting

2025-07-13 08:26:15 +00:00 · 2020-12-03 12:37:44 -05:00 · 2020-12-03 12:37:44 -05:00 · 03a23f636b
commit 03a23f636b
parent b7baa5e2c8
1 changed files with 132 additions and 23 deletions
--- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/MessageURLAnalyzer.java
+++ b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/MessageURLAnalyzer.java
@ -39,11 +39,13 @@ import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.lang3.tuple.Pair;
+import org.openide.util.Exceptions;
 import org.openide.util.NbBundle.Messages;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.NetworkUtils;
 import org.sleuthkit.autopsy.ingest.DataSourceIngestModuleProgress;
 import org.sleuthkit.autopsy.ingest.IngestJobContext;
+import org.sleuthkit.autopsy.ingest.IngestModule;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
@ -52,6 +54,11 @@ import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.TskCoreException;

+/**
+ * Analyzes a URL to determine if the url host is one that handles messages
+ * (i.e. webmail, disposable mail). If found, a domain category type artifact is
+ * created.
+ */
@Messages({
    "MessageURLAnalyzer_moduleName_text=MessageURLAnalyzer",
    "MessageURLAnalyzer_Progress_Message_Find_Message_URLs=Finding Messaging Domains",
@ -59,6 +66,9 @@ import org.sleuthkit.datamodel.TskCoreException;
 })
 class MessageURLAnalyzer extends Extract {

+    /**
+     * The message service type (i.e. webmail, disposable mail).
+     */
    @Messages({
        "MessageType_disposableMail_displayName=Disposable Email",
        "MessageType_webmail_displayName=Web Email"
@ -70,25 +80,51 @@ class MessageURLAnalyzer extends Extract {
        private final String csvId;
        private final String attrDisplayName;

+        /**
+         * Main constructor.
+         *
+         * @param csvId The identifier within the csv for this type.
+         * @param attrDisplayName The display name in the artifact for this
+         * domain category.
+         */
        private MessageType(String csvId, String attrDisplayName) {
            this.csvId = csvId;
            this.attrDisplayName = attrDisplayName;
        }

-        public String getCsvId() {
+        /**
+         * @return The identifier within the csv for this type.
+         */
+        String getCsvId() {
            return csvId;
        }

-        public String getAttrDisplayName() {
+        /**
+         * @return The display name in the artifact for this domain category.
+         */
+        String getAttrDisplayName() {
            return attrDisplayName;
        }
    }

+    /**
+     * A node in the trie indicating a domain suffix token. For instance, the
+     * csv entry: "hotmail.com,webmail" would get parsed to a node, "com" having
+     * a child of "hotmail". That child node, as a leaf, would have a webmail
+     * message type.
+     */
    private static class MessageDomainTrieNode {

        private final Map<String, MessageDomainTrieNode> children = new HashMap<>();
        private MessageType messageType = null;

+        /**
+         * Retrieves the child node of the given key. If that child key does not
+         * exist, a child node of that key is created and returned.
+         *
+         * @param childKey The key for the child (i.e. "com").
+         * @return The retrieved or newly created child node.
+         */
        MessageDomainTrieNode getOrAddChild(String childKey) {
            MessageDomainTrieNode child = children.get(childKey);
            if (child == null) {
@ -99,19 +135,40 @@ class MessageURLAnalyzer extends Extract {
            return child;
        }

+        /**
+         * Retrieves the child node of the given key or returns null if child
+         * does not exist.
+         *
+         * @param childKey The key for the child node (i.e. "com").
+         * @return The child node or null if it does not exist.
+         */
        MessageDomainTrieNode getChild(String childKey) {
            return children.get(childKey);
        }

+        /**
+         * @return If this is a leaf node, the type of message for this node.
+         */
        MessageType getMessageType() {
            return messageType;
        }

+        /**
+         * If this is a leaf node, this sets the message type for this node.
+         *
+         * @param messageType The message type for this leaf node.
+         */
        void setMessageType(MessageType messageType) {
            this.messageType = messageType;
        }
    }

+    /**
+     * Loads the trie of suffixes from the csv resource file.
+     *
+     * @return The root trie node.
+     * @throws IOException
+     */
    private static MessageDomainTrieNode loadTrie() throws IOException {
        try (InputStream is = MessageURLAnalyzer.class.getResourceAsStream(MESSAGE_TYPE_CSV);
                InputStreamReader isReader = new InputStreamReader(is, StandardCharsets.UTF_8);
@ -131,13 +188,27 @@ class MessageURLAnalyzer extends Extract {
        }
    }

+    /**
+     * Adds a trie node based on the csv line.
+     *
+     * @param trie The root trie node.
+     * @param line The line to be parsed.
+     * @param lineNumber The line number of this csv line.
+     */
    private static void addItem(MessageDomainTrieNode trie, String line, int lineNumber) {
+        // make sure this isn't a blank line.
+        if (StringUtils.isBlank(line)) {
+            return;
+        }
+
        String[] csvItems = line.split(CSV_DELIMITER);
+        // line should be a key value pair
        if (csvItems.length < 2) {
            logger.log(Level.WARNING, String.format("Unable to properly parse line of \"%s\" at line %d", line, lineNumber));
            return;
        }

+        // determine the message type from the value, and return if can't be determined.
        String messageTypeStr = csvItems[1].trim();

        MessageType messageType = (StringUtils.isNotBlank(messageTypeStr))
@ -152,6 +223,7 @@ class MessageURLAnalyzer extends Extract {
            return;
        }

+        // gather the domainSuffix and parse into domain trie tokens
        String domainSuffix = csvItems[0];
        if (StringUtils.isBlank(domainSuffix)) {
            logger.log(Level.WARNING, String.format("Could not determine domain suffix for this line: \"%s\" at line %d", line, lineNumber));
@ -160,6 +232,7 @@ class MessageURLAnalyzer extends Extract {

        String[] domainTokens = domainSuffix.trim().toLowerCase().split(DELIMITER);

+        // add into the trie
        MessageDomainTrieNode node = trie;
        for (int i = domainTokens.length - 1; i >= 0; i--) {
            String token = domainTokens[i];
@ -174,23 +247,6 @@ class MessageURLAnalyzer extends Extract {

    }

-    private static MessageDomainTrieNode trieSingleton = null;
-    private static final Object trieLock = new Object();
-
-    private static MessageDomainTrieNode getTrie() {
-        synchronized (trieLock) {
-            if (trieSingleton == null) {
-                try {
-                    trieSingleton = loadTrie();
-                } catch (IOException ex) {
-                    logger.log(Level.SEVERE, "Unable to load message domain csv", ex);
-                }
-            }
-        }
-
-        return trieSingleton;
-    }
-
    // Character for joining domain segments.
    private static final String JOINER = ".";
    // delimiter when used with regex for domains
@ -218,19 +274,30 @@ class MessageURLAnalyzer extends Extract {

    private static final Logger logger = Logger.getLogger(MessageURLAnalyzer.class.getName());

-    private final MessageDomainTrieNode rootTrie;
+    // the root node for the trie containing suffixes for domain categories.
+    private MessageDomainTrieNode rootTrie = null;

    private Content dataSource;
    private IngestJobContext context;

+    /**
+     * Main constructor.
+     */
    MessageURLAnalyzer() {
        moduleName = null;
-        rootTrie = getTrie();
    }

+    /**
+     * Attempts to determine the host from the url string. If none can be
+     * determined, returns null.
+     *
+     * @param urlString The url string.
+     * @return The host or null if cannot be determined.
+     */
    private String getHost(String urlString) {
        String host = null;
        try {
+            // try first using the built-in url class to determine the host.
            URL url = new URL(urlString);
            if (url != null) {
                host = url.getHost();
@ -239,6 +306,7 @@ class MessageURLAnalyzer extends Extract {
            // ignore this and go to fallback regex
        }

+        // if the built-in url parsing doesn't work, then use more flexible regex.
        if (StringUtils.isBlank(host)) {
            Matcher m = URL_REGEX.matcher(urlString);
            if (m.find()) {
@ -249,23 +317,37 @@ class MessageURLAnalyzer extends Extract {
        return host;
    }

+    /**
+     * Determines if the host is a message type domain. If so, returns the
+     * portion of the host suffix that signifies the message domain (i.e.
+     * "hotmail.com" or "mail.google.com") and the message type.
+     *
+     * @param host The host.
+     * @return A pair of the host suffix and message type for that suffix if
+     * found. Otherwise, returns null.
+     */
    private Pair<String, MessageType> findHostSuffix(String host) {
+        // if no host, return none.
        if (StringUtils.isBlank(host)) {
            return null;
        }

+        // parse the tokens splitting on delimiter
        List<String> tokens = Stream.of(host.toLowerCase().split(DELIMITER))
                .filter(StringUtils::isNotBlank)
                .collect(Collectors.toList());

        MessageDomainTrieNode node = rootTrie;
+        // the root node is null indicating we won't be able to do a lookup.
        if (node == null) {
            return null;
        }

+        // iterate through tokens in reverse order
        int idx = tokens.size() - 1;
        for (; idx >= 0; idx--) {
            node = node.getChild(tokens.get(idx));
+            // if we hit a leaf node or we have no matching child node, continue.
            if (node == null || node.getMessageType() != null) {
                break;
            }
@ -276,7 +358,8 @@ class MessageURLAnalyzer extends Extract {
        if (messageType == null) {
            return null;
        } else {
-            // the index to be included should be one higher than last index (that was
+            // if there is a message type, we have a result.  Concatenate the 
+            // appropriate domain tokens and return.
            int minIndex = Math.max(0, idx);
            List<String> subList = tokens.subList(minIndex, tokens.size());
            String hostSuffix = String.join(JOINER, subList);
@ -284,6 +367,11 @@ class MessageURLAnalyzer extends Extract {
        }
    }

+    /**
+     * Goes through web history artifacts and attempts to determine any hosts of
+     * a message type. If any are found, a TSK_DOMAIN_CATEGORY artifact is
+     * created (at most one per host suffix).
+     */
    private void findMessageDomains() {
        if (this.rootTrie == null) {
            logger.log(Level.SEVERE, "Not analyzing message domain.  No root trie loaded.");
@ -292,44 +380,54 @@ class MessageURLAnalyzer extends Extract {

        int artifactsAnalyzed = 0;
        int messageDomainInstancesFound = 0;
+        
+        // only one suffix per ingest is captured so this tracks the suffixes seen.
        Set<String> domainSuffixesSeen = new HashSet<>();

        try {
-            //from blackboard_artifacts
            Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
                    Arrays.asList(new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_HISTORY)),
                    Arrays.asList(dataSource.getId()));

            logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS

+            
            for (BlackboardArtifact artifact : listArtifacts) {
+                // make sure we haven't cancelled
                if (context.dataSourceIngestIsCancelled()) {
                    break;       //User cancelled the process.
                }

+                // make sure there is attached file
                AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
                if (file == null) {
                    continue;
                }

+                // get the url string from the artifact
                BlackboardAttribute urlAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
                if (urlAttr == null) {
                    continue;
                }

                String urlString = urlAttr.getValueString();
+                
+                // atempt to get the host from the url provided.
                String host = getHost(urlString);
                if (StringUtils.isBlank(host)) {
                    continue;
                }

+                // if we reached this point, we are at least analyzing this item
                artifactsAnalyzed++;

+                // attempt to get the message type for the host using the suffix trie
                Pair<String, MessageType> messageEntryFound = findHostSuffix(host);
                if (messageEntryFound == null) {
                    continue;
                }

+                // if we got this far, we found a message domain, but it may not be unique
                messageDomainInstancesFound++;

                String hostSuffix = messageEntryFound.getLeft();
@ -338,6 +436,8 @@ class MessageURLAnalyzer extends Extract {
                    continue;
                }

+                // if we got this far, this is a unique suffix.  Add to the set, so we don't create
+                // multiple of same suffix and add an artifact.
                domainSuffixesSeen.add(hostSuffix);

                String moduleName = Bundle.MessageURLAnalyzer_parentModuleName();
@ -370,6 +470,15 @@ class MessageURLAnalyzer extends Extract {
        this.findMessageDomains();
    }

+    @Override
+    void configExtractor() throws IngestModule.IngestModuleException {
+        try {
+            this.rootTrie = loadTrie();
+        } catch (IOException ex) {
+            throw new IngestModule.IngestModuleException("Unable to load message type csv for domain category analysis", ex);
+        }
+    }
+
    @Override
    public void complete() {
        logger.info("Search Engine URL Query Analyzer has completed."); //NON-NLS