diff --git a/Core/nbproject/project.xml b/Core/nbproject/project.xml index 8d4d9b8c98..074d930725 100644 --- a/Core/nbproject/project.xml +++ b/Core/nbproject/project.xml @@ -336,6 +336,7 @@ org.sleuthkit.autopsy.textextractors.configs org.sleuthkit.autopsy.textsummarizer org.sleuthkit.autopsy.texttranslation + org.sleuthkit.autopsy.url.analytics org.sleuthkit.datamodel org.sleuthkit.datamodel.blackboardutils org.sleuthkit.datamodel.blackboardutils.attributes diff --git a/Core/src/org/sleuthkit/autopsy/url/analytics/DefaultDomainCategoryResult.java b/Core/src/org/sleuthkit/autopsy/url/analytics/DefaultDomainCategoryResult.java new file mode 100644 index 0000000000..e0792cd375 --- /dev/null +++ b/Core/src/org/sleuthkit/autopsy/url/analytics/DefaultDomainCategoryResult.java @@ -0,0 +1,72 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2020 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.sleuthkit.autopsy.url.analytics; + +import com.google.common.annotations.Beta; + +/** + * Default implementation of the DomainCategoryResult. + */ +@Beta +public class DefaultDomainCategoryResult implements DomainCategoryResult { + + private final String hostSuffix; + private final String category; + private final boolean morePrefixes; + + /** + * Default constructor assuming default for hasMorePrefixes of true. 
+ * @param hostSuffix The portion of the suffix from the host or domain that was a + * match (i.e. 'mail.google.com' or 'hotmail.com'). + * @param category The category (i.e. 'Web Email'). + */ + public DefaultDomainCategoryResult(String hostSuffix, String category) { + this(hostSuffix, category, true); + } + + /** + * Main constructor. + * @param hostSuffix The portion of the suffix from the host or domain that was a + * match (i.e. 'mail.google.com' or 'hotmail.com'). + * @param category The category (i.e. 'Web Email'). + * @param morePrefixes In the event that there would be different matches for additional + * prefixes, this can be true. + */ + public DefaultDomainCategoryResult(String hostSuffix, String category, boolean morePrefixes) { + this.hostSuffix = hostSuffix; + this.category = category; + this.morePrefixes = morePrefixes; + } + + @Override + public String getHostSuffix() { + return hostSuffix; + } + + @Override + public String getCategory() { + return category; + } + + @Override + public boolean hasMorePrefixes() { + return morePrefixes; + } + +} diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategoryProvider.java b/Core/src/org/sleuthkit/autopsy/url/analytics/DomainCategoryProvider.java similarity index 57% rename from RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategoryProvider.java rename to Core/src/org/sleuthkit/autopsy/url/analytics/DomainCategoryProvider.java index dc9c215bfc..072119ce4c 100644 --- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategoryProvider.java +++ b/Core/src/org/sleuthkit/autopsy/url/analytics/DomainCategoryProvider.java @@ -16,38 +16,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.sleuthkit.autopsy.recentactivity; +package org.sleuthkit.autopsy.url.analytics; +import com.google.common.annotations.Beta; import org.sleuthkit.autopsy.ingest.IngestModule; /** - * Interface providing the category of a domain for creating - * TSK_WEB_CATEGORIZATION artifacts. + * Interface providing the category of a domain for the purposes of creating + * TSK_WEB_CATEGORIZATION artifacts. These implementations are used in + * RecentActivity as a part of the ingest process. Implementers of this class + * should have a no-argument constructor in order to be properly instantiated. */ +@Beta public interface DomainCategoryProvider { - public static class DomainCategoryResult { - private final String hostSuffix; - private final String category; - private final boolean hasChildren; - - public DomainCategoryResult(String hostSuffix, String category, boolean hasChildren) { - this.hostSuffix = hostSuffix; - this.category = category; - this.hasChildren = hasChildren; - } - - public String getHostSuffix() { - return hostSuffix; - } - - public String getCategory() { - return category; - } - - public boolean hasChildren() { - return hasChildren; - } - } /** * Provides the DomainCategory for a given domain/host or null if none can @@ -59,6 +40,14 @@ public interface DomainCategoryProvider { * null if not. */ DomainCategoryResult getCategory(String domain, String host); - - void initialize() throws IngestModule.IngestModuleException; + + /** + * Initializes this provider in preparation to handle 'getCategory' requests + * during ingest. Conceivably, the same instance of this class may have this + * called multiple times and should handle that possibility gracefully. 
+ * + * @throws IngestModule.IngestModuleException + */ + default void initialize() throws IngestModule.IngestModuleException { + } } diff --git a/Core/src/org/sleuthkit/autopsy/url/analytics/DomainCategoryResult.java b/Core/src/org/sleuthkit/autopsy/url/analytics/DomainCategoryResult.java new file mode 100644 index 0000000000..c52780a587 --- /dev/null +++ b/Core/src/org/sleuthkit/autopsy/url/analytics/DomainCategoryResult.java @@ -0,0 +1,50 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2020 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.sleuthkit.autopsy.url.analytics; + +import com.google.common.annotations.Beta; + +/** + * The result of finding a match for the host or domain provided as an argument. + */ +@Beta +public interface DomainCategoryResult { + /** + * @return The portion of the suffix from the host or domain that was a + * match (i.e. 'mail.google.com' or 'hotmail.com'). + */ + String getHostSuffix(); + + /** + * @return The category (i.e. 'Web Email'). + */ + String getCategory(); + + /** + * @return In the event that there would be different matches for additional + * prefixes, this can return true. 
For instance, if there was an entry for + * 'mail.google.com' and 'chatenabled.mail.google.com', a search for + * 'mail.google.com' would return the host suffix: 'mail.google.com' and + * 'true' for hasMorePrefixes since an additional category could be added + * for the 'chatenabled' prefix. + */ + default boolean hasMorePrefixes() { + return true; + } +} diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainSuffixTrie.java b/Core/src/org/sleuthkit/autopsy/url/analytics/DomainSuffixTrie.java similarity index 73% rename from RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainSuffixTrie.java rename to Core/src/org/sleuthkit/autopsy/url/analytics/DomainSuffixTrie.java index 8beb5013c5..47633df46b 100644 --- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainSuffixTrie.java +++ b/Core/src/org/sleuthkit/autopsy/url/analytics/DomainSuffixTrie.java @@ -16,18 +16,20 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.sleuthkit.autopsy.recentactivity; +package org.sleuthkit.autopsy.url.analytics; +import com.google.common.annotations.Beta; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang.StringUtils; -import org.sleuthkit.autopsy.recentactivity.DomainCategoryProvider.DomainCategoryResult; -import org.sleuthkit.autopsy.recentactivity.Trie.TrieResult; +import org.sleuthkit.autopsy.url.analytics.Trie.TrieResult; +@Beta public class DomainSuffixTrie { + private static Iterable getSuffixIter(String host) { // parse the tokens splitting on delimiter List tokens = Stream.of(host.toLowerCase().split(DELIMITER)) @@ -44,24 +46,32 @@ public class DomainSuffixTrie { // delimiter when used with regex for domains private static final String DELIMITER = "\\" + JOINER; - - private final Trie trie = new Trie<>(); - void add(String suffix, String leaf) { + /** + * Adds a host suffix and the value to associate with it to the trie. + * @param suffix The host suffix (i.e. 'mail.google.com' or 'hotmail.com'). + * @param leaf The value (i.e. the domain type) to store for that suffix. + */ + public void add(String suffix, String leaf) { this.trie.add(getSuffixIter(suffix), leaf); } /** * Determines if the host is a known type of host. If so, returns the * portion of the host suffix that signifies the domain type (i.e. - * "hotmail.com" or "mail.google.com") and the domain type. + * "hotmail.com" or "mail.google.com") and the domain type. Also returned in + * the DomainCategoryResult is whether or not the found node in the trie + * has any children and, consequently, whether or not additional prefixes could yield different matches. * * @param host The host. - * @return A pair of the host suffix and domain type for that suffix if + * @return The DomainCategoryResult if a portion of the suffix was found. + * The result holds the matched host suffix, the domain type for that + * suffix and whether or not the found node has children. A result is + * returned only if a matching suffix is * found. Otherwise, returns null. */ - DomainCategoryResult findHostCategory(String host) { + public DomainCategoryResult findHostCategory(String host) { // if no host, return none. 
if (StringUtils.isBlank(host)) { return null; @@ -71,6 +81,6 @@ public class DomainSuffixTrie { List keys = new ArrayList<>(result.getKeys()); Collections.reverse(keys); String suffix = String.join(JOINER, keys); - return new DomainCategoryResult(suffix, result.getValue(), result.hasChildren()); + return new DefaultDomainCategoryResult(suffix, result.getValue(), result.hasChildren()); } } diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/Trie.java b/Core/src/org/sleuthkit/autopsy/url/analytics/Trie.java similarity index 77% rename from RecentActivity/src/org/sleuthkit/autopsy/recentactivity/Trie.java rename to Core/src/org/sleuthkit/autopsy/url/analytics/Trie.java index 2ca1e33509..af7f9307b2 100644 --- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/Trie.java +++ b/Core/src/org/sleuthkit/autopsy/url/analytics/Trie.java @@ -3,7 +3,7 @@ * To change this template file, choose Tools | Templates * and open the template in the editor. */ -package org.sleuthkit.autopsy.recentactivity; +package org.sleuthkit.autopsy.url.analytics; import java.util.ArrayList; import java.util.HashMap; @@ -11,15 +11,15 @@ import java.util.List; import java.util.Map; import org.apache.commons.collections4.MapUtils; -public class Trie { +class Trie { private class Node { - private final Map children = new HashMap<>(); + private final Map> children = new HashMap<>(); private V leafValue = null; - Node getOrAddChild(K childKey) { - Node child = children.get(childKey); + Node getOrAddChild(K childKey) { + Node child = children.get(childKey); if (child == null) { child = new Node(); children.put(childKey, child); @@ -28,7 +28,7 @@ public class Trie { return child; } - Node getChild(K childKey) { + Node getChild(K childKey) { return children.get(childKey); } @@ -42,35 +42,36 @@ public class Trie { } - public static class TrieResult { + static class TrieResult { private final V value; private final List keys; private final boolean hasChildren; - public TrieResult(V 
value, List keys, boolean hasChildren) { + TrieResult(V value, List keys, boolean hasChildren) { this.value = value; this.keys = keys; this.hasChildren = hasChildren; } - public V getValue() { + V getValue() { return value; } - public List getKeys() { + List getKeys() { return keys; } - public boolean hasChildren() { + boolean hasChildren() { return hasChildren; } } + private Node root = new Node<>(); - public void add(Iterable keyTokens, V leafValue) { - Node node = root; + void add(Iterable keyTokens, V leafValue) { + Node node = root; for (K key : keyTokens) { node = node.getOrAddChild(key); } @@ -78,7 +79,7 @@ public class Trie { node.setLeafValue(leafValue); } - public V getExact(Iterable keys) { + V getExact(Iterable keys) { Node node = root; for (K key : keys) { node = node.getChild(key); @@ -90,7 +91,7 @@ public class Trie { return node.getLeafValue(); } - public TrieResult getDeepest(Iterable keys) { + TrieResult getDeepest(Iterable keys) { Node node = root; List visited = new ArrayList<>(); TrieResult bestMatch = null; @@ -110,7 +111,7 @@ public class Trie { return bestMatch; } - public TrieResult getFirst(Iterable keys) { + TrieResult getFirst(Iterable keys) { Node node = root; List visited = new ArrayList<>(); for (K key : keys) { diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DefaultDomainCategoryProvider.java b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DefaultDomainCategoryProvider.java index b96680f2fe..cea846dd8b 100644 --- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DefaultDomainCategoryProvider.java +++ b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DefaultDomainCategoryProvider.java @@ -18,6 +18,7 @@ */ package org.sleuthkit.autopsy.recentactivity; +import org.sleuthkit.autopsy.url.analytics.DomainSuffixTrie; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; @@ -28,6 +29,8 @@ import org.apache.commons.lang.StringUtils; import 
org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.ingest.IngestModule; import org.sleuthkit.autopsy.ingest.IngestModule.IngestModuleException; +import org.sleuthkit.autopsy.url.analytics.DomainCategoryProvider; +import org.sleuthkit.autopsy.url.analytics.DomainCategoryResult; /** * The default domain category provider that makes use of the default csv @@ -40,7 +43,6 @@ class DefaultDomainCategoryProvider implements DomainCategoryProvider { private static final String DOMAIN_TYPE_CSV = "default_domain_categories.csv"; //NON-NLS private static final Logger logger = Logger.getLogger(DefaultDomainCategoryProvider.class.getName()); - /** * Loads the trie of suffixes from the csv resource file. * @@ -99,19 +101,21 @@ class DefaultDomainCategoryProvider implements DomainCategoryProvider { logger.log(Level.WARNING, String.format("Could not determine host suffix for this line: \"%s\" at line %d", line, lineNumber)); return; } - + trie.add(hostSuffix, domainTypeStr); } // the root node for the trie containing suffixes for domain categories. 
private DomainSuffixTrie trie = null; - + @Override public void initialize() throws IngestModuleException { - try { - this.trie = loadTrie(); - } catch (IOException ex) { - throw new IngestModule.IngestModuleException("Unable to load domain type csv for domain category analysis", ex); + if (this.trie == null) { + try { + this.trie = loadTrie(); + } catch (IOException ex) { + throw new IngestModule.IngestModuleException("Unable to load domain type csv for domain category analysis", ex); + } } } diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategorizer.java b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategorizer.java index a9cd1cb8f2..c4857c7c6d 100644 --- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategorizer.java +++ b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategorizer.java @@ -22,20 +22,25 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; import java.util.HashSet; +import java.util.List; import java.util.logging.Level; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; import org.apache.commons.lang.StringUtils; -import org.apache.http.conn.util.DomainType; +import org.openide.util.Lookup; import org.openide.util.NbBundle.Messages; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.NetworkUtils; import org.sleuthkit.autopsy.ingest.DataSourceIngestModuleProgress; import org.sleuthkit.autopsy.ingest.IngestJobContext; import org.sleuthkit.autopsy.ingest.IngestModule; -import org.sleuthkit.autopsy.recentactivity.DomainCategoryProvider.DomainCategoryResult; +import org.sleuthkit.autopsy.url.analytics.DomainCategoryProvider; +import org.sleuthkit.autopsy.url.analytics.DomainCategoryResult; import org.sleuthkit.datamodel.AbstractFile; import 
org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE; @@ -45,12 +50,12 @@ import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.TskCoreException; /** - * Analyzes a URL to determine if the url host is one of a certain kind of category - * (i.e. webmail, disposable mail). If found, a web category artifact is - * created. + * Analyzes a URL to determine if the url host is one of a certain kind of + * category (i.e. webmail, disposable mail). If found, a web category artifact + * is created. * - * CSV entries describing these domain types are compiled from sources. - * webmail: https://github.com/mailcheck/mailcheck/wiki/List-of-Popular-Domains + * CSV entries describing these domain types are compiled from sources. webmail: + * https://github.com/mailcheck/mailcheck/wiki/List-of-Popular-Domains * disposable mail: https://www.npmjs.com/package/disposable-email-domains */ @Messages({ @@ -60,7 +65,6 @@ import org.sleuthkit.datamodel.TskCoreException; }) class DomainCategorizer extends Extract { - // The url regex is based on the regex provided in https://tools.ietf.org/html/rfc3986#appendix-B // but expanded to be a little more flexible, and also properly parses user info and port in a url // this item has optional colon since some urls were coming through without the colon @@ -80,6 +84,7 @@ class DomainCategorizer extends Extract { private Content dataSource; private IngestJobContext context; + private List domainProviders = Collections.emptyList(); /** * Main constructor. @@ -117,8 +122,20 @@ class DomainCategorizer extends Extract { return host; } - + + private DomainCategoryResult findCategory(String domain, String host) { + List safeProviders = domainProviders == null ? 
Collections.emptyList() : domainProviders; + for (DomainCategoryProvider provider : safeProviders) { + DomainCategoryResult result = provider.getCategory(domain, host); + if (result != null) { + return result; + } + } + + return null; + } + /** * Goes through web history artifacts and attempts to determine any hosts of * a domain type. If any are found, a TSK_WEB_CATEGORIZATION artifact is @@ -160,15 +177,21 @@ class DomainCategorizer extends Extract { // atempt to get the host from the url provided. String host = getHost(urlString); - if (StringUtils.isBlank(host)) { + + // get the domain string from the artifact's TSK_DOMAIN attribute + BlackboardAttribute domainAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN)); + String domainString = domainAttr.getValueString(); + + // make sure we have at least one of host or domain + if (StringUtils.isBlank(host) && StringUtils.isBlank(domainString)) { continue; } - + // if we reached this point, we are at least analyzing this item artifactsAnalyzed++; // attempt to get the domain type for the host using the suffix trie - DomainCategoryResult domainEntryFound = findHostSuffix(host); + DomainCategoryResult domainEntryFound = findCategory(host, domainString); if (domainEntryFound == null) { continue; } @@ -216,9 +239,36 @@ class DomainCategorizer extends Extract { this.findDomainTypes(); } + private static final Comparator PROVIDER_COMPARATOR + = (a, b) -> { + // if one item is the DefaultDomainCategoryProvider and the other is not, compare based on that. + int isDefaultCompare = Integer.compare( + a instanceof DefaultDomainCategoryProvider ? 0 : 1, + b instanceof DefaultDomainCategoryProvider ? 0 : 1); + + if (isDefaultCompare != 0) { + return isDefaultCompare; + } + + // otherwise, sort by the name of the fully qualified class for deterministic results. 
+ return a.getClass().getName().compareToIgnoreCase(b.getClass().getName()); + }; + @Override void configExtractor() throws IngestModule.IngestModuleException { - // TODO lookup needs to go here + List foundProviders + = Lookup.getDefault().lookupAll(DomainCategoryProvider.class).stream() + .filter(provider -> provider != null) + .sorted(PROVIDER_COMPARATOR) + .collect(Collectors.toList()); + + for (DomainCategoryProvider provider : foundProviders) { + provider.initialize(); + } + + this.domainProviders = foundProviders == null ? + Collections.emptyList() : + foundProviders; } @Override