diff --git a/Core/nbproject/project.xml b/Core/nbproject/project.xml
index 8d4d9b8c98..074d930725 100644
--- a/Core/nbproject/project.xml
+++ b/Core/nbproject/project.xml
@@ -336,6 +336,7 @@
org.sleuthkit.autopsy.textextractors.configs
org.sleuthkit.autopsy.textsummarizer
org.sleuthkit.autopsy.texttranslation
+ org.sleuthkit.autopsy.url.analytics
org.sleuthkit.datamodel
org.sleuthkit.datamodel.blackboardutils
org.sleuthkit.datamodel.blackboardutils.attributes
diff --git a/Core/src/org/sleuthkit/autopsy/url/analytics/DefaultDomainCategoryResult.java b/Core/src/org/sleuthkit/autopsy/url/analytics/DefaultDomainCategoryResult.java
new file mode 100644
index 0000000000..e0792cd375
--- /dev/null
+++ b/Core/src/org/sleuthkit/autopsy/url/analytics/DefaultDomainCategoryResult.java
@@ -0,0 +1,72 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2020 Basis Technology Corp.
+ * Contact: carrier sleuthkit org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.url.analytics;
+
+import com.google.common.annotations.Beta;
+
+/**
+ * Default implementation of DomainCategoryResult that returns the values it was constructed with.
+ */
+@Beta
+public class DefaultDomainCategoryResult implements DomainCategoryResult {
+
+ private final String hostSuffix;
+ private final String category;
+ private final boolean morePrefixes;
+
+ /**
+ * Constructs a result whose hasMorePrefixes() value defaults to true.
+ * @param hostSuffix The portion of the suffix from the host or domain that was a
+ * match (e.g. 'mail.google.com' or 'hotmail.com').
+ * @param category The category (e.g. 'Web Email').
+ */
+ public DefaultDomainCategoryResult(String hostSuffix, String category) {
+ this(hostSuffix, category, true);
+ }
+
+ /**
+ * Main constructor; stores each argument as given.
+ * @param hostSuffix The portion of the suffix from the host or domain that was a
+ * match (e.g. 'mail.google.com' or 'hotmail.com').
+ * @param category The category (e.g. 'Web Email').
+ * @param morePrefixes True if a host with additional prefixes could match a
+ * different entry; this is the value returned by hasMorePrefixes().
+ */
+ public DefaultDomainCategoryResult(String hostSuffix, String category, boolean morePrefixes) {
+ this.hostSuffix = hostSuffix;
+ this.category = category;
+ this.morePrefixes = morePrefixes;
+ }
+
+ @Override
+ public String getHostSuffix() {
+ return hostSuffix;
+ }
+
+ @Override
+ public String getCategory() {
+ return category;
+ }
+
+ @Override
+ public boolean hasMorePrefixes() {
+ return morePrefixes;
+ }
+
+}
diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategoryProvider.java b/Core/src/org/sleuthkit/autopsy/url/analytics/DomainCategoryProvider.java
similarity index 57%
rename from RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategoryProvider.java
rename to Core/src/org/sleuthkit/autopsy/url/analytics/DomainCategoryProvider.java
index dc9c215bfc..072119ce4c 100644
--- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategoryProvider.java
+++ b/Core/src/org/sleuthkit/autopsy/url/analytics/DomainCategoryProvider.java
@@ -16,38 +16,19 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.sleuthkit.autopsy.recentactivity;
+package org.sleuthkit.autopsy.url.analytics;
+import com.google.common.annotations.Beta;
import org.sleuthkit.autopsy.ingest.IngestModule;
/**
- * Interface providing the category of a domain for creating
- * TSK_WEB_CATEGORIZATION artifacts.
+ * Interface providing the category of a domain for the purposes of creating
+ * TSK_WEB_CATEGORIZATION artifacts. These implementations are used in
+ * RecentActivity as a part of the ingest process. Implementers of this class
+ * should have a no-argument constructor in order to be properly instantiated.
*/
+@Beta
public interface DomainCategoryProvider {
- public static class DomainCategoryResult {
- private final String hostSuffix;
- private final String category;
- private final boolean hasChildren;
-
- public DomainCategoryResult(String hostSuffix, String category, boolean hasChildren) {
- this.hostSuffix = hostSuffix;
- this.category = category;
- this.hasChildren = hasChildren;
- }
-
- public String getHostSuffix() {
- return hostSuffix;
- }
-
- public String getCategory() {
- return category;
- }
-
- public boolean hasChildren() {
- return hasChildren;
- }
- }
/**
* Provides the DomainCategory for a given domain/host or null if none can
@@ -59,6 +40,14 @@ public interface DomainCategoryProvider {
* null if not.
*/
DomainCategoryResult getCategory(String domain, String host);
-
- void initialize() throws IngestModule.IngestModuleException;
+
+ /**
+ * Initializes this provider in preparation to handle 'getCategory' requests
+ * during ingest. Conceivably, the same instance of this class may have this
+ * called multiple times and should handle that possibility gracefully.
+ *
+ * @throws IngestModule.IngestModuleException
+ */
+ default void initialize() throws IngestModule.IngestModuleException {
+ }
}
diff --git a/Core/src/org/sleuthkit/autopsy/url/analytics/DomainCategoryResult.java b/Core/src/org/sleuthkit/autopsy/url/analytics/DomainCategoryResult.java
new file mode 100644
index 0000000000..c52780a587
--- /dev/null
+++ b/Core/src/org/sleuthkit/autopsy/url/analytics/DomainCategoryResult.java
@@ -0,0 +1,50 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2020 Basis Technology Corp.
+ * Contact: carrier sleuthkit org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.url.analytics;
+
+import com.google.common.annotations.Beta;
+
+/**
+ * The result of finding a match for the host or domain provided as an argument.
+ */
+@Beta
+public interface DomainCategoryResult {
+ /**
+ * @return The portion of the suffix from the host or domain that was a
+ * match (e.g. 'mail.google.com' or 'hotmail.com').
+ */
+ String getHostSuffix();
+
+ /**
+ * @return The category (e.g. 'Web Email').
+ */
+ String getCategory();
+
+ /**
+ * @return True if a host with additional prefixes could match a different
+ * entry. For instance, if there were entries for 'mail.google.com' and
+ * 'chatenabled.mail.google.com', a search for 'mail.google.com' would return
+ * the host suffix 'mail.google.com' and 'true' for hasMorePrefixes, since a
+ * different category could apply once the 'chatenabled' prefix is added.
+ * This default implementation always returns true.
+ */
+ default boolean hasMorePrefixes() {
+ return true;
+ }
+}
diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainSuffixTrie.java b/Core/src/org/sleuthkit/autopsy/url/analytics/DomainSuffixTrie.java
similarity index 73%
rename from RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainSuffixTrie.java
rename to Core/src/org/sleuthkit/autopsy/url/analytics/DomainSuffixTrie.java
index 8beb5013c5..47633df46b 100644
--- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainSuffixTrie.java
+++ b/Core/src/org/sleuthkit/autopsy/url/analytics/DomainSuffixTrie.java
@@ -16,18 +16,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.sleuthkit.autopsy.recentactivity;
+package org.sleuthkit.autopsy.url.analytics;
+import com.google.common.annotations.Beta;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang.StringUtils;
-import org.sleuthkit.autopsy.recentactivity.DomainCategoryProvider.DomainCategoryResult;
-import org.sleuthkit.autopsy.recentactivity.Trie.TrieResult;
+import org.sleuthkit.autopsy.url.analytics.Trie.TrieResult;
+@Beta
public class DomainSuffixTrie {
+
private static Iterable getSuffixIter(String host) {
// parse the tokens splitting on delimiter
List tokens = Stream.of(host.toLowerCase().split(DELIMITER))
@@ -44,24 +46,32 @@ public class DomainSuffixTrie {
// delimiter when used with regex for domains
private static final String DELIMITER = "\\" + JOINER;
-
-
private final Trie trie = new Trie<>();
- void add(String suffix, String leaf) {
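+ /**
+ * Adds a host suffix and the category it maps to into the trie.
+ * @param suffix The host suffix to match (e.g. "mail.google.com").
+ * @param leaf The category to store for that suffix.
+ */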
+ public void add(String suffix, String leaf) {
this.trie.add(getSuffixIter(suffix), leaf);
}
/**
* Determines if the host is a known type of host. If so, returns the
* portion of the host suffix that signifies the domain type (i.e.
- * "hotmail.com" or "mail.google.com") and the domain type.
+ * "hotmail.com" or "mail.google.com") and the domain type. Also returned in
+ * the DomainCategoryResult is whether or not the found node in the trie has
+ * any children and, consequently, whether longer hosts may match other entries.
*
* @param host The host.
- * @return A pair of the host suffix and domain type for that suffix if
+ * @return The DomainCategoryResult for the host. It contains the
+ * matched host suffix, the category for that suffix, and whether
+ * the matched trie node has children. It is only returned if a
+ * matching suffix is
* found. Otherwise, returns null.
*/
- DomainCategoryResult findHostCategory(String host) {
+ public DomainCategoryResult findHostCategory(String host) {
// if no host, return none.
if (StringUtils.isBlank(host)) {
return null;
@@ -71,6 +81,6 @@ public class DomainSuffixTrie {
List keys = new ArrayList<>(result.getKeys());
Collections.reverse(keys);
String suffix = String.join(JOINER, keys);
- return new DomainCategoryResult(suffix, result.getValue(), result.hasChildren());
+ return new DefaultDomainCategoryResult(suffix, result.getValue(), result.hasChildren());
}
}
diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/Trie.java b/Core/src/org/sleuthkit/autopsy/url/analytics/Trie.java
similarity index 77%
rename from RecentActivity/src/org/sleuthkit/autopsy/recentactivity/Trie.java
rename to Core/src/org/sleuthkit/autopsy/url/analytics/Trie.java
index 2ca1e33509..af7f9307b2 100644
--- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/Trie.java
+++ b/Core/src/org/sleuthkit/autopsy/url/analytics/Trie.java
@@ -3,7 +3,7 @@
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
-package org.sleuthkit.autopsy.recentactivity;
+package org.sleuthkit.autopsy.url.analytics;
import java.util.ArrayList;
import java.util.HashMap;
@@ -11,15 +11,15 @@ import java.util.List;
import java.util.Map;
import org.apache.commons.collections4.MapUtils;
-public class Trie {
+class Trie {
private class Node {
- private final Map children = new HashMap<>();
+ private final Map> children = new HashMap<>();
private V leafValue = null;
- Node getOrAddChild(K childKey) {
- Node child = children.get(childKey);
+ Node getOrAddChild(K childKey) {
+ Node child = children.get(childKey);
if (child == null) {
child = new Node();
children.put(childKey, child);
@@ -28,7 +28,7 @@ public class Trie {
return child;
}
- Node getChild(K childKey) {
+ Node getChild(K childKey) {
return children.get(childKey);
}
@@ -42,35 +42,36 @@ public class Trie {
}
- public static class TrieResult {
+ static class TrieResult {
private final V value;
private final List keys;
private final boolean hasChildren;
- public TrieResult(V value, List keys, boolean hasChildren) {
+ TrieResult(V value, List keys, boolean hasChildren) {
this.value = value;
this.keys = keys;
this.hasChildren = hasChildren;
}
- public V getValue() {
+ V getValue() {
return value;
}
- public List getKeys() {
+ List getKeys() {
return keys;
}
- public boolean hasChildren() {
+ boolean hasChildren() {
return hasChildren;
}
}
+
private Node root = new Node<>();
- public void add(Iterable keyTokens, V leafValue) {
- Node node = root;
+ void add(Iterable keyTokens, V leafValue) {
+ Node node = root;
for (K key : keyTokens) {
node = node.getOrAddChild(key);
}
@@ -78,7 +79,7 @@ public class Trie {
node.setLeafValue(leafValue);
}
- public V getExact(Iterable keys) {
+ V getExact(Iterable keys) {
Node node = root;
for (K key : keys) {
node = node.getChild(key);
@@ -90,7 +91,7 @@ public class Trie {
return node.getLeafValue();
}
- public TrieResult getDeepest(Iterable keys) {
+ TrieResult getDeepest(Iterable keys) {
Node node = root;
List visited = new ArrayList<>();
TrieResult bestMatch = null;
@@ -110,7 +111,7 @@ public class Trie {
return bestMatch;
}
- public TrieResult getFirst(Iterable keys) {
+ TrieResult getFirst(Iterable keys) {
Node node = root;
List visited = new ArrayList<>();
for (K key : keys) {
diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DefaultDomainCategoryProvider.java b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DefaultDomainCategoryProvider.java
index b96680f2fe..cea846dd8b 100644
--- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DefaultDomainCategoryProvider.java
+++ b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DefaultDomainCategoryProvider.java
@@ -18,6 +18,7 @@
*/
package org.sleuthkit.autopsy.recentactivity;
+import org.sleuthkit.autopsy.url.analytics.DomainSuffixTrie;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
@@ -28,6 +29,8 @@ import org.apache.commons.lang.StringUtils;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.ingest.IngestModule;
import org.sleuthkit.autopsy.ingest.IngestModule.IngestModuleException;
+import org.sleuthkit.autopsy.url.analytics.DomainCategoryProvider;
+import org.sleuthkit.autopsy.url.analytics.DomainCategoryResult;
/**
* The default domain category provider that makes use of the default csv
@@ -40,7 +43,6 @@ class DefaultDomainCategoryProvider implements DomainCategoryProvider {
private static final String DOMAIN_TYPE_CSV = "default_domain_categories.csv"; //NON-NLS
private static final Logger logger = Logger.getLogger(DefaultDomainCategoryProvider.class.getName());
-
/**
* Loads the trie of suffixes from the csv resource file.
*
@@ -99,19 +101,21 @@ class DefaultDomainCategoryProvider implements DomainCategoryProvider {
logger.log(Level.WARNING, String.format("Could not determine host suffix for this line: \"%s\" at line %d", line, lineNumber));
return;
}
-
+
trie.add(hostSuffix, domainTypeStr);
}
// the root node for the trie containing suffixes for domain categories.
private DomainSuffixTrie trie = null;
-
+
@Override
public void initialize() throws IngestModuleException {
- try {
- this.trie = loadTrie();
- } catch (IOException ex) {
- throw new IngestModule.IngestModuleException("Unable to load domain type csv for domain category analysis", ex);
+ if (this.trie == null) { // initialize() may be called more than once; skip the load if the trie already exists
+ try {
+ this.trie = loadTrie();
+ } catch (IOException ex) {
+ throw new IngestModule.IngestModuleException("Unable to load domain type csv for domain category analysis", ex);
+ }
}
}
diff --git a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategorizer.java b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategorizer.java
index a9cd1cb8f2..c4857c7c6d 100644
--- a/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategorizer.java
+++ b/RecentActivity/src/org/sleuthkit/autopsy/recentactivity/DomainCategorizer.java
@@ -22,20 +22,25 @@ import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
import java.util.HashSet;
+import java.util.List;
import java.util.logging.Level;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import java.util.stream.Collectors;
import org.apache.commons.lang.StringUtils;
-import org.apache.http.conn.util.DomainType;
+import org.openide.util.Lookup;
import org.openide.util.NbBundle.Messages;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.NetworkUtils;
import org.sleuthkit.autopsy.ingest.DataSourceIngestModuleProgress;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestModule;
-import org.sleuthkit.autopsy.recentactivity.DomainCategoryProvider.DomainCategoryResult;
+import org.sleuthkit.autopsy.url.analytics.DomainCategoryProvider;
+import org.sleuthkit.autopsy.url.analytics.DomainCategoryResult;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
@@ -45,12 +50,12 @@ import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
/**
- * Analyzes a URL to determine if the url host is one of a certain kind of category
- * (i.e. webmail, disposable mail). If found, a web category artifact is
- * created.
+ * Analyzes a URL to determine if the url host is one of a certain kind of
+ * category (i.e. webmail, disposable mail). If found, a web category artifact
+ * is created.
*
- * CSV entries describing these domain types are compiled from sources.
- * webmail: https://github.com/mailcheck/mailcheck/wiki/List-of-Popular-Domains
+ * CSV entries describing these domain types are compiled from sources. webmail:
+ * https://github.com/mailcheck/mailcheck/wiki/List-of-Popular-Domains
* disposable mail: https://www.npmjs.com/package/disposable-email-domains
*/
@Messages({
@@ -60,7 +65,6 @@ import org.sleuthkit.datamodel.TskCoreException;
})
class DomainCategorizer extends Extract {
-
// The url regex is based on the regex provided in https://tools.ietf.org/html/rfc3986#appendix-B
// but expanded to be a little more flexible, and also properly parses user info and port in a url
// this item has optional colon since some urls were coming through without the colon
@@ -80,6 +84,7 @@ class DomainCategorizer extends Extract {
private Content dataSource;
private IngestJobContext context;
+ private List domainProviders = Collections.emptyList();
/**
* Main constructor.
@@ -117,8 +122,20 @@ class DomainCategorizer extends Extract {
return host;
}
-
+
+ private DomainCategoryResult findCategory(String domain, String host) {
+ // a missing provider list is treated the same as an empty one
+ List<DomainCategoryProvider> providers = domainProviders == null ? Collections.emptyList() : domainProviders;
+
+ // providers are consulted in list order; the first non-null result is returned
+ return providers.stream()
+ .map(provider -> provider.getCategory(domain, host))
+ .filter(found -> found != null)
+ .findFirst()
+ .orElse(null);
+ }
+
/**
* Goes through web history artifacts and attempts to determine any hosts of
* a domain type. If any are found, a TSK_WEB_CATEGORIZATION artifact is
@@ -160,15 +177,21 @@ class DomainCategorizer extends Extract {
// atempt to get the host from the url provided.
String host = getHost(urlString);
- if (StringUtils.isBlank(host)) {
+
+ // get the domain string from the artifact; the TSK_DOMAIN attribute may be absent
+ BlackboardAttribute domainAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
+ String domainString = domainAttr == null ? null : domainAttr.getValueString();
+
+ // make sure we have at least one of host or domain
+ if (StringUtils.isBlank(host) && StringUtils.isBlank(domainString)) {
continue;
}
-
+
// if we reached this point, we are at least analyzing this item
artifactsAnalyzed++;
// attempt to get the domain type for the host using the suffix trie
- DomainCategoryResult domainEntryFound = findHostSuffix(host);
+ DomainCategoryResult domainEntryFound = findCategory(domainString, host);
if (domainEntryFound == null) {
continue;
}
@@ -216,9 +239,36 @@ class DomainCategorizer extends Extract {
this.findDomainTypes();
}
+ private static final Comparator PROVIDER_COMPARATOR
+ = (a, b) -> {
+ // if exactly one item is the DefaultDomainCategoryProvider, that item sorts first.
+ int isDefaultCompare = Integer.compare(
+ a instanceof DefaultDomainCategoryProvider ? 0 : 1,
+ b instanceof DefaultDomainCategoryProvider ? 0 : 1);
+
+ if (isDefaultCompare != 0) {
+ return isDefaultCompare;
+ }
+
+ // otherwise, sort case-insensitively by fully qualified class name for deterministic results.
+ return a.getClass().getName().compareToIgnoreCase(b.getClass().getName());
+ };
+
@Override
void configExtractor() throws IngestModule.IngestModuleException {
- // TODO lookup needs to go here
+ // discover every registered provider, ordered by PROVIDER_COMPARATOR
+ List<DomainCategoryProvider> foundProviders
+ = Lookup.getDefault().lookupAll(DomainCategoryProvider.class).stream()
+ .filter(provider -> provider != null)
+ .sorted(PROVIDER_COMPARATOR)
+ .collect(Collectors.toList());
+
+ // initialize each provider before it is asked for categories
+ for (DomainCategoryProvider provider : foundProviders) {
+ provider.initialize();
+ }
+
+ this.domainProviders = foundProviders;
}
@Override