public Core package

This commit is contained in:
Greg DiCristofaro 2020-12-07 16:35:35 -05:00
parent 8c7155c3a1
commit fd5d759d2a
8 changed files with 251 additions and 74 deletions

View File

@ -336,6 +336,7 @@
<package>org.sleuthkit.autopsy.textextractors.configs</package>
<package>org.sleuthkit.autopsy.textsummarizer</package>
<package>org.sleuthkit.autopsy.texttranslation</package>
<package>org.sleuthkit.autopsy.url.analytics</package>
<package>org.sleuthkit.datamodel</package>
<package>org.sleuthkit.datamodel.blackboardutils</package>
<package>org.sleuthkit.datamodel.blackboardutils.attributes</package>

View File

@ -0,0 +1,72 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2020 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.url.analytics;
import com.google.common.annotations.Beta;
/**
 * Simple immutable value implementation of DomainCategoryResult that stores
 * the matched host suffix, its category, and the "more prefixes" flag.
 */
@Beta
public class DefaultDomainCategoryResult implements DomainCategoryResult {

    private final String matchedSuffix;
    private final String matchedCategory;
    private final boolean prefixesRemain;

    /**
     * Main constructor.
     *
     * @param hostSuffix   The portion of the suffix from the host or domain
     *                     that was a match (e.g. 'mail.google.com' or
     *                     'hotmail.com').
     * @param category     The category (e.g. 'Web Email').
     * @param morePrefixes True if additional prefixes on the host could
     *                     lead to a different match.
     */
    public DefaultDomainCategoryResult(String hostSuffix, String category, boolean morePrefixes) {
        this.matchedSuffix = hostSuffix;
        this.matchedCategory = category;
        this.prefixesRemain = morePrefixes;
    }

    /**
     * Convenience constructor that sets hasMorePrefixes to true.
     *
     * @param hostSuffix The portion of the suffix from the host or domain
     *                   that was a match (e.g. 'mail.google.com' or
     *                   'hotmail.com').
     * @param category   The category (e.g. 'Web Email').
     */
    public DefaultDomainCategoryResult(String hostSuffix, String category) {
        this(hostSuffix, category, true);
    }

    @Override
    public String getHostSuffix() {
        return this.matchedSuffix;
    }

    @Override
    public String getCategory() {
        return this.matchedCategory;
    }

    @Override
    public boolean hasMorePrefixes() {
        return this.prefixesRemain;
    }
}

View File

@ -16,38 +16,19 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.recentactivity;
package org.sleuthkit.autopsy.url.analytics;
import com.google.common.annotations.Beta;
import org.sleuthkit.autopsy.ingest.IngestModule;
/**
* Interface providing the category of a domain for creating
* TSK_WEB_CATEGORIZATION artifacts.
* Interface providing the category of a domain for the purposes of creating
* TSK_WEB_CATEGORIZATION artifacts. Implementations are used in
* RecentActivity as a part of the ingest process. Implementers of this
* interface should have a no-argument constructor in order to be properly
* instantiated.
*/
@Beta
public interface DomainCategoryProvider {
public static class DomainCategoryResult {
private final String hostSuffix;
private final String category;
private final boolean hasChildren;
public DomainCategoryResult(String hostSuffix, String category, boolean hasChildren) {
this.hostSuffix = hostSuffix;
this.category = category;
this.hasChildren = hasChildren;
}
public String getHostSuffix() {
return hostSuffix;
}
public String getCategory() {
return category;
}
public boolean hasChildren() {
return hasChildren;
}
}
/**
* Provides the DomainCategory for a given domain/host or null if none can
@ -60,5 +41,13 @@ public interface DomainCategoryProvider {
*/
DomainCategoryResult getCategory(String domain, String host);
void initialize() throws IngestModule.IngestModuleException;
/**
* Initializes this provider in preparation to handle 'getCategory' requests
* during ingest. The same instance may be initialized more than once, so
* implementations should handle repeated calls gracefully.
*
* @throws IngestModule.IngestModuleException If the provider cannot be
* initialized.
*/
default void initialize() throws IngestModule.IngestModuleException {
}
}

View File

@ -0,0 +1,50 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2020 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.url.analytics;
import com.google.common.annotations.Beta;
/**
 * The result of finding a category match for the host or domain provided as
 * an argument.
 */
@Beta
public interface DomainCategoryResult {
/**
 * Gets the matched suffix.
 *
 * @return The portion of the suffix from the host or domain that was a
 * match (e.g. 'mail.google.com' or 'hotmail.com').
 */
String getHostSuffix();
/**
 * Gets the category of the matched suffix.
 *
 * @return The category (e.g. 'Web Email').
 */
String getCategory();
/**
 * Indicates whether additional prefixes on the host could lead to a
 * different match. The default implementation always returns true.
 *
 * @return In the event that there would be different matches for additional
 * prefixes, this can return true. For instance, if there was an entry for
 * 'mail.google.com' and 'chatenabled.mail.google.com', a search for
 * 'mail.google.com' would return the host suffix: 'mail.google.com' and
 * 'true' for hasMorePrefixes since an additional category could be added
 * for the 'chatenabled' prefix.
 */
default boolean hasMorePrefixes() {
return true;
}
}

View File

@ -16,18 +16,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.recentactivity;
package org.sleuthkit.autopsy.url.analytics;
import com.google.common.annotations.Beta;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang.StringUtils;
import org.sleuthkit.autopsy.recentactivity.DomainCategoryProvider.DomainCategoryResult;
import org.sleuthkit.autopsy.recentactivity.Trie.TrieResult;
import org.sleuthkit.autopsy.url.analytics.Trie.TrieResult;
@Beta
public class DomainSuffixTrie {
private static Iterable<String> getSuffixIter(String host) {
// parse the tokens splitting on delimiter
List<String> tokens = Stream.of(host.toLowerCase().split(DELIMITER))
@ -44,24 +46,32 @@ public class DomainSuffixTrie {
// delimiter when used with regex for domains
private static final String DELIMITER = "\\" + JOINER;
private final Trie<String, String> trie = new Trie<>();
void add(String suffix, String leaf) {
/**
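* Adds a host suffix and its associated value to the trie.
*
* @param suffix The host suffix to add (e.g. 'mail.google.com').
* @param leaf The value to associate with the suffix (returned as the
* category by findHostCategory).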
*/
public void add(String suffix, String leaf) {
this.trie.add(getSuffixIter(suffix), leaf);
}
/**
* Determines if the host is a known type of host. If so, returns the
* portion of the host suffix that signifies the domain type (i.e.
* "hotmail.com" or "mail.google.com") and the domain type.
* "hotmail.com" or "mail.google.com") and the domain type. Also returned in
* the DomainCategoryResult is whether or not the found node in the trie has
* any children and, consequently, whether or not additional host prefixes
* could produce a different match.
*
* @param host The host.
* @return A pair of the host suffix and domain type for that suffix if
* @return The DomainCategoryResult, containing the matched host suffix and
* the domain type for that suffix, if a portion of the suffix was found.
* Otherwise, returns null.
*/
DomainCategoryResult findHostCategory(String host) {
public DomainCategoryResult findHostCategory(String host) {
// if no host, return none.
if (StringUtils.isBlank(host)) {
return null;
@ -71,6 +81,6 @@ public class DomainSuffixTrie {
List<String> keys = new ArrayList<>(result.getKeys());
Collections.reverse(keys);
String suffix = String.join(JOINER, keys);
return new DomainCategoryResult(suffix, result.getValue(), result.hasChildren());
return new DefaultDomainCategoryResult(suffix, result.getValue(), result.hasChildren());
}
}

View File

@ -3,7 +3,7 @@
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package org.sleuthkit.autopsy.recentactivity;
package org.sleuthkit.autopsy.url.analytics;
import java.util.ArrayList;
import java.util.HashMap;
@ -11,15 +11,15 @@ import java.util.List;
import java.util.Map;
import org.apache.commons.collections4.MapUtils;
public class Trie<K, V> {
class Trie<K, V> {
private class Node<K, V> {
private final Map<K, Node> children = new HashMap<>();
private final Map<K, Node<K, V>> children = new HashMap<>();
private V leafValue = null;
Node getOrAddChild(K childKey) {
Node child = children.get(childKey);
Node<K, V> getOrAddChild(K childKey) {
Node<K, V> child = children.get(childKey);
if (child == null) {
child = new Node();
children.put(childKey, child);
@ -28,7 +28,7 @@ public class Trie<K, V> {
return child;
}
Node getChild(K childKey) {
Node<K, V> getChild(K childKey) {
return children.get(childKey);
}
@ -42,35 +42,36 @@ public class Trie<K, V> {
}
public static class TrieResult<K, V> {
static class TrieResult<K, V> {
private final V value;
private final List<K> keys;
private final boolean hasChildren;
public TrieResult(V value, List<K> keys, boolean hasChildren) {
TrieResult(V value, List<K> keys, boolean hasChildren) {
this.value = value;
this.keys = keys;
this.hasChildren = hasChildren;
}
public V getValue() {
V getValue() {
return value;
}
public List<K> getKeys() {
List<K> getKeys() {
return keys;
}
public boolean hasChildren() {
boolean hasChildren() {
return hasChildren;
}
}
private Node<K, V> root = new Node<>();
public void add(Iterable<K> keyTokens, V leafValue) {
Node node = root;
void add(Iterable<K> keyTokens, V leafValue) {
Node<K, V> node = root;
for (K key : keyTokens) {
node = node.getOrAddChild(key);
}
@ -78,7 +79,7 @@ public class Trie<K, V> {
node.setLeafValue(leafValue);
}
public V getExact(Iterable<K> keys) {
V getExact(Iterable<K> keys) {
Node<K, V> node = root;
for (K key : keys) {
node = node.getChild(key);
@ -90,7 +91,7 @@ public class Trie<K, V> {
return node.getLeafValue();
}
public TrieResult<K, V> getDeepest(Iterable<K> keys) {
TrieResult<K, V> getDeepest(Iterable<K> keys) {
Node<K, V> node = root;
List<K> visited = new ArrayList<>();
TrieResult<K, V> bestMatch = null;
@ -110,7 +111,7 @@ public class Trie<K, V> {
return bestMatch;
}
public TrieResult<K, V> getFirst(Iterable<K> keys) {
TrieResult<K, V> getFirst(Iterable<K> keys) {
Node<K, V> node = root;
List<K> visited = new ArrayList<>();
for (K key : keys) {

View File

@ -18,6 +18,7 @@
*/
package org.sleuthkit.autopsy.recentactivity;
import org.sleuthkit.autopsy.url.analytics.DomainSuffixTrie;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
@ -28,6 +29,8 @@ import org.apache.commons.lang.StringUtils;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.ingest.IngestModule;
import org.sleuthkit.autopsy.ingest.IngestModule.IngestModuleException;
import org.sleuthkit.autopsy.url.analytics.DomainCategoryProvider;
import org.sleuthkit.autopsy.url.analytics.DomainCategoryResult;
/**
* The default domain category provider that makes use of the default csv
@ -40,7 +43,6 @@ class DefaultDomainCategoryProvider implements DomainCategoryProvider {
private static final String DOMAIN_TYPE_CSV = "default_domain_categories.csv"; //NON-NLS
private static final Logger logger = Logger.getLogger(DefaultDomainCategoryProvider.class.getName());
/**
* Loads the trie of suffixes from the csv resource file.
*
@ -108,10 +110,12 @@ class DefaultDomainCategoryProvider implements DomainCategoryProvider {
@Override
public void initialize() throws IngestModuleException {
try {
this.trie = loadTrie();
} catch (IOException ex) {
throw new IngestModule.IngestModuleException("Unable to load domain type csv for domain category analysis", ex);
if (this.trie == null) {
try {
this.trie = loadTrie();
} catch (IOException ex) {
throw new IngestModule.IngestModuleException("Unable to load domain type csv for domain category analysis", ex);
}
}
}

View File

@ -22,20 +22,25 @@ import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.logging.Level;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang.StringUtils;
import org.apache.http.conn.util.DomainType;
import org.openide.util.Lookup;
import org.openide.util.NbBundle.Messages;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.NetworkUtils;
import org.sleuthkit.autopsy.ingest.DataSourceIngestModuleProgress;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestModule;
import org.sleuthkit.autopsy.recentactivity.DomainCategoryProvider.DomainCategoryResult;
import org.sleuthkit.autopsy.url.analytics.DomainCategoryProvider;
import org.sleuthkit.autopsy.url.analytics.DomainCategoryResult;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
@ -45,12 +50,12 @@ import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
/**
* Analyzes a URL to determine if the url host is one of a certain kind of category
* (i.e. webmail, disposable mail). If found, a web category artifact is
* created.
* Analyzes a URL to determine if the url host is one of a certain kind of
* category (i.e. webmail, disposable mail). If found, a web category artifact
* is created.
*
* CSV entries describing these domain types are compiled from sources.
* webmail: https://github.com/mailcheck/mailcheck/wiki/List-of-Popular-Domains
* CSV entries describing these domain types are compiled from sources. webmail:
* https://github.com/mailcheck/mailcheck/wiki/List-of-Popular-Domains
* disposable mail: https://www.npmjs.com/package/disposable-email-domains
*/
@Messages({
@ -60,7 +65,6 @@ import org.sleuthkit.datamodel.TskCoreException;
})
class DomainCategorizer extends Extract {
// The url regex is based on the regex provided in https://tools.ietf.org/html/rfc3986#appendix-B
// but expanded to be a little more flexible, and also properly parses user info and port in a url
// this item has optional colon since some urls were coming through without the colon
@ -80,6 +84,7 @@ class DomainCategorizer extends Extract {
private Content dataSource;
private IngestJobContext context;
private List<DomainCategoryProvider> domainProviders = Collections.emptyList();
/**
* Main constructor.
@ -119,6 +124,18 @@ class DomainCategorizer extends Extract {
}
private DomainCategoryResult findCategory(String domain, String host) {
List<DomainCategoryProvider> safeProviders = domainProviders == null ? Collections.emptyList() : domainProviders;
for (DomainCategoryProvider provider : safeProviders) {
DomainCategoryResult result = provider.getCategory(domain, host);
if (result != null) {
return result;
}
}
return null;
}
/**
* Goes through web history artifacts and attempts to determine any hosts of
* a domain type. If any are found, a TSK_WEB_CATEGORIZATION artifact is
@ -160,7 +177,13 @@ class DomainCategorizer extends Extract {
// attempt to get the host from the url provided.
String host = getHost(urlString);
if (StringUtils.isBlank(host)) {
// get the domain string from the artifact; getAttribute returns null when
// the artifact has no TSK_DOMAIN attribute, so guard against that
BlackboardAttribute domainAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
String domainString = domainAttr == null ? null : domainAttr.getValueString();
// make sure we have at least one of host or domain
if (StringUtils.isBlank(host) && StringUtils.isBlank(domainString)) {
continue;
}
@ -168,7 +191,7 @@ class DomainCategorizer extends Extract {
artifactsAnalyzed++;
// attempt to get the category for the domain/host from the registered providers
DomainCategoryResult domainEntryFound = findHostSuffix(host);
// findCategory takes (domain, host) in that order
DomainCategoryResult domainEntryFound = findCategory(domainString, host);
if (domainEntryFound == null) {
continue;
}
@ -216,9 +239,36 @@ class DomainCategorizer extends Extract {
this.findDomainTypes();
}
/**
 * Orders providers for lookup: a DefaultDomainCategoryProvider sorts before
 * any other provider; ties are broken by the case-insensitive fully
 * qualified class name so that the order is deterministic.
 */
private static final Comparator<DomainCategoryProvider> PROVIDER_COMPARATOR
        = Comparator
                // false (is the default provider) sorts before true (is not)
                .comparing((DomainCategoryProvider provider) -> !(provider instanceof DefaultDomainCategoryProvider))
                // otherwise fall back to the class name for stable results
                .thenComparing(provider -> provider.getClass().getName(), String.CASE_INSENSITIVE_ORDER);
/**
 * Looks up every registered DomainCategoryProvider, orders them with
 * PROVIDER_COMPARATOR, initializes each one, and stores the ordered list
 * for use by findCategory.
 *
 * @throws IngestModule.IngestModuleException If a provider fails to
 *                                            initialize; in that case the
 *                                            provider list is not updated.
 */
@Override
void configExtractor() throws IngestModule.IngestModuleException {
    List<DomainCategoryProvider> foundProviders
            = Lookup.getDefault().lookupAll(DomainCategoryProvider.class).stream()
                    .filter(provider -> provider != null)
                    .sorted(PROVIDER_COMPARATOR)
                    .collect(Collectors.toList());

    for (DomainCategoryProvider provider : foundProviders) {
        provider.initialize();
    }

    // Collectors.toList() never yields null, so no null fallback is needed.
    this.domainProviders = foundProviders;
}
@Override