mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-13 08:26:15 +00:00
commenting
This commit is contained in:
parent
b7baa5e2c8
commit
03a23f636b
@ -39,11 +39,13 @@ import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.openide.util.Exceptions;
|
||||
import org.openide.util.NbBundle.Messages;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
import org.sleuthkit.autopsy.coreutils.NetworkUtils;
|
||||
import org.sleuthkit.autopsy.ingest.DataSourceIngestModuleProgress;
|
||||
import org.sleuthkit.autopsy.ingest.IngestJobContext;
|
||||
import org.sleuthkit.autopsy.ingest.IngestModule;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||
import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
|
||||
@ -52,6 +54,11 @@ import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
|
||||
import org.sleuthkit.datamodel.Content;
|
||||
import org.sleuthkit.datamodel.TskCoreException;
|
||||
|
||||
/**
|
||||
* Analyzes a URL to determine if the url host is one that handles messages
|
||||
* (i.e. webmail, disposable mail). If found, a domain category type artifact is
|
||||
* created.
|
||||
*/
|
||||
@Messages({
|
||||
"MessageURLAnalyzer_moduleName_text=MessageURLAnalyzer",
|
||||
"MessageURLAnalyzer_Progress_Message_Find_Message_URLs=Finding Messaging Domains",
|
||||
@ -59,6 +66,9 @@ import org.sleuthkit.datamodel.TskCoreException;
|
||||
})
|
||||
class MessageURLAnalyzer extends Extract {
|
||||
|
||||
/**
|
||||
* The message service type (i.e. webmail, disposable mail).
|
||||
*/
|
||||
@Messages({
|
||||
"MessageType_disposableMail_displayName=Disposable Email",
|
||||
"MessageType_webmail_displayName=Web Email"
|
||||
@ -70,25 +80,51 @@ class MessageURLAnalyzer extends Extract {
|
||||
private final String csvId;
|
||||
private final String attrDisplayName;
|
||||
|
||||
/**
|
||||
* Main constructor.
|
||||
*
|
||||
* @param csvId The identifier within the csv for this type.
|
||||
* @param attrDisplayName The display name in the artifact for this
|
||||
* domain category.
|
||||
*/
|
||||
private MessageType(String csvId, String attrDisplayName) {
|
||||
this.csvId = csvId;
|
||||
this.attrDisplayName = attrDisplayName;
|
||||
}
|
||||
|
||||
public String getCsvId() {
|
||||
/**
|
||||
* @return The identifier within the csv for this type.
|
||||
*/
|
||||
String getCsvId() {
|
||||
return csvId;
|
||||
}
|
||||
|
||||
public String getAttrDisplayName() {
|
||||
/**
|
||||
* @return The display name in the artifact for this domain category.
|
||||
*/
|
||||
String getAttrDisplayName() {
|
||||
return attrDisplayName;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A node in the trie indicating a domain suffix token. For instance, the
|
||||
* csv entry: "hotmail.com,webmail" would get parsed to a node, "com" having
|
||||
* a child of "hotmail". That child node, as a leaf, would have a webmail
|
||||
* message type.
|
||||
*/
|
||||
private static class MessageDomainTrieNode {
|
||||
|
||||
private final Map<String, MessageDomainTrieNode> children = new HashMap<>();
|
||||
private MessageType messageType = null;
|
||||
|
||||
/**
|
||||
* Retrieves the child node of the given key. If that child key does not
|
||||
* exist, a child node of that key is created and returned.
|
||||
*
|
||||
* @param childKey The key for the child (i.e. "com").
|
||||
* @return The retrieved or newly created child node.
|
||||
*/
|
||||
MessageDomainTrieNode getOrAddChild(String childKey) {
|
||||
MessageDomainTrieNode child = children.get(childKey);
|
||||
if (child == null) {
|
||||
@ -99,19 +135,40 @@ class MessageURLAnalyzer extends Extract {
|
||||
return child;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the child node of the given key or returns null if child
|
||||
* does not exist.
|
||||
*
|
||||
* @param childKey The key for the child node (i.e. "com").
|
||||
* @return The child node or null if it does not exist.
|
||||
*/
|
||||
MessageDomainTrieNode getChild(String childKey) {
|
||||
return children.get(childKey);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return If this is a leaf node, the type of message for this node.
|
||||
*/
|
||||
MessageType getMessageType() {
|
||||
return messageType;
|
||||
}
|
||||
|
||||
/**
|
||||
* If this is a leaf node, this sets the message type for this node.
|
||||
*
|
||||
* @param messageType The message type for this leaf node.
|
||||
*/
|
||||
void setMessageType(MessageType messageType) {
|
||||
this.messageType = messageType;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads the trie of suffixes from the csv resource file.
|
||||
*
|
||||
* @return The root trie node.
|
||||
* @throws IOException
|
||||
*/
|
||||
private static MessageDomainTrieNode loadTrie() throws IOException {
|
||||
try (InputStream is = MessageURLAnalyzer.class.getResourceAsStream(MESSAGE_TYPE_CSV);
|
||||
InputStreamReader isReader = new InputStreamReader(is, StandardCharsets.UTF_8);
|
||||
@ -131,13 +188,27 @@ class MessageURLAnalyzer extends Extract {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a trie node based on the csv line.
|
||||
*
|
||||
* @param trie The root trie node.
|
||||
* @param line The line to be parsed.
|
||||
* @param lineNumber The line number of this csv line.
|
||||
*/
|
||||
private static void addItem(MessageDomainTrieNode trie, String line, int lineNumber) {
|
||||
// make sure this isn't a blank line.
|
||||
if (StringUtils.isBlank(line)) {
|
||||
return;
|
||||
}
|
||||
|
||||
String[] csvItems = line.split(CSV_DELIMITER);
|
||||
// line should be a key value pair
|
||||
if (csvItems.length < 2) {
|
||||
logger.log(Level.WARNING, String.format("Unable to properly parse line of \"%s\" at line %d", line, lineNumber));
|
||||
return;
|
||||
}
|
||||
|
||||
// determine the message type from the value, and return if can't be determined.
|
||||
String messageTypeStr = csvItems[1].trim();
|
||||
|
||||
MessageType messageType = (StringUtils.isNotBlank(messageTypeStr))
|
||||
@ -152,6 +223,7 @@ class MessageURLAnalyzer extends Extract {
|
||||
return;
|
||||
}
|
||||
|
||||
// gather the domainSuffix and parse into domain trie tokens
|
||||
String domainSuffix = csvItems[0];
|
||||
if (StringUtils.isBlank(domainSuffix)) {
|
||||
logger.log(Level.WARNING, String.format("Could not determine domain suffix for this line: \"%s\" at line %d", line, lineNumber));
|
||||
@ -160,6 +232,7 @@ class MessageURLAnalyzer extends Extract {
|
||||
|
||||
String[] domainTokens = domainSuffix.trim().toLowerCase().split(DELIMITER);
|
||||
|
||||
// add into the trie
|
||||
MessageDomainTrieNode node = trie;
|
||||
for (int i = domainTokens.length - 1; i >= 0; i--) {
|
||||
String token = domainTokens[i];
|
||||
@ -174,23 +247,6 @@ class MessageURLAnalyzer extends Extract {
|
||||
|
||||
}
|
||||
|
||||
private static MessageDomainTrieNode trieSingleton = null;
|
||||
private static final Object trieLock = new Object();
|
||||
|
||||
private static MessageDomainTrieNode getTrie() {
|
||||
synchronized (trieLock) {
|
||||
if (trieSingleton == null) {
|
||||
try {
|
||||
trieSingleton = loadTrie();
|
||||
} catch (IOException ex) {
|
||||
logger.log(Level.SEVERE, "Unable to load message domain csv", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return trieSingleton;
|
||||
}
|
||||
|
||||
// Character for joining domain segments.
|
||||
private static final String JOINER = ".";
|
||||
// delimiter when used with regex for domains
|
||||
@ -218,19 +274,30 @@ class MessageURLAnalyzer extends Extract {
|
||||
|
||||
private static final Logger logger = Logger.getLogger(MessageURLAnalyzer.class.getName());
|
||||
|
||||
private final MessageDomainTrieNode rootTrie;
|
||||
// the root node for the trie containing suffixes for domain categories.
|
||||
private MessageDomainTrieNode rootTrie = null;
|
||||
|
||||
private Content dataSource;
|
||||
private IngestJobContext context;
|
||||
|
||||
/**
|
||||
* Main constructor.
|
||||
*/
|
||||
MessageURLAnalyzer() {
|
||||
moduleName = null;
|
||||
rootTrie = getTrie();
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempts to determine the host from the url string. If none can be
|
||||
* determined, returns null.
|
||||
*
|
||||
* @param urlString The url string.
|
||||
* @return The host or null if cannot be determined.
|
||||
*/
|
||||
private String getHost(String urlString) {
|
||||
String host = null;
|
||||
try {
|
||||
// try first using the built-in url class to determine the host.
|
||||
URL url = new URL(urlString);
|
||||
if (url != null) {
|
||||
host = url.getHost();
|
||||
@ -239,6 +306,7 @@ class MessageURLAnalyzer extends Extract {
|
||||
// ignore this and go to fallback regex
|
||||
}
|
||||
|
||||
// if the built-in url parsing doesn't work, then use more flexible regex.
|
||||
if (StringUtils.isBlank(host)) {
|
||||
Matcher m = URL_REGEX.matcher(urlString);
|
||||
if (m.find()) {
|
||||
@ -249,23 +317,37 @@ class MessageURLAnalyzer extends Extract {
|
||||
return host;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if the host is a message type domain. If so, returns the
|
||||
* portion of the host suffix that signifies the message domain (i.e.
|
||||
* "hotmail.com" or "mail.google.com") and the message type.
|
||||
*
|
||||
* @param host The host.
|
||||
* @return A pair of the host suffix and message type for that suffix if
|
||||
* found. Otherwise, returns null.
|
||||
*/
|
||||
private Pair<String, MessageType> findHostSuffix(String host) {
|
||||
// if no host, return none.
|
||||
if (StringUtils.isBlank(host)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// parse the tokens splitting on delimiter
|
||||
List<String> tokens = Stream.of(host.toLowerCase().split(DELIMITER))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
MessageDomainTrieNode node = rootTrie;
|
||||
// the root node is null indicating we won't be able to do a lookup.
|
||||
if (node == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// iterate through tokens in reverse order
|
||||
int idx = tokens.size() - 1;
|
||||
for (; idx >= 0; idx--) {
|
||||
node = node.getChild(tokens.get(idx));
|
||||
// if we hit a leaf node or we have no matching child node, continue.
|
||||
if (node == null || node.getMessageType() != null) {
|
||||
break;
|
||||
}
|
||||
@ -276,7 +358,8 @@ class MessageURLAnalyzer extends Extract {
|
||||
if (messageType == null) {
|
||||
return null;
|
||||
} else {
|
||||
// the index to be included should be one higher than last index (that was
|
||||
// if there is a message type, we have a result. Concatenate the
|
||||
// appropriate domain tokens and return.
|
||||
int minIndex = Math.max(0, idx);
|
||||
List<String> subList = tokens.subList(minIndex, tokens.size());
|
||||
String hostSuffix = String.join(JOINER, subList);
|
||||
@ -284,6 +367,11 @@ class MessageURLAnalyzer extends Extract {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Goes through web history artifacts and attempts to determine any hosts of
|
||||
* a message type. If any are found, a TSK_DOMAIN_CATEGORY artifact is
|
||||
* created (at most one per host suffix).
|
||||
*/
|
||||
private void findMessageDomains() {
|
||||
if (this.rootTrie == null) {
|
||||
logger.log(Level.SEVERE, "Not analyzing message domain. No root trie loaded.");
|
||||
@ -292,44 +380,54 @@ class MessageURLAnalyzer extends Extract {
|
||||
|
||||
int artifactsAnalyzed = 0;
|
||||
int messageDomainInstancesFound = 0;
|
||||
|
||||
// only one suffix per ingest is captured so this tracks the suffixes seen.
|
||||
Set<String> domainSuffixesSeen = new HashSet<>();
|
||||
|
||||
try {
|
||||
//from blackboard_artifacts
|
||||
Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
|
||||
Arrays.asList(new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_HISTORY)),
|
||||
Arrays.asList(dataSource.getId()));
|
||||
|
||||
logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
|
||||
|
||||
|
||||
for (BlackboardArtifact artifact : listArtifacts) {
|
||||
// make sure we haven't cancelled
|
||||
if (context.dataSourceIngestIsCancelled()) {
|
||||
break; //User cancelled the process.
|
||||
}
|
||||
|
||||
// make sure there is attached file
|
||||
AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
|
||||
if (file == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// get the url string from the artifact
|
||||
BlackboardAttribute urlAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
|
||||
if (urlAttr == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
String urlString = urlAttr.getValueString();
|
||||
|
||||
// atempt to get the host from the url provided.
|
||||
String host = getHost(urlString);
|
||||
if (StringUtils.isBlank(host)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// if we reached this point, we are at least analyzing this item
|
||||
artifactsAnalyzed++;
|
||||
|
||||
// attempt to get the message type for the host using the suffix trie
|
||||
Pair<String, MessageType> messageEntryFound = findHostSuffix(host);
|
||||
if (messageEntryFound == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// if we got this far, we found a message domain, but it may not be unique
|
||||
messageDomainInstancesFound++;
|
||||
|
||||
String hostSuffix = messageEntryFound.getLeft();
|
||||
@ -338,6 +436,8 @@ class MessageURLAnalyzer extends Extract {
|
||||
continue;
|
||||
}
|
||||
|
||||
// if we got this far, this is a unique suffix. Add to the set, so we don't create
|
||||
// multiple of same suffix and add an artifact.
|
||||
domainSuffixesSeen.add(hostSuffix);
|
||||
|
||||
String moduleName = Bundle.MessageURLAnalyzer_parentModuleName();
|
||||
@ -370,6 +470,15 @@ class MessageURLAnalyzer extends Extract {
|
||||
this.findMessageDomains();
|
||||
}
|
||||
|
||||
@Override
|
||||
void configExtractor() throws IngestModule.IngestModuleException {
|
||||
try {
|
||||
this.rootTrie = loadTrie();
|
||||
} catch (IOException ex) {
|
||||
throw new IngestModule.IngestModuleException("Unable to load message type csv for domain category analysis", ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void complete() {
|
||||
logger.info("Search Engine URL Query Analyzer has completed."); //NON-NLS
|
||||
|
Loading…
x
Reference in New Issue
Block a user