Use domain validator instead of list of file extensions to reduce email false positives.

This commit is contained in:
esaunders 2017-02-22 17:50:58 -05:00
parent da9b27d170
commit 5225a0e47d

View File

@ -32,6 +32,7 @@ import java.util.logging.Level;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.apache.commons.validator.routines.DomainValidator;
import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause; import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.SolrRequest;
@ -83,7 +84,6 @@ final class RegexQuery implements KeywordSearchQuery {
private String escapedQuery; private String escapedQuery;
private final int MIN_EMAIL_ADDR_LENGTH = 8; private final int MIN_EMAIL_ADDR_LENGTH = 8;
private final Pattern INVALID_EMAIL_PATTERN = Pattern.compile(".*\\.(dll|txt|exe|jpg|xml)$");
private final ListMultimap<Keyword, KeywordHit> hitsMultiMap = ArrayListMultimap.create(); private final ListMultimap<Keyword, KeywordHit> hitsMultiMap = ArrayListMultimap.create();
@ -281,9 +281,10 @@ final class RegexQuery implements KeywordSearchQuery {
if (originalKeyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_EMAIL) { if (originalKeyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_EMAIL) {
// Reduce false positives by eliminating email address hits that are either // Reduce false positives by eliminating email address hits that are either
// too short or end with well known file extensions. // too short or are not for valid top level domains.
if (hit.length() < MIN_EMAIL_ADDR_LENGTH || INVALID_EMAIL_PATTERN.matcher(hit).matches()) { if (hit.length() < MIN_EMAIL_ADDR_LENGTH
break; || !DomainValidator.getInstance(true).isValidTld(hit.substring(hit.lastIndexOf('.')))) {
continue;
} }
} }