mirror of https://github.com/overcuriousity/autopsy-flatpak.git

commit 3f9b161a71 (parent 92d27e0838)

    look at the values of content_str field separately: title/content
LuceneQuery.java
@@ -23,7 +23,6 @@ import java.util.Collection;
 import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
-import java.util.Objects;
 import java.util.logging.Level;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.math.NumberUtils;
@@ -223,18 +222,21 @@ class LuceneQuery implements KeywordSearchQuery {
                      * will get picked up in the next one. */
                     final String docId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString();
                     final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
-                    String content_str = Objects.toString(resultDoc.get(Server.Schema.CONTENT_STR.toString()), null);
+                    final ArrayList<String> get = (ArrayList<String>) resultDoc.get(Server.Schema.CONTENT_STR.toString());
 
                     double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
                     if (indexSchemaVersion < 2.0) {
                         //old schema versions don't support chunk_size or the content_str fields, so just accept hits
                         matches.add(createKeywordtHit(highlightResponse, docId));
                     } else {
-                        //for new schemas, check that the hit is before the chunk/window boundary.
-                        int firstOccurence = StringUtils.indexOf(content_str, strippedQueryString);
-                        //there is no chunksize field for "parent" entries in the index
-                        if (chunkSize != null && firstOccurence < chunkSize) {
-                            matches.add(createKeywordtHit(highlightResponse, docId));
+                        //check against file name and actual content seperately.
+                        for (String content_str : get) {
+                            //for new schemas, check that the hit is before the chunk/window boundary.
+                            int firstOccurence = StringUtils.indexOf(content_str, strippedQueryString);
+                            //there is no chunksize field for "parent" entries in the index
+                            if (chunkSize != null && firstOccurence > -1 && firstOccurence < chunkSize) {
+                                matches.add(createKeywordtHit(highlightResponse, docId));
+                            }
                         }
                     }
                 } catch (TskException ex) {
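The behavioral change in this hunk: CONTENT_STR now comes back as a multi-valued field (title and content are indexed as separate values), so the hit check runs once per value. The added firstOccurence > -1 guard also matters, because StringUtils.indexOf returns -1 when the term is absent, and -1 < chunkSize would have accepted a miss. A minimal sketch of the new acceptance rule (the helper name and signature are mine, not the file's):

    import java.util.List;
    import org.apache.commons.lang3.StringUtils;

    // Hypothetical distillation of the loop above: accept the document only if
    // some content_str value contains the query before the chunk/window boundary.
    static boolean acceptHit(List<String> contentStrValues, String strippedQueryString, Integer chunkSize) {
        for (String contentStr : contentStrValues) {
            int firstOccurence = StringUtils.indexOf(contentStr, strippedQueryString); // -1 when absent
            if (chunkSize != null && firstOccurence > -1 && firstOccurence < chunkSize) {
                return true;
            }
        }
        return false;
    }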
RegexQuery.java
@@ -54,16 +54,16 @@ import org.sleuthkit.datamodel.TskData;
 import org.sleuthkit.datamodel.TskException;
 
 /**
- * The RegexQuery class supports issuing regular expression queries
- * against a Lucene index. It relies on the fact that content is
- * stored in it's original form in a "string" field (Server.Schema.CONTENT_STR).
- * To indicate to Lucene that these are regular expression queries, the query
- * string must be surrounded by '/' characters. Additionally, the characters
- * ".*" need to be added both before and after the search term to get hits
- * in the middle of text.
+ * The RegexQuery class supports issuing regular expression queries against a
+ * Lucene index. It relies on the fact that content is stored in it's original
+ * form in a "string" field (Server.Schema.CONTENT_STR). To indicate to Lucene
+ * that these are regular expression queries, the query string must be
+ * surrounded by '/' characters. Additionally, the characters ".*" need to be
+ * added both before and after the search term to get hits in the middle of
+ * text.
  *
- * Regular expression syntax supported by Lucene is not the same as Java
- * regular expression syntax. The Lucene syntax is documented here:
+ * Regular expression syntax supported by Lucene is not the same as Java regular
+ * expression syntax. The Lucene syntax is documented here:
  *
  * https://lucene.apache.org/core/5_0_0/core/org/apache/lucene/util/automaton/RegExp.html
  */
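To make the javadoc concrete (example values are mine, not from the commit): a regular-expression search for an IP address would be sent to Solr as a slash-delimited, wildcard-anchored query against the string field, along these lines:

    // Sketch only; whether the field prefix is attached exactly like this is an assumption.
    String term = "127\\.0\\.0\\.1";                       // Lucene regex, escaped for Java
    String fielded = Server.Schema.CONTENT_STR.toString()  // "content_str"
            + ":/.*" + term + ".*/";                       // content_str:/.*127\.0\.0\.1.*/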
@@ -94,7 +94,7 @@ final class RegexQuery implements KeywordSearchQuery {
     // See https://lucene.apache.org/core/6_4_0/core/org/apache/lucene/util/automaton/RegExp.html
     // for Lucene syntax.
     // We use \p as a shortcut for all of the character classes of the form \p{XXX}.
     private static final CharSequence[] UNSUPPORTED_CHARS = {"\\d", "\\D", "\\w", "\\W", "\\s", "\\S", "\\n",
         "\\t", "\\r", "\\f", "\\a", "\\e", "\\v", "\\V", "\\h", "\\H", "\\p"}; //NON-NLS
 
     private boolean queryStringContainsWildcardPrefix = false;
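The UNSUPPORTED_CHARS list exists because Lucene's RegExp syntax, unlike java.util.regex, has no Perl-style classes such as \d or \s, so a Java-flavored pattern must be rewritten before it reaches Solr. An illustrative one-liner (the rewrite rule here is mine, not the module's actual handling):

    // Translate the Java character class \d into a Lucene-safe range.
    String javaRegex = "\\d{3}-\\d{4}";                     // fine for java.util.regex
    String luceneRegex = javaRegex.replace("\\d", "[0-9]"); // "[0-9]{3}-[0-9]{4}"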
@@ -160,19 +160,18 @@ final class RegexQuery implements KeywordSearchQuery {
 
         /**
          * The provided regular expression may include wildcards at the
-         * beginning and/or end. These wildcards are used to indicate that
-         * the user wants to find hits for the regex that are embedded
-         * within other characters. For example, if we are given .*127.0.0.1.*
-         * as a regular expression, this will produce hits for:
-         * (a) " 127.0.0.1 " as a standalone token (surrounded by whitespace).
-         * (b) "abc127.0.0.1def" where the IP address is surrounded by other characters.
+         * beginning and/or end. These wildcards are used to indicate that the
+         * user wants to find hits for the regex that are embedded within other
+         * characters. For example, if we are given .*127.0.0.1.* as a regular
+         * expression, this will produce hits for: (a) " 127.0.0.1 " as a
+         * standalone token (surrounded by whitespace). (b) "abc127.0.0.1def"
+         * where the IP address is surrounded by other characters.
          *
          * If we are given this type of regex, we do not need to add our own
          * wildcards to anchor the query. Otherwise, we need to add wildcard
          * anchors because Lucene string regex searches default to using ^ and $
         * to match the entire string.
          */
-
         // We construct the query by surrounding it with slashes (to indicate it is
         // a regular expression search) and .* as anchors (if the query doesn't
         // already have them).
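Taken together, the javadoc and the comment describe a simple rule: wrap the expression in '/' delimiters and add ".*" only on the sides the user left unanchored. A sketch under those assumptions (local variables stand in for the class's fields):

    boolean queryStringContainsWildcardPrefix = keywordString.startsWith(".*");
    boolean queryStringContainsWildcardSuffix = keywordString.endsWith(".*");
    String queryString = "/"
            + (queryStringContainsWildcardPrefix ? "" : ".*")
            + keywordString
            + (queryStringContainsWildcardSuffix ? "" : ".*")
            + "/"; // ".*127.0.0.1.*" -> "/.*127.0.0.1.*/"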
@@ -193,7 +192,7 @@ final class RegexQuery implements KeywordSearchQuery {
         solrQuery.setRows(MAX_RESULTS);
         // Setting the sort order is necessary for cursor based paging to work.
         solrQuery.setSort(SortClause.asc(Server.Schema.ID.toString()));
 
         String cursorMark = CursorMarkParams.CURSOR_MARK_START;
         SolrDocumentList resultList = null;
         boolean allResultsProcessed = false;
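These lines set up standard SolrJ cursor paging: sort on the unique ID field, pass the cursor mark with each request, and stop once the returned mark stops advancing. A generic sketch assembled from the identifiers visible in this hunk (solrServer and the surrounding method are assumed):

    // May throw SolrServerException/IOException; handling omitted in this sketch.
    solrQuery.setRows(MAX_RESULTS);
    solrQuery.setSort(SortClause.asc(Server.Schema.ID.toString())); // required for cursors
    String cursorMark = CursorMarkParams.CURSOR_MARK_START;
    boolean allResultsProcessed = false;
    while (!allResultsProcessed) {
        solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
        QueryResponse response = solrServer.query(solrQuery);
        // ... process response.getResults() ...
        String nextCursorMark = response.getNextCursorMark();
        if (cursorMark.equals(nextCursorMark)) {
            allResultsProcessed = true; // cursor did not advance: no more pages
        }
        cursorMark = nextCursorMark;
    }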
@@ -214,7 +213,7 @@ final class RegexQuery implements KeywordSearchQuery {
                     //
                 }
             }
 
             String nextCursorMark = response.getNextCursorMark();
             if (cursorMark.equals(nextCursorMark)) {
                 allResultsProcessed = true;
@@ -225,7 +224,7 @@ final class RegexQuery implements KeywordSearchQuery {
             MessageNotifyUtil.Notify.error(NbBundle.getMessage(Server.class, "Server.query.exception.msg", keywordString), ex.getCause().getMessage());
         }
     }
 
     for (Keyword k : hitsMultMap.keySet()) {
         results.addResult(k, hitsMultMap.get(k));
     }
@@ -239,7 +238,7 @@ final class RegexQuery implements KeywordSearchQuery {
         final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();
         final Integer chunkSize = (Integer) solrDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
 
-        String content = solrDoc.getOrDefault(Server.Schema.CONTENT_STR.toString(), "").toString(); //NON-NLS
+        ArrayList<String> content_str = (ArrayList<String>) solrDoc.get(Server.Schema.CONTENT_STR.toString());
 
         // By default, we create keyword hits on whitespace or punctuation character boundaries.
         // Having a set of well defined boundary characters produces hits that can
@@ -248,8 +247,8 @@ final class RegexQuery implements KeywordSearchQuery {
         // This behavior can be overridden by the user if they give us a search string
         // with .* at either the start and/or end of the string. This basically tells us find
         // all hits instead of the ones surrounded by one of our boundary characters.
-        String keywordTokenRegex =
-                // If the given search string starts with .*, we ignore our default
+        String keywordTokenRegex
+                = // If the given search string starts with .*, we ignore our default
                 // boundary prefix characters
                 (queryStringContainsWildcardPrefix ? "" : BOUNDARY_CHARS) //NON-NLS
                 + keywordString
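A quick demonstration of why the hits produced by this token regex need the trimming seen in the next hunk: the boundary characters are part of the match itself. The BOUNDARY_CHARS value below is an assumption for illustration; the real constant is defined elsewhere in this file:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    String BOUNDARY_CHARS = "[ \\t\\r\\n\\.,;:]"; // assumed boundary class
    String keywordTokenRegex = BOUNDARY_CHARS + "10\\.0\\.0\\.1" + BOUNDARY_CHARS;
    Matcher m = Pattern.compile(keywordTokenRegex).matcher("see 10.0.0.1, then move on");
    if (m.find()) {
        String hit = m.group();                         // " 10.0.0.1," -- boundaries included
        hit = hit.replaceAll("^" + BOUNDARY_CHARS, ""); // strip leading boundary
        hit = hit.replaceAll(BOUNDARY_CHARS + "$", ""); // "10.0.0.1"
    }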
@@ -257,63 +256,65 @@ final class RegexQuery implements KeywordSearchQuery {
                 // boundary suffix characters
                 + (queryStringContainsWildcardSuffix ? "" : BOUNDARY_CHARS); //NON-NLS
 
-        Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content);
-        int offset = 0;
+        for (String content : content_str) {
+            Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content);
+            int offset = 0;
 
             while (hitMatcher.find(offset)) {
                 StringBuilder snippet = new StringBuilder();
 
                 //"parent" entries in the index don't have chunk size, so just accept those hits
                 if (chunkSize != null && hitMatcher.start() >= chunkSize) {
                     break;
                 }
 
                 String hit = hitMatcher.group();
 
                 // Back the matcher offset up by 1 character as it will have eaten
                 // a single space/newline/other boundary character at the end of the hit.
                 // This was causing us to miss hits that appeared consecutively in the
                 // input where they were separated by a single boundary character.
                 offset = hitMatcher.end() - 1;
 
                 // Remove any remaining leading and trailing boundary characters.
                 if (!queryStringContainsWildcardPrefix) {
                     hit = hit.replaceAll("^" + BOUNDARY_CHARS, ""); //NON-NLS
                 }
                 if (!queryStringContainsWildcardSuffix) {
                     hit = hit.replaceAll(BOUNDARY_CHARS + "$", ""); //NON-NLS
                 }
 
                 /*
                  * If searching for credit card account numbers, do a Luhn check
                  * on the term and discard it if it does not pass.
                  */
                 if (keyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
                     Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit);
                     if (ccnMatcher.find()) {
                         final String ccn = CharMatcher.anyOf(" -").removeFrom(ccnMatcher.group("ccn"));
                         if (false == TermsComponentQuery.CREDIT_CARD_NUM_LUHN_CHECK.isValid(ccn)) {
+                            continue;
+                        }
+                    } else {
                         continue;
                     }
-                } else {
-                    continue;
                 }
-            }
 
                 /**
-                 * Get the snippet from the document if keyword search is configured
-                 * to use snippets.
+                 * Get the snippet from the document if keyword search is
+                 * configured to use snippets.
                  */
                 if (KeywordSearchSettings.getShowSnippets()) {
                     int maxIndex = content.length() - 1;
                     snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 20), Integer.max(0, hitMatcher.start() + 1)));
                     snippet.appendCodePoint(171);
                     snippet.append(hit);
                     snippet.appendCodePoint(171);
                     snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end() - 1), Integer.min(maxIndex, hitMatcher.end() + 20)));
                 }
 
                 hits.add(new KeywordHit(docId, snippet.toString(), hit));
+            }
         }
         return hits;
     }
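A closing note on the Luhn filter restructured above: CharMatcher (Guava) strips spaces and dashes before validation, and the check rejects candidates whose digits fail the Luhn checksum. A self-contained sketch, assuming CREDIT_CARD_NUM_LUHN_CHECK is a commons-validator LuhnCheckDigit (the real constant lives in TermsComponentQuery):

    import com.google.common.base.CharMatcher;
    import org.apache.commons.validator.routines.checkdigit.LuhnCheckDigit;

    static boolean passesLuhn(String candidate) {
        String ccn = CharMatcher.anyOf(" -").removeFrom(candidate); // "4111 1111 1111 1111" -> "4111111111111111"
        return new LuhnCheckDigit().isValid(ccn);                   // true for this test number
    }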