mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-17 18:17:43 +00:00
Merge pull request #2497 from millmanorama/2258-look-at-content-and-filename-seperately
2258 look at content and filename separately
This commit is contained in:
commit
3f92ff0411
@ -23,7 +23,6 @@ import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.logging.Level;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
@ -223,20 +222,24 @@ class LuceneQuery implements KeywordSearchQuery {
|
||||
* will get picked up in the next one. */
|
||||
final String docId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString();
|
||||
final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
|
||||
String content_str = Objects.toString(resultDoc.get(Server.Schema.CONTENT_STR.toString()), null);
|
||||
final Collection<Object> content = resultDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
|
||||
|
||||
double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
|
||||
if (indexSchemaVersion < 2.0) {
|
||||
//old schema versions don't support chunk_size or the content_str fields, so just accept hits
|
||||
matches.add(createKeywordtHit(highlightResponse, docId));
|
||||
} else {
|
||||
//check against file name and actual content separately.
|
||||
for (Object content_obj : content) {
|
||||
String content_str = (String) content_obj;
|
||||
//for new schemas, check that the hit is before the chunk/window boundary.
|
||||
int firstOccurence = StringUtils.indexOf(content_str, strippedQueryString);
|
||||
int firstOccurence = StringUtils.indexOf(content_str.toLowerCase(), strippedQueryString.toLowerCase());
|
||||
//there is no chunksize field for "parent" entries in the index
|
||||
if (chunkSize != null && firstOccurence < chunkSize) {
|
||||
if (chunkSize == null || chunkSize == 0 || (firstOccurence > -1 && firstOccurence < chunkSize)) {
|
||||
matches.add(createKeywordtHit(highlightResponse, docId));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (TskException ex) {
|
||||
return matches;
|
||||
}
|
||||
|
@ -54,16 +54,16 @@ import org.sleuthkit.datamodel.TskData;
|
||||
import org.sleuthkit.datamodel.TskException;
|
||||
|
||||
/**
|
||||
* The RegexQuery class supports issuing regular expression queries
|
||||
* against a Lucene index. It relies on the fact that content is
|
||||
* stored in its original form in a "string" field (Server.Schema.CONTENT_STR).
|
||||
* To indicate to Lucene that these are regular expression queries, the query
|
||||
* string must be surrounded by '/' characters. Additionally, the characters
|
||||
* ".*" need to be added both before and after the search term to get hits
|
||||
* in the middle of text.
|
||||
* The RegexQuery class supports issuing regular expression queries against a
|
||||
* Lucene index. It relies on the fact that content is stored in its original
|
||||
* form in a "string" field (Server.Schema.CONTENT_STR). To indicate to Lucene
|
||||
* that these are regular expression queries, the query string must be
|
||||
* surrounded by '/' characters. Additionally, the characters ".*" need to be
|
||||
* added both before and after the search term to get hits in the middle of
|
||||
* text.
|
||||
*
|
||||
* Regular expression syntax supported by Lucene is not the same as Java
|
||||
* regular expression syntax. The Lucene syntax is documented here:
|
||||
* Regular expression syntax supported by Lucene is not the same as Java regular
|
||||
* expression syntax. The Lucene syntax is documented here:
|
||||
*
|
||||
* https://lucene.apache.org/core/5_0_0/core/org/apache/lucene/util/automaton/RegExp.html
|
||||
*/
|
||||
@ -160,19 +160,18 @@ final class RegexQuery implements KeywordSearchQuery {
|
||||
|
||||
/**
|
||||
* The provided regular expression may include wildcards at the
|
||||
* beginning and/or end. These wildcards are used to indicate that
|
||||
* the user wants to find hits for the regex that are embedded
|
||||
* within other characters. For example, if we are given .*127.0.0.1.*
|
||||
* as a regular expression, this will produce hits for:
|
||||
* (a) " 127.0.0.1 " as a standalone token (surrounded by whitespace).
|
||||
* (b) "abc127.0.0.1def" where the IP address is surrounded by other characters.
|
||||
* beginning and/or end. These wildcards are used to indicate that the
|
||||
* user wants to find hits for the regex that are embedded within other
|
||||
* characters. For example, if we are given .*127.0.0.1.* as a regular
|
||||
* expression, this will produce hits for: (a) " 127.0.0.1 " as a
|
||||
* standalone token (surrounded by whitespace). (b) "abc127.0.0.1def"
|
||||
* where the IP address is surrounded by other characters.
|
||||
*
|
||||
* If we are given this type of regex, we do not need to add our own
|
||||
* wildcards to anchor the query. Otherwise, we need to add wildcard
|
||||
* anchors because Lucene string regex searches default to using ^ and $
|
||||
* to match the entire string.
|
||||
*/
|
||||
|
||||
// We construct the query by surrounding it with slashes (to indicate it is
|
||||
// a regular expression search) and .* as anchors (if the query doesn't
|
||||
// already have them).
|
||||
@ -239,7 +238,7 @@ final class RegexQuery implements KeywordSearchQuery {
|
||||
final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();
|
||||
final Integer chunkSize = (Integer) solrDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
|
||||
|
||||
String content = solrDoc.getOrDefault(Server.Schema.CONTENT_STR.toString(), "").toString(); //NON-NLS
|
||||
final Collection<Object> content_str = solrDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
|
||||
|
||||
// By default, we create keyword hits on whitespace or punctuation character boundaries.
|
||||
// Having a set of well defined boundary characters produces hits that can
|
||||
@ -248,15 +247,17 @@ final class RegexQuery implements KeywordSearchQuery {
|
||||
// This behavior can be overridden by the user if they give us a search string
|
||||
// with .* at either the start and/or end of the string. This basically tells us to find
|
||||
// all hits instead of the ones surrounded by one of our boundary characters.
|
||||
String keywordTokenRegex =
|
||||
// If the given search string starts with .*, we ignore our default
|
||||
String keywordTokenRegex
|
||||
= // If the given search string starts with .*, we ignore our default
|
||||
// boundary prefix characters
|
||||
(queryStringContainsWildcardPrefix ? "" : BOUNDARY_CHARS) //NON-NLS
|
||||
(queryStringContainsWildcardPrefix ? "" : "(^|" + BOUNDARY_CHARS + ")") //NON-NLS
|
||||
+ keywordString
|
||||
// If the given search string ends with .*, we ignore our default
|
||||
// boundary suffix characters
|
||||
+ (queryStringContainsWildcardSuffix ? "" : BOUNDARY_CHARS); //NON-NLS
|
||||
+ (queryStringContainsWildcardSuffix ? "" : "($|" + BOUNDARY_CHARS + ")"); //NON-NLS
|
||||
|
||||
for (Object content_obj : content_str) {
|
||||
String content = (String) content_obj;
|
||||
Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content);
|
||||
int offset = 0;
|
||||
|
||||
@ -301,8 +302,8 @@ final class RegexQuery implements KeywordSearchQuery {
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the snippet from the document if keyword search is configured
|
||||
* to use snippets.
|
||||
* Get the snippet from the document if keyword search is
|
||||
* configured to use snippets.
|
||||
*/
|
||||
if (KeywordSearchSettings.getShowSnippets()) {
|
||||
int maxIndex = content.length() - 1;
|
||||
@ -315,6 +316,7 @@ final class RegexQuery implements KeywordSearchQuery {
|
||||
|
||||
hits.add(new KeywordHit(docId, snippet.toString(), hit));
|
||||
}
|
||||
}
|
||||
return hits;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user