look at the values of content_str field separately: title/content

This commit is contained in:
millmanorama 2017-02-01 16:48:52 +01:00
parent 92d27e0838
commit 3f9b161a71
2 changed files with 79 additions and 76 deletions

View File

@ -23,7 +23,6 @@ import java.util.Collection;
import java.util.Comparator; import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import java.util.logging.Level; import java.util.logging.Level;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils; import org.apache.commons.lang3.math.NumberUtils;
@ -223,20 +222,23 @@ class LuceneQuery implements KeywordSearchQuery {
* will get picked up in the next one. */ * will get picked up in the next one. */
final String docId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString(); final String docId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString();
final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString()); final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
String content_str = Objects.toString(resultDoc.get(Server.Schema.CONTENT_STR.toString()), null); final ArrayList<String> get = (ArrayList<String>) resultDoc.get(Server.Schema.CONTENT_STR.toString());
double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion()); double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
if (indexSchemaVersion < 2.0) { if (indexSchemaVersion < 2.0) {
//old schema versions don't support chunk_size or the content_str fields, so just accept hits //old schema versions don't support chunk_size or the content_str fields, so just accept hits
matches.add(createKeywordtHit(highlightResponse, docId)); matches.add(createKeywordtHit(highlightResponse, docId));
} else { } else {
//check against file name and actual content separately.
for (String content_str : get) {
//for new schemas, check that the hit is before the chunk/window boundary. //for new schemas, check that the hit is before the chunk/window boundary.
int firstOccurence = StringUtils.indexOf(content_str, strippedQueryString); int firstOccurence = StringUtils.indexOf(content_str, strippedQueryString);
//there is no chunksize field for "parent" entries in the index //there is no chunksize field for "parent" entries in the index
if (chunkSize != null && firstOccurence < chunkSize) { if (chunkSize != null && firstOccurence > -1 && firstOccurence < chunkSize) {
matches.add(createKeywordtHit(highlightResponse, docId)); matches.add(createKeywordtHit(highlightResponse, docId));
} }
} }
}
} catch (TskException ex) { } catch (TskException ex) {
return matches; return matches;
} }

View File

@ -54,16 +54,16 @@ import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskException; import org.sleuthkit.datamodel.TskException;
/** /**
* The RegexQuery class supports issuing regular expression queries * The RegexQuery class supports issuing regular expression queries against a
* against a Lucene index. It relies on the fact that content is * Lucene index. It relies on the fact that content is stored in its original
* stored in its original form in a "string" field (Server.Schema.CONTENT_STR). * form in a "string" field (Server.Schema.CONTENT_STR). To indicate to Lucene
* To indicate to Lucene that these are regular expression queries, the query * that these are regular expression queries, the query string must be
* string must be surrounded by '/' characters. Additionally, the characters * surrounded by '/' characters. Additionally, the characters ".*" need to be
* ".*" need to be added both before and after the search term to get hits * added both before and after the search term to get hits in the middle of
* in the middle of text. * text.
* *
* Regular expression syntax supported by Lucene is not the same as Java * Regular expression syntax supported by Lucene is not the same as Java regular
* regular expression syntax. The Lucene syntax is documented here: * expression syntax. The Lucene syntax is documented here:
* *
* https://lucene.apache.org/core/5_0_0/core/org/apache/lucene/util/automaton/RegExp.html * https://lucene.apache.org/core/5_0_0/core/org/apache/lucene/util/automaton/RegExp.html
*/ */
@ -160,19 +160,18 @@ final class RegexQuery implements KeywordSearchQuery {
/** /**
* The provided regular expression may include wildcards at the * The provided regular expression may include wildcards at the
* beginning and/or end. These wildcards are used to indicate that * beginning and/or end. These wildcards are used to indicate that the
* the user wants to find hits for the regex that are embedded * user wants to find hits for the regex that are embedded within other
* within other characters. For example, if we are given .*127.0.0.1.* * characters. For example, if we are given .*127.0.0.1.* as a regular
* as a regular expression, this will produce hits for: * expression, this will produce hits for: (a) " 127.0.0.1 " as a
* (a) " 127.0.0.1 " as a standalone token (surrounded by whitespace). * standalone token (surrounded by whitespace). (b) "abc127.0.0.1def"
* (b) "abc127.0.0.1def" where the IP address is surrounded by other characters. * where the IP address is surrounded by other characters.
* *
* If we are given this type of regex, we do not need to add our own * If we are given this type of regex, we do not need to add our own
* wildcards to anchor the query. Otherwise, we need to add wildcard * wildcards to anchor the query. Otherwise, we need to add wildcard
* anchors because Lucene string regex searches default to using ^ and $ * anchors because Lucene string regex searches default to using ^ and $
* to match the entire string. * to match the entire string.
*/ */
// We construct the query by surrounding it with slashes (to indicate it is // We construct the query by surrounding it with slashes (to indicate it is
// a regular expression search) and .* as anchors (if the query doesn't // a regular expression search) and .* as anchors (if the query doesn't
// already have them). // already have them).
@ -239,7 +238,7 @@ final class RegexQuery implements KeywordSearchQuery {
final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString(); final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();
final Integer chunkSize = (Integer) solrDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString()); final Integer chunkSize = (Integer) solrDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
String content = solrDoc.getOrDefault(Server.Schema.CONTENT_STR.toString(), "").toString(); //NON-NLS ArrayList<String> content_str = (ArrayList<String>) solrDoc.get(Server.Schema.CONTENT_STR.toString());
// By default, we create keyword hits on whitespace or punctuation character boundaries. // By default, we create keyword hits on whitespace or punctuation character boundaries.
// Having a set of well defined boundary characters produces hits that can // Having a set of well defined boundary characters produces hits that can
@ -248,8 +247,8 @@ final class RegexQuery implements KeywordSearchQuery {
// This behavior can be overridden by the user if they give us a search string // This behavior can be overridden by the user if they give us a search string
// with .* at either the start and/or end of the string. This basically tells us to find // with .* at either the start and/or end of the string. This basically tells us to find
// all hits instead of the ones surrounded by one of our boundary characters. // all hits instead of the ones surrounded by one of our boundary characters.
String keywordTokenRegex = String keywordTokenRegex
// If the given search string starts with .*, we ignore our default = // If the given search string starts with .*, we ignore our default
// boundary prefix characters // boundary prefix characters
(queryStringContainsWildcardPrefix ? "" : BOUNDARY_CHARS) //NON-NLS (queryStringContainsWildcardPrefix ? "" : BOUNDARY_CHARS) //NON-NLS
+ keywordString + keywordString
@ -257,6 +256,7 @@ final class RegexQuery implements KeywordSearchQuery {
// boundary suffix characters // boundary suffix characters
+ (queryStringContainsWildcardSuffix ? "" : BOUNDARY_CHARS); //NON-NLS + (queryStringContainsWildcardSuffix ? "" : BOUNDARY_CHARS); //NON-NLS
for (String content : content_str) {
Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content); Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content);
int offset = 0; int offset = 0;
@ -301,8 +301,8 @@ final class RegexQuery implements KeywordSearchQuery {
} }
/** /**
* Get the snippet from the document if keyword search is configured * Get the snippet from the document if keyword search is
* to use snippets. * configured to use snippets.
*/ */
if (KeywordSearchSettings.getShowSnippets()) { if (KeywordSearchSettings.getShowSnippets()) {
int maxIndex = content.length() - 1; int maxIndex = content.length() - 1;
@ -315,6 +315,7 @@ final class RegexQuery implements KeywordSearchQuery {
hits.add(new KeywordHit(docId, snippet.toString(), hit)); hits.add(new KeywordHit(docId, snippet.toString(), hit));
} }
}
return hits; return hits;
} }