mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-19 11:07:43 +00:00
Modified creation of regex keyword hits to break on a whitespace or punctuation boundary to support consistent highlighting. Also added HighlightedText.attemptManualHighlighting() for those situations where the Lucene highlighter doesn't give us useful results.
This commit is contained in:
parent
4b80395b9d
commit
0e925e6823
@ -61,7 +61,7 @@ class AccountsText implements IndexedText {
|
|||||||
private static final String INSERT_POSTFIX = "'></a>$0"; //$0 will insert current regex match //NON-NLS
|
private static final String INSERT_POSTFIX = "'></a>$0"; //$0 will insert current regex match //NON-NLS
|
||||||
private static final Pattern ANCHOR_DETECTION_PATTERN = Pattern.compile(HIGHLIGHT_PRE);
|
private static final Pattern ANCHOR_DETECTION_PATTERN = Pattern.compile(HIGHLIGHT_PRE);
|
||||||
|
|
||||||
private static final String HIGHLIGHT_FIELD = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
|
private static final String HIGHLIGHT_FIELD = LuceneQuery.HIGHLIGHT_FIELD;
|
||||||
|
|
||||||
private final Server solrServer;
|
private final Server solrServer;
|
||||||
private final String solrDocumentId;
|
private final String solrDocumentId;
|
||||||
|
@ -25,12 +25,15 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
|
import org.apache.commons.lang.StringEscapeUtils;
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
import org.openide.util.NbBundle;
|
import org.openide.util.NbBundle;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
import org.apache.solr.client.solrj.SolrQuery;
|
import org.apache.solr.client.solrj.SolrQuery;
|
||||||
import org.apache.solr.client.solrj.SolrRequest.METHOD;
|
import org.apache.solr.client.solrj.SolrRequest.METHOD;
|
||||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||||
|
import org.apache.solr.common.SolrDocumentList;
|
||||||
import org.openide.util.NbBundle.Messages;
|
import org.openide.util.NbBundle.Messages;
|
||||||
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
|
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
|
||||||
import org.sleuthkit.autopsy.coreutils.Version;
|
import org.sleuthkit.autopsy.coreutils.Version;
|
||||||
@ -82,9 +85,23 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//when the results are not known and need to requery to get hits
|
/**
|
||||||
HighlightedText(long objectId, String solrQuery, boolean isRegex, String originalQuery) {
|
* This constructor is used when keyword hits are accessed from the
|
||||||
this(objectId, KeywordSearchUtil.quoteQuery(solrQuery), isRegex);
|
* "Keyword Hits" node in the directory tree in Autopsy.
|
||||||
|
* In that case we only have the keyword for which a hit had
|
||||||
|
* previously been found so we will need to re-query to find hits
|
||||||
|
* for the keyword.
|
||||||
|
*
|
||||||
|
* @param objectId
|
||||||
|
* @param keyword The keyword that was found previously (e.g. during ingest)
|
||||||
|
* @param isRegex true if the keyword was found via a regular expression search
|
||||||
|
* @param originalQuery The original query string that produced the hit. If
|
||||||
|
* isRegex is true, this will be the regular expression that produced the hit.
|
||||||
|
*/
|
||||||
|
HighlightedText(long objectId, String keyword, boolean isRegex, String originalQuery) {
|
||||||
|
// The keyword can be treated as a literal hit at this point so we
|
||||||
|
// surround it in quotes.
|
||||||
|
this(objectId, KeywordSearchUtil.quoteQuery(keyword), isRegex);
|
||||||
this.originalQuery = originalQuery;
|
this.originalQuery = originalQuery;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -123,25 +140,15 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
|
|||||||
hasChunks = true;
|
hasChunks = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
//if has chunks, get pages with hits
|
// if the file has chunks, get pages with hits, sorted
|
||||||
if (hasChunks) {
|
if (hasChunks) {
|
||||||
//extract pages of interest, sorted
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If this is being called from the artifacts / dir tree, then we
|
* If this is being called from the artifacts / dir tree, then we
|
||||||
* need to perform the search to get the highlights.
|
* need to perform the search to get the highlights.
|
||||||
*/
|
*/
|
||||||
if (hits == null) {
|
if (hits == null) {
|
||||||
// I don't undertand how we could get into this code with a regex query.
|
Keyword keywordQuery = new Keyword(keywordHitQuery, !isRegex);
|
||||||
// Won't all regex queries have been resolved to actual literal keyword hits
|
|
||||||
// by the time we attempt to load page content? EGS.
|
|
||||||
// String queryStr = KeywordSearchUtil.escapeLuceneQuery(this.keywordHitQuery);
|
|
||||||
// if (isRegex) {
|
|
||||||
// //use white-space sep. field to get exact matches only of regex query result
|
|
||||||
// queryStr = Server.Schema.CONTENT_WS + ":" + "\"" + queryStr + "\"";
|
|
||||||
// }
|
|
||||||
|
|
||||||
Keyword keywordQuery = new Keyword(this.keywordHitQuery, !isRegex);
|
|
||||||
List<Keyword> keywords = new ArrayList<>();
|
List<Keyword> keywords = new ArrayList<>();
|
||||||
keywords.add(keywordQuery);
|
keywords.add(keywordQuery);
|
||||||
KeywordSearchQuery chunksQuery = new LuceneQuery(new KeywordList(keywords), keywordQuery);
|
KeywordSearchQuery chunksQuery = new LuceneQuery(new KeywordList(keywords), keywordQuery);
|
||||||
@ -303,14 +310,6 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
|
|||||||
public String getText() {
|
public String getText() {
|
||||||
loadPageInfo(); //inits once
|
loadPageInfo(); //inits once
|
||||||
|
|
||||||
String highLightField = null;
|
|
||||||
|
|
||||||
if (isRegex) {
|
|
||||||
highLightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
|
|
||||||
} else {
|
|
||||||
highLightField = LuceneQuery.HIGHLIGHT_FIELD_LITERAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
SolrQuery q = new SolrQuery();
|
SolrQuery q = new SolrQuery();
|
||||||
q.setShowDebugInfo(DEBUG); //debug
|
q.setShowDebugInfo(DEBUG); //debug
|
||||||
|
|
||||||
@ -324,10 +323,8 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
|
|||||||
|
|
||||||
final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
|
final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
|
||||||
q.addFilterQuery(filterQuery);
|
q.addFilterQuery(filterQuery);
|
||||||
q.addHighlightField(highLightField); //for exact highlighting, try content_ws field (with stored="true" in Solr schema)
|
q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD);
|
||||||
|
|
||||||
//q.setHighlightSimplePre(HIGHLIGHT_PRE); //original highlighter only
|
|
||||||
//q.setHighlightSimplePost(HIGHLIGHT_POST); //original highlighter only
|
|
||||||
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
|
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
|
||||||
|
|
||||||
//tune the highlighter
|
//tune the highlighter
|
||||||
@ -341,22 +338,33 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
QueryResponse response = solrServer.query(q, METHOD.POST);
|
QueryResponse response = solrServer.query(q, METHOD.POST);
|
||||||
|
|
||||||
|
// There should never be more than one document since there will
|
||||||
|
// either be a single chunk containing hits or we narrow our
|
||||||
|
// query down to the current page/chunk.
|
||||||
|
if (response.getResults().size() > 1) {
|
||||||
|
logger.log(Level.WARNING, "Unexpected number of results for Solr highlighting query: {0}", keywordHitQuery); //NON-NLS
|
||||||
|
}
|
||||||
|
|
||||||
Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
|
Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
|
||||||
|
|
||||||
Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);
|
Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);
|
||||||
if (responseHighlightID == null) {
|
String highlightedContent;
|
||||||
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
|
|
||||||
}
|
|
||||||
List<String> contentHighlights = responseHighlightID.get(highLightField);
|
|
||||||
if (contentHighlights == null) {
|
|
||||||
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
|
|
||||||
} else {
|
|
||||||
// extracted content (minus highlight tags) is HTML-escaped
|
|
||||||
String highlightedContent = contentHighlights.get(0).trim();
|
|
||||||
highlightedContent = insertAnchors(highlightedContent);
|
|
||||||
|
|
||||||
return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
|
if (responseHighlightID == null) {
|
||||||
|
highlightedContent = attemptManualHighlighting(response.getResults());
|
||||||
|
} else {
|
||||||
|
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
|
||||||
|
if (contentHighlights == null) {
|
||||||
|
highlightedContent = attemptManualHighlighting(response.getResults());
|
||||||
|
} else {
|
||||||
|
// extracted content (minus highlight tags) is HTML-escaped
|
||||||
|
highlightedContent = contentHighlights.get(0).trim();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
highlightedContent = insertAnchors(highlightedContent);
|
||||||
|
|
||||||
|
return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
logger.log(Level.WARNING, "Error executing Solr highlighting query: " + keywordHitQuery, ex); //NON-NLS
|
logger.log(Level.WARNING, "Error executing Solr highlighting query: " + keywordHitQuery, ex); //NON-NLS
|
||||||
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.queryFailedMsg");
|
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.queryFailedMsg");
|
||||||
@ -386,6 +394,73 @@ class HighlightedText implements IndexedText, TextMarkupLookup {
|
|||||||
return this.hitsPages.get(this.currentPage);
|
return this.hitsPages.get(this.currentPage);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If the Solr query does not produce valid highlighting, we attempt to
|
||||||
|
* add the highlighting ourselves. We do this by taking the text returned
|
||||||
|
* from the document that contains a hit and searching that text for the
|
||||||
|
* keyword that produced the hit.
|
||||||
|
*
|
||||||
|
* @param solrDocumentList The list of Solr documents returned in response
|
||||||
|
* to a Solr query. We expect there to only ever be a single document.
|
||||||
|
*
|
||||||
|
* @return Either a string with the keyword highlighted or a string
|
||||||
|
* indicating that we did not find a hit in the document.
|
||||||
|
*/
|
||||||
|
private String attemptManualHighlighting(SolrDocumentList solrDocumentList) {
|
||||||
|
if (solrDocumentList.isEmpty()) {
|
||||||
|
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
|
||||||
|
}
|
||||||
|
|
||||||
|
// It doesn't make sense for there to be more than a single document in
|
||||||
|
// the list since this class presents a single page (document) of highlighted
|
||||||
|
// content at a time.
|
||||||
|
String text = solrDocumentList.get(0).getOrDefault(Server.Schema.TEXT.toString(), "").toString();
|
||||||
|
|
||||||
|
// Escape any HTML content that may be in the text. This is needed in
|
||||||
|
// order to correctly display the text in the content viewer.
|
||||||
|
// Must be done before highlighting tags are added. If we were to
|
||||||
|
// perform HTML escaping after adding the highlighting tags we would
|
||||||
|
// not see highlighted text in the content viewer.
|
||||||
|
text = StringEscapeUtils.escapeHtml(text);
|
||||||
|
|
||||||
|
StringBuilder highlightedText = new StringBuilder("");
|
||||||
|
|
||||||
|
int textOffset = 0;
|
||||||
|
// Remove quotes from around the keyword.
|
||||||
|
String unquotedKeyword = StringUtils.strip(keywordHitQuery, "\"");
|
||||||
|
// Find the first (if any) hit.
|
||||||
|
int hitOffset = text.indexOf(unquotedKeyword, textOffset);
|
||||||
|
|
||||||
|
while (hitOffset != -1) {
|
||||||
|
// Append the portion of text up to (but not including) the hit.
|
||||||
|
highlightedText.append(text.substring(textOffset, hitOffset));
|
||||||
|
// Add in the highlighting around the keyword.
|
||||||
|
highlightedText.append(HIGHLIGHT_PRE);
|
||||||
|
highlightedText.append(unquotedKeyword);
|
||||||
|
highlightedText.append(HIGHLIGHT_POST);
|
||||||
|
|
||||||
|
// Advance the text offset past the keyword.
|
||||||
|
textOffset = hitOffset + unquotedKeyword.length() + 1;
|
||||||
|
// Search for the next keyword hit in the text.
|
||||||
|
hitOffset = text.indexOf(unquotedKeyword, textOffset);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (highlightedText.length() > 0) {
|
||||||
|
// Append the remainder of text field and return.
|
||||||
|
highlightedText.append(text.substring(textOffset, text.length()));
|
||||||
|
return highlightedText.toString();
|
||||||
|
} else {
|
||||||
|
return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Anchors are used to navigate back and forth between hits on the same
|
||||||
|
* page and to navigate to hits on the next/previous page.
|
||||||
|
*
|
||||||
|
* @param searchableContent
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
private String insertAnchors(String searchableContent) {
|
private String insertAnchors(String searchableContent) {
|
||||||
int searchOffset = 0;
|
int searchOffset = 0;
|
||||||
int index = -1;
|
int index = -1;
|
||||||
|
@ -298,13 +298,7 @@ class KeywordSearchResultFactory extends ChildFactory<KeyValueQueryContent> {
|
|||||||
*/
|
*/
|
||||||
private String constructEscapedSolrQuery(String query, boolean literal_query) {
|
private String constructEscapedSolrQuery(String query, boolean literal_query) {
|
||||||
StringBuilder highlightQuery = new StringBuilder();
|
StringBuilder highlightQuery = new StringBuilder();
|
||||||
String highLightField;
|
highlightQuery.append(LuceneQuery.HIGHLIGHT_FIELD).append(":").append("\"").append(KeywordSearchUtil.escapeLuceneQuery(query)).append("\"");
|
||||||
if (literal_query) {
|
|
||||||
highLightField = LuceneQuery.HIGHLIGHT_FIELD_LITERAL;
|
|
||||||
} else {
|
|
||||||
highLightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
|
|
||||||
}
|
|
||||||
highlightQuery.append(highLightField).append(":").append("\"").append(KeywordSearchUtil.escapeLuceneQuery(query)).append("\"");
|
|
||||||
return highlightQuery.toString();
|
return highlightQuery.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,11 +59,7 @@ class LuceneQuery implements KeywordSearchQuery {
|
|||||||
private String field = null;
|
private String field = null;
|
||||||
private static final int MAX_RESULTS = 20000;
|
private static final int MAX_RESULTS = 20000;
|
||||||
static final int SNIPPET_LENGTH = 50;
|
static final int SNIPPET_LENGTH = 50;
|
||||||
//can use different highlight schema fields for regex and literal search
|
static final String HIGHLIGHT_FIELD = Server.Schema.TEXT.toString();
|
||||||
static final String HIGHLIGHT_FIELD_LITERAL = Server.Schema.TEXT.toString();
|
|
||||||
static final String HIGHLIGHT_FIELD_REGEX = Server.Schema.TEXT.toString();
|
|
||||||
//TODO use content_ws stored="true" in solr schema for perfect highlight hits
|
|
||||||
//static final String HIGHLIGHT_FIELD_REGEX = Server.Schema.CONTENT_WS.toString()
|
|
||||||
|
|
||||||
private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
|
private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
|
||||||
|
|
||||||
@ -250,13 +246,15 @@ class LuceneQuery implements KeywordSearchQuery {
|
|||||||
private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
|
private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
|
||||||
SolrQuery q = new SolrQuery();
|
SolrQuery q = new SolrQuery();
|
||||||
q.setShowDebugInfo(DEBUG); //debug
|
q.setShowDebugInfo(DEBUG); //debug
|
||||||
//set query, force quotes/grouping around all literal queries
|
// Wrap the query string in quotes if this is a literal search term.
|
||||||
final String groupedQuery = KeywordSearchUtil.quoteQuery(keywordStringEscaped);
|
String theQueryStr = keyword.searchTermIsLiteral()
|
||||||
String theQueryStr = groupedQuery;
|
? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped;
|
||||||
|
|
||||||
|
// Run the query against an optional alternative field.
|
||||||
if (field != null) {
|
if (field != null) {
|
||||||
//use the optional field
|
//use the optional field
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
sb.append(field).append(":").append(groupedQuery);
|
sb.append(field).append(":").append(theQueryStr);
|
||||||
theQueryStr = sb.toString();
|
theQueryStr = sb.toString();
|
||||||
}
|
}
|
||||||
q.setQuery(theQueryStr);
|
q.setQuery(theQueryStr);
|
||||||
@ -345,20 +343,13 @@ class LuceneQuery implements KeywordSearchQuery {
|
|||||||
public static String querySnippet(String query, long solrObjectId, int chunkID, boolean isRegex, boolean group) throws NoOpenCoreException {
|
public static String querySnippet(String query, long solrObjectId, int chunkID, boolean isRegex, boolean group) throws NoOpenCoreException {
|
||||||
Server solrServer = KeywordSearch.getServer();
|
Server solrServer = KeywordSearch.getServer();
|
||||||
|
|
||||||
String highlightField;
|
|
||||||
if (isRegex) {
|
|
||||||
highlightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
|
|
||||||
} else {
|
|
||||||
highlightField = LuceneQuery.HIGHLIGHT_FIELD_LITERAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
SolrQuery q = new SolrQuery();
|
SolrQuery q = new SolrQuery();
|
||||||
|
|
||||||
String queryStr;
|
String queryStr;
|
||||||
|
|
||||||
if (isRegex) {
|
if (isRegex) {
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
sb.append(highlightField).append(":");
|
sb.append(LuceneQuery.HIGHLIGHT_FIELD).append(":");
|
||||||
if (group) {
|
if (group) {
|
||||||
sb.append("\"");
|
sb.append("\"");
|
||||||
}
|
}
|
||||||
@ -387,7 +378,7 @@ class LuceneQuery implements KeywordSearchQuery {
|
|||||||
String idQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIDStr);
|
String idQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIDStr);
|
||||||
q.setShowDebugInfo(DEBUG); //debug
|
q.setShowDebugInfo(DEBUG); //debug
|
||||||
q.addFilterQuery(idQuery);
|
q.addFilterQuery(idQuery);
|
||||||
q.addHighlightField(highlightField);
|
q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD);
|
||||||
//q.setHighlightSimplePre("«"); //original highlighter only
|
//q.setHighlightSimplePre("«"); //original highlighter only
|
||||||
//q.setHighlightSimplePost("»"); //original highlighter only
|
//q.setHighlightSimplePost("»"); //original highlighter only
|
||||||
q.setHighlightSnippets(1);
|
q.setHighlightSnippets(1);
|
||||||
@ -413,7 +404,7 @@ class LuceneQuery implements KeywordSearchQuery {
|
|||||||
if (responseHighlightID == null) {
|
if (responseHighlightID == null) {
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
List<String> contentHighlights = responseHighlightID.get(highlightField);
|
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
|
||||||
if (contentHighlights == null) {
|
if (contentHighlights == null) {
|
||||||
return "";
|
return "";
|
||||||
} else {
|
} else {
|
||||||
|
@ -78,6 +78,19 @@ final class RegexQuery implements KeywordSearchQuery {
|
|||||||
private boolean escaped;
|
private boolean escaped;
|
||||||
private String escapedQuery;
|
private String escapedQuery;
|
||||||
|
|
||||||
|
// These are the valid characters that can appear immediately before a
|
||||||
|
// keyword hit. e.g. for an IP address regex we support finding the string
|
||||||
|
// ",10.0.0.0" but not "?10.0.0.0".
|
||||||
|
private static final String BOUNDARY_PREFIX_CHARS = "(\\s|\\[|\\(|,|\\:)"; //NON-NLS
|
||||||
|
|
||||||
|
// These are the valid characters that can appear immediately after a
|
||||||
|
// keyword hit. e.g. for an IP address regex we support finding the string
|
||||||
|
// "10.0.0.0?]" but not "10.0.0.0&".
|
||||||
|
private static final String BOUNDARY_SUFFIX_CHARS = "(\\s|\\]|\\)|,|!|\\?|\\:)"; //NON-NLS
|
||||||
|
|
||||||
|
private boolean queryStringContainsWildcardPrefix = false;
|
||||||
|
private boolean queryStringContainsWildcardSuffix = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor with query to process.
|
* Constructor with query to process.
|
||||||
*
|
*
|
||||||
@ -88,6 +101,14 @@ final class RegexQuery implements KeywordSearchQuery {
|
|||||||
this.keywordList = keywordList;
|
this.keywordList = keywordList;
|
||||||
this.keyword = keyword;
|
this.keyword = keyword;
|
||||||
this.keywordString = keyword.getSearchTerm();
|
this.keywordString = keyword.getSearchTerm();
|
||||||
|
|
||||||
|
if (this.keywordString.startsWith(".*")) {
|
||||||
|
this.queryStringContainsWildcardPrefix = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.keywordString.endsWith(".*")) {
|
||||||
|
this.queryStringContainsWildcardSuffix = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -120,7 +141,28 @@ final class RegexQuery implements KeywordSearchQuery {
|
|||||||
SolrQuery solrQuery = new SolrQuery();
|
SolrQuery solrQuery = new SolrQuery();
|
||||||
solrQuery.setShowDebugInfo(true); //debug
|
solrQuery.setShowDebugInfo(true); //debug
|
||||||
|
|
||||||
solrQuery.setQuery((field == null ? Server.Schema.CONTENT_STR.toString() : field) + ":/.*" + getQueryString() + ".*/");
|
/**
|
||||||
|
* The provided regular expression may include wildcards at the
|
||||||
|
* beginning and/or end. These wildcards are used to indicate that
|
||||||
|
* the user wants to find hits for the regex that are embedded
|
||||||
|
* within other characters. For example, if we are given .*127.0.0.1.*
|
||||||
|
* as a regular expression, this will produce hits for:
|
||||||
|
* (a) " 127.0.0.1 " as a standalone token (surrounded by whitespace).
|
||||||
|
* (b) "abc127.0.0.1def" where the IP address is surrounded by other characters.
|
||||||
|
*
|
||||||
|
* If we are given this type of regex, we do not need to add our own
|
||||||
|
* wildcards to anchor the query. Otherwise, we need to add wildcard
|
||||||
|
* anchors because Lucene string regex searches default to using ^ and $
|
||||||
|
* to match the entire string.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// We construct the query by surrounding it with slashes (to indicate it is
|
||||||
|
// a regular expression search) and .* as anchors (if the query doesn't
|
||||||
|
// already have them).
|
||||||
|
solrQuery.setQuery((field == null ? Server.Schema.CONTENT_STR.toString() : field) + ":/"
|
||||||
|
+ (queryStringContainsWildcardPrefix ? "" : ".*") + getQueryString()
|
||||||
|
+ (queryStringContainsWildcardSuffix ? "" : ".*") + "/");
|
||||||
|
|
||||||
solrQuery.setRows(MAX_RESULTS);
|
solrQuery.setRows(MAX_RESULTS);
|
||||||
|
|
||||||
// Set the fields we want to have returned by the query.
|
// Set the fields we want to have returned by the query.
|
||||||
@ -173,22 +215,50 @@ final class RegexQuery implements KeywordSearchQuery {
|
|||||||
List<KeywordHit> hits = new ArrayList<>();
|
List<KeywordHit> hits = new ArrayList<>();
|
||||||
final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();
|
final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();
|
||||||
|
|
||||||
String content = solrDoc.getOrDefault(Server.Schema.CONTENT_STR.toString(), "").toString();
|
String content = solrDoc.getOrDefault(Server.Schema.CONTENT_STR.toString(), "").toString(); //NON-NLS
|
||||||
|
|
||||||
Matcher hitMatcher = Pattern.compile(keywordString).matcher(content);
|
// By default, we create keyword hits on whitespace or punctuation character boundaries.
|
||||||
|
// Having a set of well defined boundary characters produces hits that can
|
||||||
|
// subsequently be matched for highlighting against the tokens produced by
|
||||||
|
// the standard tokenizer.
|
||||||
|
// This behavior can be overridden by the user if they give us a search string
|
||||||
|
// with .* at either the start and/or end of the string. This basically tells us find
|
||||||
|
// all hits instead of the ones surrounded by one of our boundary characters.
|
||||||
|
String keywordTokenRegex =
|
||||||
|
// If the given search string starts with .*, we ignore our default
|
||||||
|
// boundary prefix characters
|
||||||
|
(queryStringContainsWildcardPrefix ? "" : BOUNDARY_PREFIX_CHARS) //NON-NLS
|
||||||
|
+ keywordString
|
||||||
|
// If the given search string ends with .*, we ignore our default
|
||||||
|
// boundary suffix characters
|
||||||
|
+ (queryStringContainsWildcardSuffix ? "" : BOUNDARY_SUFFIX_CHARS); //NON-NLS
|
||||||
|
|
||||||
|
Matcher hitMatcher = Pattern.compile(keywordTokenRegex).matcher(content);
|
||||||
|
|
||||||
while (hitMatcher.find()) {
|
while (hitMatcher.find()) {
|
||||||
String snippet = "";
|
StringBuilder snippet = new StringBuilder();
|
||||||
final String hit = hitMatcher.group();
|
String hit = hitMatcher.group();
|
||||||
|
|
||||||
|
// Remove leading and trailing boundary characters.
|
||||||
|
if (!queryStringContainsWildcardPrefix) {
|
||||||
|
hit = hit.replaceAll("^" + BOUNDARY_PREFIX_CHARS, ""); //NON-NLS
|
||||||
|
}
|
||||||
|
if (!queryStringContainsWildcardSuffix) {
|
||||||
|
hit = hit.replaceAll(BOUNDARY_SUFFIX_CHARS + "$", ""); //NON-NLS
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If searching for credit card account numbers, do a Luhn check
|
* If searching for credit card account numbers, do a Luhn check
|
||||||
* on the term and discard it if it does not pass.
|
* on the term and discard it if it does not pass.
|
||||||
*/
|
*/
|
||||||
if (keyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
|
if (keyword.getArtifactAttributeType() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
|
||||||
Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit);
|
Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit);
|
||||||
ccnMatcher.find();
|
if (ccnMatcher.find()) {
|
||||||
final String ccn = CharMatcher.anyOf(" -").removeFrom(ccnMatcher.group("ccn"));
|
final String ccn = CharMatcher.anyOf(" -").removeFrom(ccnMatcher.group("ccn"));
|
||||||
if (false == TermsComponentQuery.CREDIT_CARD_NUM_LUHN_CHECK.isValid(ccn)) {
|
if (false == TermsComponentQuery.CREDIT_CARD_NUM_LUHN_CHECK.isValid(ccn)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -199,12 +269,14 @@ final class RegexQuery implements KeywordSearchQuery {
|
|||||||
*/
|
*/
|
||||||
if (KeywordSearchSettings.getShowSnippets()) {
|
if (KeywordSearchSettings.getShowSnippets()) {
|
||||||
int maxIndex = content.length() - 1;
|
int maxIndex = content.length() - 1;
|
||||||
snippet = content.substring(Integer.max(0, hitMatcher.start() - 30), Integer.max(0, hitMatcher.start() - 1));
|
snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 30), Integer.max(0, hitMatcher.start() - 1)));
|
||||||
snippet += "<<" + hit + "<<";
|
snippet.appendCodePoint(171);
|
||||||
snippet += content.substring(Integer.min(maxIndex, hitMatcher.end() + 1), Integer.min(maxIndex, hitMatcher.end() + 30));
|
snippet.append(hit);
|
||||||
|
snippet.appendCodePoint(171);
|
||||||
|
snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end() + 1), Integer.min(maxIndex, hitMatcher.end() + 30)));
|
||||||
}
|
}
|
||||||
|
|
||||||
hits.add(new KeywordHit(docId, snippet, hit));
|
hits.add(new KeywordHit(docId, snippet.toString(), hit));
|
||||||
}
|
}
|
||||||
return hits;
|
return hits;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user