Increase query accuracy and reduce the query postprocessing needed (at least for case-insensitive queries) by querying only the ws delim Solr field in the aggregate query to get file ids

This commit is contained in:
adam-m 2012-01-10 17:12:29 -05:00
parent 6a6753868e
commit 52b29a2892
5 changed files with 113 additions and 86 deletions

View File

@ -57,11 +57,11 @@ class HighlightedMatchesSource implements MarkupSource {
public String getMarkup() { public String getMarkup() {
SolrQuery q = new SolrQuery(); SolrQuery q = new SolrQuery();
final String queryEscaped = KeywordSearchUtil.escapeLuceneQuery(solrQuery, true); final String queryEscaped = KeywordSearchUtil.escapeLuceneQuery(solrQuery, true, false);
q.setQuery(queryEscaped); q.setQuery(queryEscaped);
q.addFilterQuery("id:" + content.getId()); q.addFilterQuery("id:" + content.getId());
q.addHighlightField("content"); q.addHighlightField("content"); //for exact highlighting, try content_ws field (with stored="true" in Solr schema)
q.setHighlightSimplePre(HIGHLIGHT_PRE); q.setHighlightSimplePre(HIGHLIGHT_PRE);
q.setHighlightSimplePost(HIGHLIGHT_POST); q.setHighlightSimplePost(HIGHLIGHT_POST);
q.setHighlightFragsize(0); // don't fragment the highlight q.setHighlightFragsize(0); // don't fragment the highlight

View File

@ -70,8 +70,7 @@ public class KeywordSearchResultFactory extends ChildFactory<KeyValueThing> {
public String toString() { public String toString() {
return "Match"; return "Match";
} }
}, },}
}
private Presentation presentation; private Presentation presentation;
private Collection<String> queries; private Collection<String> queries;
private Collection<KeyValueThing> things; private Collection<KeyValueThing> things;
@ -149,6 +148,7 @@ public class KeywordSearchResultFactory extends ChildFactory<KeyValueThing> {
childFactory = new ResultCollapsedChildFactory(thing); childFactory = new ResultCollapsedChildFactory(thing);
final Node ret = new KeyValueNode(thing, Children.create(childFactory, true)); final Node ret = new KeyValueNode(thing, Children.create(childFactory, true));
SwingUtilities.invokeLater(new Runnable() { SwingUtilities.invokeLater(new Runnable() {
@Override @Override
public void run() { public void run() {
//DataResultViewerTable view = Utilities.actionsGlobalContext().lookup(DataResultViewerTable.class); //DataResultViewerTable view = Utilities.actionsGlobalContext().lookup(DataResultViewerTable.class);
@ -199,11 +199,11 @@ public class KeywordSearchResultFactory extends ChildFactory<KeyValueThing> {
final int lastTerm = terms.size() - 1; final int lastTerm = terms.size() - 1;
int curTerm = 0; int curTerm = 0;
for (Term term : terms) { for (Term term : terms) {
final String termS = KeywordSearchUtil.escapeLuceneQuery(term.getTerm(), true); final String termS = KeywordSearchUtil.escapeLuceneQuery(term.getTerm(), true, false);
if (!termS.contains("*")) { if (!termS.contains("*")) {
highlightQuery.append(termS); highlightQuery.append(termS);
if (lastTerm != curTerm) { if (lastTerm != curTerm) {
highlightQuery.append(" "); highlightQuery.append(" "); //acts as OR ||
} }
} }
} }
@ -304,25 +304,27 @@ public class KeywordSearchResultFactory extends ChildFactory<KeyValueThing> {
final String contentStr = KeywordSearch.getServer().getCore().getSolrContent(content); final String contentStr = KeywordSearch.getServer().getCore().getSolrContent(content);
//make sure the file contains a match (this gets rid of large number of false positives) //postprocess
//TODO option in GUI to include approximate matches (faster) //make sure Solr result contains a match (this gets rid of large number of false positives)
boolean matchFound = false; boolean postprocess = true;
boolean matchFound = true;
if (postprocess) {
if (contentStr != null) {//if not null, some error getting from Solr, handle it by not filtering out if (contentStr != null) {//if not null, some error getting from Solr, handle it by not filtering out
//perform java regex to validate match from Solr //perform java regex to validate match from Solr
String origQuery = thingContent.getQuery(); String origQuery = thingContent.getQuery();
//escape the regex query because it may contain special characters from the previous match //since query is a match result, we can assume literal pattern
//since it's a match result, we can assume literal pattern
origQuery = Pattern.quote(origQuery); origQuery = Pattern.quote(origQuery);
Pattern p = Pattern.compile(origQuery, Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Pattern p = Pattern.compile(origQuery, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher m = p.matcher(contentStr); Matcher m = p.matcher(contentStr);
matchFound = m.find(); matchFound = m.find();
} }
}
if (matchFound) { if (matchFound) {
Node kvNode = new KeyValueNode(thingContent, Children.LEAF); Node kvNode = new KeyValueNode(thingContent, Children.LEAF);
//wrap in KeywordSearchFilterNode for the markup content, might need to override FilterNode for more customization //wrap in KeywordSearchFilterNode for the markup content
HighlightedMatchesSource highlights = new HighlightedMatchesSource(content, query); HighlightedMatchesSource highlights = new HighlightedMatchesSource(content, query);
return new KeywordSearchFilterNode(highlights, kvNode, query); return new KeywordSearchFilterNode(highlights, kvNode, query);
} else { } else {

View File

@ -16,7 +16,6 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.keywordsearch; package org.sleuthkit.autopsy.keywordsearch;
import java.awt.Component; import java.awt.Component;
@ -31,8 +30,10 @@ import org.sleuthkit.datamodel.TskException;
public class KeywordSearchUtil { public class KeywordSearchUtil {
public enum DIALOG_MESSAGE_TYPE {ERROR, WARN, INFO}; public enum DIALOG_MESSAGE_TYPE {
ERROR, WARN, INFO
};
private static final Logger logger = Logger.getLogger(KeywordSearchUtil.class.getName()); private static final Logger logger = Logger.getLogger(KeywordSearchUtil.class.getName());
public static String buildDirName(FsContent f) { public static String buildDirName(FsContent f) {
@ -65,7 +66,7 @@ public class KeywordSearchUtil {
* such as /+-&|!(){}[]^"~*?:\ and treat the whole query as literal word * such as /+-&|!(){}[]^"~*?:\ and treat the whole query as literal word
* @return encoded query * @return encoded query
*/ */
public static String escapeLuceneQuery(String query, boolean escapeLuceneChars) { public static String escapeLuceneQuery(String query, boolean escapeLuceneChars, boolean encode) {
String queryEscaped = null; String queryEscaped = null;
String inputString = query; String inputString = query;
@ -79,27 +80,29 @@ public class KeywordSearchUtil {
} }
sb.append(c); sb.append(c);
} }
inputString = sb.toString(); queryEscaped = inputString = sb.toString();
} }
if (encode) {
try { try {
queryEscaped = URLEncoder.encode(inputString, "UTF-8"); queryEscaped = URLEncoder.encode(inputString, "UTF-8");
} } catch (UnsupportedEncodingException ex) {
catch (UnsupportedEncodingException ex) {
logger.log(Level.SEVERE, "Error escaping URL query, should not happen.", ex); logger.log(Level.SEVERE, "Error escaping URL query, should not happen.", ex);
queryEscaped = query; queryEscaped = query;
} }
}
return queryEscaped; return queryEscaped;
} }
public static void displayDialog(final String title, final String message, final DIALOG_MESSAGE_TYPE type) { public static void displayDialog(final String title, final String message, final DIALOG_MESSAGE_TYPE type) {
int messageType; int messageType;
if (type == DIALOG_MESSAGE_TYPE.ERROR) if (type == DIALOG_MESSAGE_TYPE.ERROR) {
messageType = JOptionPane.ERROR_MESSAGE; messageType = JOptionPane.ERROR_MESSAGE;
else if (type == DIALOG_MESSAGE_TYPE.WARN) } else if (type == DIALOG_MESSAGE_TYPE.WARN) {
messageType = JOptionPane.WARNING_MESSAGE; messageType = JOptionPane.WARNING_MESSAGE;
else messageType = JOptionPane.INFORMATION_MESSAGE; } else {
messageType = JOptionPane.INFORMATION_MESSAGE;
}
final Component parentComponent = null; // Use default window frame. final Component parentComponent = null; // Use default window frame.
JOptionPane.showMessageDialog( JOptionPane.showMessageDialog(
@ -108,4 +111,20 @@ public class KeywordSearchUtil {
title, title,
messageType); messageType);
} }
/**
 * Shows a yes/no confirmation dialog and reports which option was chosen.
 *
 * @param title   text used as the dialog title
 * @param message text shown in the dialog body
 * @param type    selects the dialog's message type: ERROR and WARN map to
 *                the matching JOptionPane constants, anything else
 *                (including null) maps to INFORMATION_MESSAGE
 * @return true only when the dialog returns YES_OPTION; false for any
 *         other outcome
 */
public static boolean displayConfirmDialog(final String title, final String message, final DIALOG_MESSAGE_TYPE type) {
    // Translate the project-level message type into the Swing constant.
    final int swingMessageType = (type == DIALOG_MESSAGE_TYPE.ERROR)
            ? JOptionPane.ERROR_MESSAGE
            : ((type == DIALOG_MESSAGE_TYPE.WARN)
                    ? JOptionPane.WARNING_MESSAGE
                    : JOptionPane.INFORMATION_MESSAGE);

    // A null parent component is passed, as in the original call.
    final int answer = JOptionPane.showConfirmDialog(
            null, message, title, JOptionPane.YES_NO_OPTION, swingMessageType);
    return answer == JOptionPane.YES_OPTION;
}
} }

View File

@ -56,7 +56,7 @@ public class LuceneQuery implements KeywordSearchQuery {
@Override @Override
public void escape() { public void escape() {
queryEscaped = KeywordSearchUtil.escapeLuceneQuery(query, true); queryEscaped = KeywordSearchUtil.escapeLuceneQuery(query, true, true);
isEscaped = true; isEscaped = true;
} }

View File

@ -168,11 +168,12 @@ public class TermComponentQuery implements KeywordSearchQuery {
final int lastTerm = terms.size() - 1; final int lastTerm = terms.size() - 1;
int curTerm = 0; int curTerm = 0;
for (Term term : terms) { for (Term term : terms) {
final String termS = term.getTerm(); final String termS = KeywordSearchUtil.escapeLuceneQuery(term.getTerm(), true, false);
//final String termS = term.getTerm();
if (!termS.contains("*")) { if (!termS.contains("*")) {
filesQueryB.append(termS); filesQueryB.append(TERMS_SEARCH_FIELD).append(":").append(termS);
if (curTerm != lastTerm) { if (curTerm != lastTerm) {
filesQueryB.append(" "); filesQueryB.append(" "); //acts as OR ||
} }
} }
++curTerm; ++curTerm;
@ -181,7 +182,7 @@ public class TermComponentQuery implements KeywordSearchQuery {
if (!terms.isEmpty()) { if (!terms.isEmpty()) {
LuceneQuery filesQuery = new LuceneQuery(filesQueryB.toString()); LuceneQuery filesQuery = new LuceneQuery(filesQueryB.toString());
filesQuery.escape(); //filesQuery.escape();
try { try {
uniqueMatches = filesQuery.performQuery(); uniqueMatches = filesQuery.performQuery();
} catch (RuntimeException e) { } catch (RuntimeException e) {
@ -190,8 +191,10 @@ public class TermComponentQuery implements KeywordSearchQuery {
} }
//result postprocessing
//filter out non-matching files using the original query (whether literal or not) //filter out non-matching files using the original query (whether literal or not)
//TODO this could be costly, for now just testing how it performs boolean postprocess = false;
if (postprocess) {
for (FsContent f : uniqueMatches) { for (FsContent f : uniqueMatches) {
Pattern p = Pattern.compile(queryEscaped, Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Pattern p = Pattern.compile(queryEscaped, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
final String contentStr = KeywordSearch.getServer().getCore().getSolrContent(f); final String contentStr = KeywordSearch.getServer().getCore().getSolrContent(f);
@ -200,6 +203,9 @@ public class TermComponentQuery implements KeywordSearchQuery {
results.add(f); results.add(f);
} }
} }
} else {
results.addAll(uniqueMatches);
}