Increase query accuracy and reduce the query postprocessing needed (at least for case-insensitive queries) by querying only the whitespace-delimited (ws delim) Solr field in the aggregate query to get file ids

This commit is contained in:
adam-m 2012-01-10 17:12:29 -05:00
parent 6a6753868e
commit 52b29a2892
5 changed files with 113 additions and 86 deletions

View File

@ -57,11 +57,11 @@ class HighlightedMatchesSource implements MarkupSource {
public String getMarkup() { public String getMarkup() {
SolrQuery q = new SolrQuery(); SolrQuery q = new SolrQuery();
final String queryEscaped = KeywordSearchUtil.escapeLuceneQuery(solrQuery, true); final String queryEscaped = KeywordSearchUtil.escapeLuceneQuery(solrQuery, true, false);
q.setQuery(queryEscaped); q.setQuery(queryEscaped);
q.addFilterQuery("id:" + content.getId()); q.addFilterQuery("id:" + content.getId());
q.addHighlightField("content"); q.addHighlightField("content"); //for exact highlighting, try content_ws field (with stored="true" in Solr schema)
q.setHighlightSimplePre(HIGHLIGHT_PRE); q.setHighlightSimplePre(HIGHLIGHT_PRE);
q.setHighlightSimplePost(HIGHLIGHT_POST); q.setHighlightSimplePost(HIGHLIGHT_POST);
q.setHighlightFragsize(0); // don't fragment the highlight q.setHighlightFragsize(0); // don't fragment the highlight

View File

@ -70,8 +70,7 @@ public class KeywordSearchResultFactory extends ChildFactory<KeyValueThing> {
public String toString() { public String toString() {
return "Match"; return "Match";
} }
}, },}
}
private Presentation presentation; private Presentation presentation;
private Collection<String> queries; private Collection<String> queries;
private Collection<KeyValueThing> things; private Collection<KeyValueThing> things;
@ -149,6 +148,7 @@ public class KeywordSearchResultFactory extends ChildFactory<KeyValueThing> {
childFactory = new ResultCollapsedChildFactory(thing); childFactory = new ResultCollapsedChildFactory(thing);
final Node ret = new KeyValueNode(thing, Children.create(childFactory, true)); final Node ret = new KeyValueNode(thing, Children.create(childFactory, true));
SwingUtilities.invokeLater(new Runnable() { SwingUtilities.invokeLater(new Runnable() {
@Override @Override
public void run() { public void run() {
//DataResultViewerTable view = Utilities.actionsGlobalContext().lookup(DataResultViewerTable.class); //DataResultViewerTable view = Utilities.actionsGlobalContext().lookup(DataResultViewerTable.class);
@ -199,11 +199,11 @@ public class KeywordSearchResultFactory extends ChildFactory<KeyValueThing> {
final int lastTerm = terms.size() - 1; final int lastTerm = terms.size() - 1;
int curTerm = 0; int curTerm = 0;
for (Term term : terms) { for (Term term : terms) {
final String termS = KeywordSearchUtil.escapeLuceneQuery(term.getTerm(), true); final String termS = KeywordSearchUtil.escapeLuceneQuery(term.getTerm(), true, false);
if (!termS.contains("*")) { if (!termS.contains("*")) {
highlightQuery.append(termS); highlightQuery.append(termS);
if (lastTerm != curTerm) { if (lastTerm != curTerm) {
highlightQuery.append(" "); highlightQuery.append(" "); //acts as OR ||
} }
} }
} }
@ -304,25 +304,27 @@ public class KeywordSearchResultFactory extends ChildFactory<KeyValueThing> {
final String contentStr = KeywordSearch.getServer().getCore().getSolrContent(content); final String contentStr = KeywordSearch.getServer().getCore().getSolrContent(content);
//make sure the file contains a match (this gets rid of large number of false positives) //postprocess
//TODO option in GUI to include approximate matches (faster) //make sure Solr result contains a match (this gets rid of large number of false positives)
boolean matchFound = false; boolean postprocess = true;
if (contentStr != null) {//if not null, some error getting from Solr, handle it by not filtering out boolean matchFound = true;
//perform java regex to validate match from Solr if (postprocess) {
String origQuery = thingContent.getQuery(); if (contentStr != null) {//if not null, some error getting from Solr, handle it by not filtering out
//perform java regex to validate match from Solr
String origQuery = thingContent.getQuery();
//since query is a match result, we can assume literal pattern
origQuery = Pattern.quote(origQuery);
Pattern p = Pattern.compile(origQuery, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
//escape the regex query because it may contain special characters from the previous match Matcher m = p.matcher(contentStr);
//since it's a match result, we can assume literal pattern matchFound = m.find();
origQuery = Pattern.quote(origQuery); }
Pattern p = Pattern.compile(origQuery, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher m = p.matcher(contentStr);
matchFound = m.find();
} }
if (matchFound) { if (matchFound) {
Node kvNode = new KeyValueNode(thingContent, Children.LEAF); Node kvNode = new KeyValueNode(thingContent, Children.LEAF);
//wrap in KeywordSearchFilterNode for the markup content, might need to override FilterNode for more customization //wrap in KeywordSearchFilterNode for the markup content
HighlightedMatchesSource highlights = new HighlightedMatchesSource(content, query); HighlightedMatchesSource highlights = new HighlightedMatchesSource(content, query);
return new KeywordSearchFilterNode(highlights, kvNode, query); return new KeywordSearchFilterNode(highlights, kvNode, query);
} else { } else {

View File

@ -16,7 +16,6 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.keywordsearch; package org.sleuthkit.autopsy.keywordsearch;
import java.awt.Component; import java.awt.Component;
@ -31,8 +30,10 @@ import org.sleuthkit.datamodel.TskException;
public class KeywordSearchUtil { public class KeywordSearchUtil {
public enum DIALOG_MESSAGE_TYPE {ERROR, WARN, INFO}; public enum DIALOG_MESSAGE_TYPE {
ERROR, WARN, INFO
};
private static final Logger logger = Logger.getLogger(KeywordSearchUtil.class.getName()); private static final Logger logger = Logger.getLogger(KeywordSearchUtil.class.getName());
public static String buildDirName(FsContent f) { public static String buildDirName(FsContent f) {
@ -65,42 +66,44 @@ public class KeywordSearchUtil {
* such as /+-&|!(){}[]^"~*?:\ and treat the whole query as literal word * such as /+-&|!(){}[]^"~*?:\ and treat the whole query as literal word
* @return encoded query * @return encoded query
*/ */
public static String escapeLuceneQuery(String query, boolean escapeLuceneChars) { public static String escapeLuceneQuery(String query, boolean escapeLuceneChars, boolean encode) {
String queryEscaped = null; String queryEscaped = null;
String inputString = query; String inputString = query;
if (escapeLuceneChars == true) { if (escapeLuceneChars == true) {
final String ESCAPE_CHARS = "/+-&|!(){}[]^\"~*?:\\"; final String ESCAPE_CHARS = "/+-&|!(){}[]^\"~*?:\\";
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (int i = 0; i< inputString.length(); ++i) { for (int i = 0; i < inputString.length(); ++i) {
char c = inputString.charAt(i); char c = inputString.charAt(i);
if (ESCAPE_CHARS.contains(Character.toString(c)) ) { if (ESCAPE_CHARS.contains(Character.toString(c))) {
sb.append("\\"); sb.append("\\");
} }
sb.append(c); sb.append(c);
} }
inputString = sb.toString(); queryEscaped = inputString = sb.toString();
} }
try { if (encode) {
queryEscaped = URLEncoder.encode(inputString, "UTF-8"); try {
} queryEscaped = URLEncoder.encode(inputString, "UTF-8");
catch (UnsupportedEncodingException ex) { } catch (UnsupportedEncodingException ex) {
logger.log(Level.SEVERE, "Error escaping URL query, should not happen.", ex); logger.log(Level.SEVERE, "Error escaping URL query, should not happen.", ex);
queryEscaped = query; queryEscaped = query;
}
} }
return queryEscaped; return queryEscaped;
} }
public static void displayDialog(final String title, final String message, final DIALOG_MESSAGE_TYPE type) { public static void displayDialog(final String title, final String message, final DIALOG_MESSAGE_TYPE type) {
int messageType; int messageType;
if (type == DIALOG_MESSAGE_TYPE.ERROR) if (type == DIALOG_MESSAGE_TYPE.ERROR) {
messageType = JOptionPane.ERROR_MESSAGE; messageType = JOptionPane.ERROR_MESSAGE;
else if (type == DIALOG_MESSAGE_TYPE.WARN) } else if (type == DIALOG_MESSAGE_TYPE.WARN) {
messageType = JOptionPane.WARNING_MESSAGE; messageType = JOptionPane.WARNING_MESSAGE;
else messageType = JOptionPane.INFORMATION_MESSAGE; } else {
messageType = JOptionPane.INFORMATION_MESSAGE;
}
final Component parentComponent = null; // Use default window frame. final Component parentComponent = null; // Use default window frame.
JOptionPane.showMessageDialog( JOptionPane.showMessageDialog(
parentComponent, parentComponent,
@ -108,4 +111,20 @@ public class KeywordSearchUtil {
title, title,
messageType); messageType);
} }
/**
 * Shows a yes/no confirmation dialog and reports the user's choice.
 *
 * @param title   text used as the dialog title
 * @param message text shown in the dialog body
 * @param type    severity of the dialog; ERROR and WARN select the matching
 *                JOptionPane message type, any other value selects the
 *                information message type
 * @return true only when the dialog returns YES_OPTION, false otherwise
 */
public static boolean displayConfirmDialog(final String title, final String message, final DIALOG_MESSAGE_TYPE type) {
    // Translate the module's severity enum into the JOptionPane constant.
    final int messageType = (type == DIALOG_MESSAGE_TYPE.ERROR)
            ? JOptionPane.ERROR_MESSAGE
            : (type == DIALOG_MESSAGE_TYPE.WARN)
                    ? JOptionPane.WARNING_MESSAGE
                    : JOptionPane.INFORMATION_MESSAGE;

    // null parent component, same as the original call.
    final int choice = JOptionPane.showConfirmDialog(null, message, title, JOptionPane.YES_NO_OPTION, messageType);
    return choice == JOptionPane.YES_OPTION;
}
} }

View File

@ -56,7 +56,7 @@ public class LuceneQuery implements KeywordSearchQuery {
@Override @Override
public void escape() { public void escape() {
queryEscaped = KeywordSearchUtil.escapeLuceneQuery(query, true); queryEscaped = KeywordSearchUtil.escapeLuceneQuery(query, true, true);
isEscaped = true; isEscaped = true;
} }

View File

@ -49,7 +49,7 @@ import org.sleuthkit.autopsy.keywordsearch.KeywordSearchQueryManager.Presentatio
import org.sleuthkit.datamodel.FsContent; import org.sleuthkit.datamodel.FsContent;
public class TermComponentQuery implements KeywordSearchQuery { public class TermComponentQuery implements KeywordSearchQuery {
private static final int TERMS_UNLIMITED = -1; private static final int TERMS_UNLIMITED = -1;
//corresponds to field in Solr schema, analyzed with white-space tokenizer only //corresponds to field in Solr schema, analyzed with white-space tokenizer only
private static final String TERMS_SEARCH_FIELD = "content_ws"; private static final String TERMS_SEARCH_FIELD = "content_ws";
@ -60,14 +60,14 @@ public class TermComponentQuery implements KeywordSearchQuery {
private String queryEscaped; private String queryEscaped;
private boolean isEscaped; private boolean isEscaped;
private List<Term> terms; private List<Term> terms;
public TermComponentQuery(String query) { public TermComponentQuery(String query) {
this.termsQuery = query; this.termsQuery = query;
this.queryEscaped = query; this.queryEscaped = query;
isEscaped = false; isEscaped = false;
terms = null; terms = null;
} }
@Override @Override
public void escape() { public void escape() {
//treat as literal //treat as literal
@ -77,13 +77,13 @@ public class TermComponentQuery implements KeywordSearchQuery {
queryEscaped = Pattern.quote(termsQuery); queryEscaped = Pattern.quote(termsQuery);
isEscaped = true; isEscaped = true;
} }
@Override @Override
public boolean validate() { public boolean validate() {
if (queryEscaped.equals("")) { if (queryEscaped.equals("")) {
return false; return false;
} }
boolean valid = true; boolean valid = true;
try { try {
Pattern.compile(queryEscaped); Pattern.compile(queryEscaped);
@ -110,9 +110,9 @@ public class TermComponentQuery implements KeywordSearchQuery {
q.setTermsRegex(queryEscaped); q.setTermsRegex(queryEscaped);
q.addTermsField(TERMS_SEARCH_FIELD); q.addTermsField(TERMS_SEARCH_FIELD);
q.setTimeAllowed(TERMS_TIMEOUT); q.setTimeAllowed(TERMS_TIMEOUT);
return q; return q;
} }
/* /*
@ -120,7 +120,7 @@ public class TermComponentQuery implements KeywordSearchQuery {
*/ */
protected List<Term> executeQuery(SolrQuery q) { protected List<Term> executeQuery(SolrQuery q) {
Server.Core solrCore = KeywordSearch.getServer().getCore(); Server.Core solrCore = KeywordSearch.getServer().getCore();
List<Term> termsCol = null; List<Term> termsCol = null;
try { try {
TermsResponse tr = solrCore.queryTerms(q); TermsResponse tr = solrCore.queryTerms(q);
@ -131,17 +131,17 @@ public class TermComponentQuery implements KeywordSearchQuery {
return null; //no need to create result view, just display error dialog return null; //no need to create result view, just display error dialog
} }
} }
@Override @Override
public String getEscapedQueryString() { public String getEscapedQueryString() {
return this.queryEscaped; return this.queryEscaped;
} }
@Override @Override
public String getQueryString() { public String getQueryString() {
return this.termsQuery; return this.termsQuery;
} }
@Override @Override
public Collection<Term> getTerms() { public Collection<Term> getTerms() {
return terms; return terms;
@ -154,7 +154,7 @@ public class TermComponentQuery implements KeywordSearchQuery {
@Override @Override
public List<FsContent> performQuery() { public List<FsContent> performQuery() {
List<FsContent> results = new ArrayList<FsContent>(); List<FsContent> results = new ArrayList<FsContent>();
final SolrQuery q = createQuery(); final SolrQuery q = createQuery();
terms = executeQuery(q); terms = executeQuery(q);
@ -168,20 +168,21 @@ public class TermComponentQuery implements KeywordSearchQuery {
final int lastTerm = terms.size() - 1; final int lastTerm = terms.size() - 1;
int curTerm = 0; int curTerm = 0;
for (Term term : terms) { for (Term term : terms) {
final String termS = term.getTerm(); final String termS = KeywordSearchUtil.escapeLuceneQuery(term.getTerm(), true, false);
//final String termS = term.getTerm();
if (!termS.contains("*")) { if (!termS.contains("*")) {
filesQueryB.append(termS); filesQueryB.append(TERMS_SEARCH_FIELD).append(":").append(termS);
if (curTerm != lastTerm) { if (curTerm != lastTerm) {
filesQueryB.append(" "); filesQueryB.append(" "); //acts as OR ||
} }
} }
++curTerm; ++curTerm;
} }
List<FsContent> uniqueMatches = new ArrayList<FsContent>(); List<FsContent> uniqueMatches = new ArrayList<FsContent>();
if (!terms.isEmpty()) { if (!terms.isEmpty()) {
LuceneQuery filesQuery = new LuceneQuery(filesQueryB.toString()); LuceneQuery filesQuery = new LuceneQuery(filesQueryB.toString());
filesQuery.escape(); //filesQuery.escape();
try { try {
uniqueMatches = filesQuery.performQuery(); uniqueMatches = filesQuery.performQuery();
} catch (RuntimeException e) { } catch (RuntimeException e) {
@ -190,28 +191,33 @@ public class TermComponentQuery implements KeywordSearchQuery {
} }
//result postprocessing
//filter out non-matching files using the original query (whether literal or not) //filter out non-matching files using the original query (whether literal or not)
//TODO this could be costly, for now just testing how it performs boolean postprocess = false;
for (FsContent f : uniqueMatches) { if (postprocess) {
Pattern p = Pattern.compile(queryEscaped, Pattern.CASE_INSENSITIVE | Pattern.DOTALL); for (FsContent f : uniqueMatches) {
final String contentStr = KeywordSearch.getServer().getCore().getSolrContent(f); Pattern p = Pattern.compile(queryEscaped, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher m = p.matcher(contentStr); final String contentStr = KeywordSearch.getServer().getCore().getSolrContent(f);
if (m.find()) { Matcher m = p.matcher(contentStr);
results.add(f); if (m.find()) {
results.add(f);
}
} }
} else {
results.addAll(uniqueMatches);
} }
return results; return results;
} }
@Override @Override
public void execute() { public void execute() {
SolrQuery q = createQuery(); SolrQuery q = createQuery();
logger.log(Level.INFO, "Executing TermsComponent query: " + q.toString()); logger.log(Level.INFO, "Executing TermsComponent query: " + q.toString());
final SwingWorker worker = new TermsQueryWorker(q); final SwingWorker worker = new TermsQueryWorker(q);
worker.execute(); worker.execute();
} }
@ -221,9 +227,9 @@ public class TermComponentQuery implements KeywordSearchQuery {
* @param terms * @param terms
*/ */
private void publishNodes(List<Term> terms) { private void publishNodes(List<Term> terms) {
Collection<KeyValueThing> things = new ArrayList<KeyValueThing>(); Collection<KeyValueThing> things = new ArrayList<KeyValueThing>();
Iterator<Term> it = terms.iterator(); Iterator<Term> it = terms.iterator();
int termID = 0; int termID = 0;
//long totalMatches = 0; //long totalMatches = 0;
@ -237,17 +243,17 @@ public class TermComponentQuery implements KeywordSearchQuery {
things.add(new KeyValueThing(match, kvs, ++termID)); things.add(new KeyValueThing(match, kvs, ++termID));
//totalMatches += matches; //totalMatches += matches;
} }
Node rootNode = null; Node rootNode = null;
if (things.size() > 0) { if (things.size() > 0) {
Children childThingNodes = Children childThingNodes =
Children.create(new KeywordSearchResultFactory(termsQuery, things, Presentation.DETAIL), true); Children.create(new KeywordSearchResultFactory(termsQuery, things, Presentation.DETAIL), true);
rootNode = new AbstractNode(childThingNodes); rootNode = new AbstractNode(childThingNodes);
} else { } else {
rootNode = Node.EMPTY; rootNode = Node.EMPTY;
} }
final String pathText = "Term query"; final String pathText = "Term query";
// String pathText = "RegEx query: " + termsQuery // String pathText = "RegEx query: " + termsQuery
//+ " Files with exact matches: " + Long.toString(totalMatches) + " (also listing approximate matches)"; //+ " Files with exact matches: " + Long.toString(totalMatches) + " (also listing approximate matches)";
@ -256,29 +262,29 @@ public class TermComponentQuery implements KeywordSearchQuery {
searchResultWin.requestActive(); // make it the active top component searchResultWin.requestActive(); // make it the active top component
} }
class TermsQueryWorker extends SwingWorker<List<Term>, Void> { class TermsQueryWorker extends SwingWorker<List<Term>, Void> {
private SolrQuery q; private SolrQuery q;
private ProgressHandle progress; private ProgressHandle progress;
TermsQueryWorker(SolrQuery q) { TermsQueryWorker(SolrQuery q) {
this.q = q; this.q = q;
} }
@Override @Override
protected List<Term> doInBackground() throws Exception { protected List<Term> doInBackground() throws Exception {
progress = ProgressHandleFactory.createHandle("Terms query task"); progress = ProgressHandleFactory.createHandle("Terms query task");
progress.start(); progress.start();
progress.progress("Running Terms query."); progress.progress("Running Terms query.");
terms = executeQuery(q); terms = executeQuery(q);
progress.progress("Terms query completed."); progress.progress("Terms query completed.");
return terms; return terms;
} }
@Override @Override
protected void done() { protected void done() {
if (!this.isCancelled()) { if (!this.isCancelled()) {
@ -287,7 +293,7 @@ public class TermComponentQuery implements KeywordSearchQuery {
publishNodes(terms); publishNodes(terms);
} catch (InterruptedException e) { } catch (InterruptedException e) {
logger.log(Level.INFO, "Exception while executing regex query,", e); logger.log(Level.INFO, "Exception while executing regex query,", e);
} catch (ExecutionException e) { } catch (ExecutionException e) {
logger.log(Level.INFO, "Exception while executing regex query,", e); logger.log(Level.INFO, "Exception while executing regex query,", e);
} finally { } finally {