Fixing up files that were broken by an assortment of merge and merge reverts.

esaunders 2019-10-07 14:12:55 -04:00
parent 5864303f21
commit 680ad710e1
9 changed files with 232 additions and 38 deletions

View File

@@ -21,6 +21,7 @@
<dependency conf="autopsy->*" org="org.apache.solr" name="solr-solrj" rev="4.9.1"/>
<dependency conf="autopsy->*" org="commons-lang" name="commons-lang" rev="2.4"/>
<dependency conf="autopsy->*" org="commons-validator" name="commons-validator" rev="1.5.1"/>
+<dependency conf="autopsy->*" org="com.optimaize.languagedetector" name="language-detector" rev="0.6"/>
<!-- Exclude the version of cxf-rt-rs-client from Tika 1.20, one of its depedencies breaks Ivy -->
<dependency conf="autopsy->*" org="org.apache.tika" name="tika-parsers" rev="1.20">
<exclude module="cxf-rt-rs-client"/>

View File

@@ -29,6 +29,7 @@ file.reference.jericho-html-3.3.jar=release/modules/ext/jericho-html-3.3.jar
file.reference.joda-time-2.2.jar=release/modules/ext/joda-time-2.2.jar
file.reference.json-simple-1.1.1.jar=release/modules/ext/json-simple-1.1.1.jar
file.reference.juniversalchardet-1.0.3.jar=release/modules/ext/juniversalchardet-1.0.3.jar
+file.reference.language-detector-0.6.jar=release\\modules\\ext\\language-detector-0.6.jar
file.reference.libsvm-3.1.jar=release/modules/ext/libsvm-3.1.jar
file.reference.log4j-1.2.17.jar=release/modules/ext/log4j-1.2.17.jar
file.reference.lucene-core-4.0.0.jar=release/modules/ext/lucene-core-4.0.0.jar

View File

@@ -230,10 +230,6 @@
<package>org.codehaus.stax2.validation</package>
<package>org.noggit</package>
<package>org.sleuthkit.autopsy.keywordsearch</package>
-<package>org.slf4j</package>
-<package>org.slf4j.event</package>
-<package>org.slf4j.helpers</package>
-<package>org.slf4j.spi</package>
</public-packages>
<class-path-extension>
<runtime-relative-path>ext/commons-digester-1.8.1.jar</runtime-relative-path>
@@ -283,6 +279,10 @@
<runtime-relative-path>ext/guava-17.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/guava-17.0.jar</binary-origin>
</class-path-extension>
+<class-path-extension>
+<runtime-relative-path>ext/language-detector-0.6.jar</runtime-relative-path>
+<binary-origin>release\modules\ext\language-detector-0.6.jar</binary-origin>
+</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/joda-time-2.2.jar</runtime-relative-path>
<binary-origin>release/modules/ext/joda-time-2.2.jar</binary-origin>

View File

@@ -45,7 +45,7 @@
that avoids logging every request
-->
-<schema name="Autopsy Keyword Search" version="2.1">
+<schema name="Autopsy Keyword Search" version="2.2">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
Applications should change this to reflect the nature of the search collection.
version="1.4" is Solr's version number for the schema syntax and semantics. It should
@@ -62,6 +62,7 @@
2.0 added chunk_size field
2.1 to facilitate case insensitive regex search,no longer copying content into content_str.
content_str will be populated with lowercase content by Autopsy.
+2.2 added text_ja type, content_ja and language fields to support Japanese text search
-->
<types>
@@ -243,6 +244,18 @@
</analyzer>
</fieldType>
+<fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
+<analyzer>
+<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
+<filter class="solr.JapaneseBaseFormFilterFactory"/>
+<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" />
+<filter class="solr.CJKWidthFilterFactory"/>
+<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" />
+<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
+<filter class="solr.LowerCaseFilterFactory"/>
+</analyzer>
+</fieldType>
<!-- A text field with defaults appropriate for English: it
tokenizes with StandardTokenizer, removes English stop words
(stopwords_en.txt), down cases, protects words from protwords.txt, and
@@ -557,6 +570,11 @@
via copyField further on in this schema -->
<field name="text" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>
+<!-- Store language detection result. Only parents of text documents have this -->
+<field name="language" type="string" indexed="false" stored="true" required="false"/>
+<field name="content_ja" type="text_ja" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>
<!-- catchall text field that indexes tokens both normally and in reverse for efficient
leading wildcard queries. -->
<!--<field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/>-->
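For reference, the new content_ja and language fields can be exercised from SolrJ roughly as in the sketch below. This is illustrative only and not part of this commit; the core URL and the "id" document are hypothetical, and Autopsy itself goes through its own Server wrapper rather than a raw HttpSolrServer. Note that language is stored but not indexed, so it can be read back from results but not filtered on.

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;

public class JapaneseFieldSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical core URL; Autopsy manages its own Solr core.
        HttpSolrServer solr = new HttpSolrServer("http://localhost:8983/solr/autopsy");

        // Index a chunk whose language was detected as Japanese: the text goes into
        // content_ja (analyzed by the text_ja chain) and "ja" is stored in language.
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", "12345_1");
        doc.addField("content_ja", "雨が降っています");
        doc.addField("language", "ja");
        solr.add(doc);
        solr.commit();

        // Search the Japanese-analyzed field instead of the generic text field.
        SolrQuery q = new SolrQuery("content_ja:\"雨が降る\"");
        q.addField("id");
        q.addField("language");
        QueryResponse resp = solr.query(q);
        for (SolrDocument d : resp.getResults()) {
            System.out.println(d.getFieldValue("id") + " -> " + d.getFieldValue("language"));
        }
        solr.shutdown();
    }
}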

View File

@@ -38,6 +38,7 @@ import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest.METHOD;
import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
@@ -346,6 +347,8 @@ class HighlightedText implements IndexedText {
String chunkID = "";
String highlightField = "";
try {
+double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
loadPageInfo(); //inits once
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
@@ -359,22 +362,46 @@
highlightField = LuceneQuery.HIGHLIGHT_FIELD;
if (isLiteral) {
-//if the query is literal try to get solr to do the highlighting
-final String highlightQuery = keywords.stream()
-.map(HighlightedText::constructEscapedSolrQuery)
-.collect(Collectors.joining(" "));
+if (2.2 <= indexSchemaVersion) {
+//if the query is literal try to get solr to do the highlighting
+final String highlightQuery = keywords.stream().map(s ->
+LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
+.collect(Collectors.joining(" OR "));
+q.setQuery(highlightQuery);
+for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
+q.addField(field.toString());
+q.addHighlightField(field.toString());
+}
+q.addField(Server.Schema.LANGUAGE.toString());
+// in case of single term literal query there is only 1 term
+LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
+q.addFilterQuery(filterQuery);
+q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
+} else {
+//if the query is literal try to get solr to do the highlighting
+final String highlightQuery = keywords.stream()
+.map(HighlightedText::constructEscapedSolrQuery)
+.collect(Collectors.joining(" "));
q.setQuery(highlightQuery);
q.addField(highlightField);
q.addFilterQuery(filterQuery);
q.addHighlightField(highlightField);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
+}
//tune the highlighter
-q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
-q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
-q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
-q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
+if (shouldUseOriginalHighlighter(contentIdStr)) {
+// use original highlighter
+q.setParam("hl.useFastVectorHighlighter", "off");
+q.setParam("hl.simple.pre", HIGHLIGHT_PRE);
+q.setParam("hl.simple.post", HIGHLIGHT_POST);
+} else {
+q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
+q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
+q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
+q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
+}
//docs says makes sense for the original Highlighter only, but not really
q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
@@ -406,12 +433,40 @@
if (responseHighlightID == null) {
highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
} else {
-List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
-if (contentHighlights == null) {
-highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
+SolrDocument document = response.getResults().get(0);
+Object language = document.getFieldValue(Server.Schema.LANGUAGE.toString());
+if (2.2 <= indexSchemaVersion && language != null) {
+List<String> contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
+if (contentHighlights == null) {
+highlightedContent = "";
+} else {
+int hitCountInMiniChunk = LanguageSpecificContentQueryHelper.queryChunkTermfreq(keywords, MiniChunkHelper.getChunkIdString(contentIdStr));
+String s = contentHighlights.get(0).trim();
+// If there is a mini-chunk, trim the content not to show highlighted text in it.
+if (0 < hitCountInMiniChunk) {
+int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
+int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
+s,
+HIGHLIGHT_PRE,
+// trim after the last hit in chunk
+hitCountInChunk - hitCountInMiniChunk);
+if (idx != -1) {
+highlightedContent = s.substring(0, idx);
+} else {
+highlightedContent = s;
+}
+} else {
+highlightedContent = s;
+}
+}
} else {
-// extracted content (minus highlight tags) is HTML-escaped
-highlightedContent = contentHighlights.get(0).trim();
+List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
+if (contentHighlights == null) {
+highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
+} else {
+// extracted content (minus highlight tags) is HTML-escaped
+highlightedContent = contentHighlights.get(0).trim();
+}
}
}
}
}
@@ -551,4 +606,37 @@ class HighlightedText implements IndexedText {
return buf.toString();
}
+/**
+ * Return true if we should use the original highlighter instead of FastVectorHighlighter.
+ *
+ * In the case of Japanese text and a phrase query, FastVectorHighlighter does not work well.
+ *
+ * Note about highlighters:
+ * If the query is "雨が降る" (phrase query), Solr divides it into 雨 and 降る. が is a stop word here.
+ * It seems that FastVector highlighter does not produce any snippet when there is a stop word between terms.
+ * On the other hand, original highlighter produces multiple matches, for example:
+ * > <em>雨</em>が<em>降っ</em>ています
+ * Unified highlighter (from Solr 6.4) handles the case as expected:
+ * > <em>雨が降っ</em>ています
+ */
+private boolean shouldUseOriginalHighlighter(String contentID) throws NoOpenCoreException, KeywordSearchModuleException {
+final SolrQuery q = new SolrQuery();
+q.setQuery("*:*");
+q.addFilterQuery(Server.Schema.ID.toString() + ":" + contentID);
+q.setFields(Server.Schema.LANGUAGE.toString());
+QueryResponse response = solrServer.query(q, METHOD.POST);
+SolrDocumentList solrDocuments = response.getResults();
+if (!solrDocuments.isEmpty()) {
+SolrDocument solrDocument = solrDocuments.get(0);
+if (solrDocument != null) {
+Object languageField = solrDocument.getFieldValue(Server.Schema.LANGUAGE.toString());
+if (languageField != null) {
+return languageField.equals("ja");
+}
+}
+}
+return false;
+}
}

View File

@@ -39,7 +39,7 @@ class IndexFinder {
private static final String KWS_DATA_FOLDER_NAME = "data";
private static final String INDEX_FOLDER_NAME = "index";
private static final String CURRENT_SOLR_VERSION = "4";
-private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.1";
+private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.2";
static String getCurrentSolrVersion() {
return CURRENT_SOLR_VERSION;

View File

@@ -20,8 +20,10 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.io.BufferedReader;
import java.io.Reader;
+import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
+import java.util.Optional;
import java.util.logging.Level;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.client.solrj.SolrServerException;
@@ -59,6 +61,8 @@ class Ingester {
private final Server solrServer = KeywordSearch.getServer();
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
private static Ingester instance;
+private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
+        = new LanguageSpecificContentIndexingHelper();
private Ingester() {
}
@@ -93,7 +97,7 @@
* file, but the Solr server is probably fine.
*/
void indexMetaDataOnly(AbstractFile file) throws IngesterException {
-indexChunk("", file.getName().toLowerCase(), getContentFields(file));
+indexChunk("", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
}
/**
@@ -107,7 +111,7 @@
* artifact, but the Solr server is probably fine.
*/
void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
-indexChunk("", sourceName, getContentFields(artifact));
+indexChunk("", sourceName, new HashMap<>(getContentFields(artifact)));
}
/**
@@ -143,21 +147,30 @@
< T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
int numChunks = 0; //unknown until chunking is done
-Map<String, String> fields = getContentFields(source);
+Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
//Get a reader for the content of the given source
try (BufferedReader reader = new BufferedReader(sourceReader)) {
Chunker chunker = new Chunker(reader);
-for (Chunk chunk : chunker) {
+while (chunker.hasNext()) {
if (context != null && context.fileIngestIsCancelled()) {
logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
return false;
}
+Chunk chunk = chunker.next();
+Map<String, Object> fields = new HashMap<>(contentFields);
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
fields.put(Server.Schema.ID.toString(), chunkId);
fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
+Optional<Language> language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk);
+language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
try {
//add the chunk text to Solr index
indexChunk(chunk.toString(), sourceName, fields);
+// add mini chunk when there's a language specific field
+if (chunker.hasNext() && language.isPresent()) {
+languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
+}
numChunks++;
} catch (Ingester.IngesterException ingEx) {
logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
@@ -171,12 +184,13 @@
return false;
}
} catch (Exception ex) {
-logger.log(Level.WARNING, "Unexpected error while indexing content from " + sourceID + ": " + sourceName, ex);//NON-NLS
+logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false;
} finally {
if (context != null && context.fileIngestIsCancelled()) {
return false;
} else {
+Map<String, Object> fields = new HashMap<>(contentFields);
//after all chunks, index just the meta data, including the numChunks, of the parent file
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
//reset id field to base document id
@@ -202,7 +216,7 @@
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
-private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
+private void indexChunk(String chunk, String sourceName, Map<String, Object> fields) throws IngesterException {
if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
//JMTODO: actually if the we couldn't get the image id it is set to -1,
// but does this really mean we don't want to index it?
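The detectLanguageIfNeeded helper called above lives in LanguageSpecificContentIndexingHelper, which is not among this commit's changed files; it is backed by the com.optimaize.languagedetector dependency added in ivy.xml earlier in this diff. A minimal, illustrative sketch of chunk-level detection with that library (not the commit's actual helper) looks like this:

import java.io.IOException;
import java.util.List;
import com.google.common.base.Optional;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;

public class LanguageDetectionSketch {
    public static void main(String[] args) throws IOException {
        // Load the built-in n-gram profiles shipped with language-detector 0.6.
        List<LanguageProfile> profiles = new LanguageProfileReader().readAllBuiltIn();
        LanguageDetector detector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                .withProfiles(profiles)
                .build();
        // detect() returns a Guava Optional; it is absent when confidence is too low.
        Optional<LdLocale> locale = detector.detect("今日は雨が降っています");
        if (locale.isPresent() && "ja".equals(locale.get().getLanguage())) {
            // In this commit, a "ja" result routes the chunk text into content_ja
            // and stores "ja" in the language field of the indexed document.
            System.out.println("Detected Japanese");
        }
    }
}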

View File

@@ -134,6 +134,7 @@ class LuceneQuery implements KeywordSearchQuery {
String cursorMark = CursorMarkParams.CURSOR_MARK_START;
boolean allResultsProcessed = false;
List<KeywordHit> matches = new ArrayList<>();
+LanguageSpecificContentQueryHelper.QueryResults languageSpecificQueryResults = new LanguageSpecificContentQueryHelper.QueryResults();
while (!allResultsProcessed) {
solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
QueryResponse response = solrServer.query(solrQuery, SolrRequest.METHOD.POST);
@@ -141,7 +142,18 @@
// objectId_chunk -> "text" -> List of previews
Map<String, Map<String, List<String>>> highlightResponse = response.getHighlighting();
+if (2.2 <= indexSchemaVersion) {
+languageSpecificQueryResults.highlighting.putAll(response.getHighlighting());
+}
for (SolrDocument resultDoc : resultList) {
+if (2.2 <= indexSchemaVersion) {
+Object language = resultDoc.getFieldValue(Server.Schema.LANGUAGE.toString());
+if (language != null) {
+LanguageSpecificContentQueryHelper.updateQueryResults(languageSpecificQueryResults, resultDoc);
+}
+}
try {
/*
* for each result doc, check that the first occurence of
@@ -153,6 +165,11 @@
final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
final Collection<Object> content = resultDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
+// if the document has language, it should be hit in language specific content fields. So skip here.
+if (resultDoc.containsKey(Server.Schema.LANGUAGE.toString())) {
+continue;
+}
if (indexSchemaVersion < 2.0) {
//old schema versions don't support chunk_size or the content_str fields, so just accept hits
matches.add(createKeywordtHit(highlightResponse, docId));
@@ -179,9 +196,16 @@
cursorMark = nextCursorMark;
}
+List<KeywordHit> mergedMatches;
+if (2.2 <= indexSchemaVersion) {
+mergedMatches = LanguageSpecificContentQueryHelper.mergeKeywordHits(matches, originalKeyword, languageSpecificQueryResults);
+} else {
+mergedMatches = matches;
+}
QueryResults results = new QueryResults(this);
//in case of single term literal query there is only 1 term
-results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), matches);
+results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), mergedMatches);
return results;
}
@@ -262,19 +286,25 @@
*
* @return
*/
-private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
+private SolrQuery createAndConfigureSolrQuery(boolean snippets) throws NoOpenCoreException, KeywordSearchModuleException {
+double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
// Wrap the query string in quotes if this is a literal search term.
String queryStr = originalKeyword.searchTermIsLiteral()
? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped;
// Run the query against an optional alternative field.
if (field != null) {
//use the optional field
queryStr = field + ":" + queryStr;
+q.setQuery(queryStr);
+} else if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
+q.setQuery(LanguageSpecificContentQueryHelper.expandQueryString(queryStr));
+} else {
+q.setQuery(queryStr);
}
-q.setQuery(queryStr);
q.setRows(MAX_RESULTS_PER_CURSOR_MARK);
// Setting the sort order is necessary for cursor based paging to work.
q.setSort(SolrQuery.SortClause.asc(Server.Schema.ID.toString()));
@@ -283,6 +313,11 @@
Server.Schema.CHUNK_SIZE.toString(),
Server.Schema.CONTENT_STR.toString());
+if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
+q.addField(Server.Schema.LANGUAGE.toString());
+LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywordStringEscaped);
+}
for (KeywordQueryFilter filter : filters) {
q.addFilterQuery(filter.toString());
}
@@ -300,8 +335,16 @@
*
* @param q The SolrQuery to configure.
*/
-private static void configurwQueryForHighlighting(SolrQuery q) {
-q.addHighlightField(HIGHLIGHT_FIELD);
+private static void configurwQueryForHighlighting(SolrQuery q) throws NoOpenCoreException {
+double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
+if (2.2 <= indexSchemaVersion) {
+for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
+q.addHighlightField(field.toString());
+}
+} else {
+q.addHighlightField(HIGHLIGHT_FIELD);
+}
q.setHighlightSnippets(1);
q.setHighlightFragsize(SNIPPET_LENGTH);
@@ -404,7 +447,13 @@
if (responseHighlightID == null) {
return "";
}
-List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
+double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
+List<String> contentHighlights;
+if (2.2 <= indexSchemaVersion) {
+contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
+} else {
+contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
+}
if (contentHighlights == null) {
return "";
} else {

View File

@@ -130,6 +130,18 @@ public class Server {
return "content_ws"; //NON-NLS
}
},
+CONTENT_JA {
+@Override
+public String toString() {
+return "content_ja"; //NON-NLS
+}
+},
+LANGUAGE {
+@Override
+public String toString() {
+return "language"; //NON-NLS
+}
+},
FILE_NAME {
@Override
public String toString() {
@@ -175,7 +187,18 @@
public String toString() {
return "chunk_size"; //NON-NLS
}
-}
+},
+/**
+ * termfreq is a function which returns the number of times the term appears.
+ * This is not an actual field defined in schema.xml, but can be gotten from returned documents
+ * in the same way as fields.
+ */
+TERMFREQ {
+@Override
+public String toString() {
+return "termfreq"; //NON-NLS
+}
+}
};
public static final String HL_ANALYZE_CHARS_UNLIMITED = "500000"; //max 1MB in a chunk. use -1 for unlimited, but -1 option may not be supported (not documented)
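The TERMFREQ pseudo-field above corresponds to Solr's termfreq() function query: when it is requested as an aliased field it comes back on each result document under that alias, as a Float value, which is how the HighlightedText change earlier in this commit reads it. A rough SolrJ sketch of that mechanism follows; the core URL, field, and term are illustrative, and the commit's configureTermfreqQuery helper (not shown in this diff) is what builds the real alias.

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;

public class TermfreqSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical core URL; Autopsy goes through its Server class instead.
        HttpSolrServer solr = new HttpSolrServer("http://localhost:8983/solr/autopsy");

        SolrQuery q = new SolrQuery("content_ja:\"雨\"");
        // Alias the termfreq() function query so each returned document carries the
        // per-document count under the name "termfreq", matching the TERMFREQ constant.
        q.addField("termfreq:termfreq(content_ja,'雨')");
        q.addField("id");

        QueryResponse resp = solr.query(q);
        for (SolrDocument doc : resp.getResults()) {
            // Function-query values are returned as Float, hence the cast-and-intValue
            // pattern used in HighlightedText above.
            Float freq = (Float) doc.getFieldValue("termfreq");
            System.out.println(doc.getFieldValue("id") + " matched " + freq.intValue() + " time(s)");
        }
        solr.shutdown();
    }
}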