Fixing up files that were broken by an assortment of merges and merge reverts.

This commit is contained in:
esaunders 2019-10-07 14:12:55 -04:00
parent 5864303f21
commit 680ad710e1
9 changed files with 232 additions and 38 deletions

View File

@@ -21,6 +21,7 @@
<dependency conf="autopsy->*" org="org.apache.solr" name="solr-solrj" rev="4.9.1"/>
<dependency conf="autopsy->*" org="commons-lang" name="commons-lang" rev="2.4"/>
<dependency conf="autopsy->*" org="commons-validator" name="commons-validator" rev="1.5.1"/>
<dependency conf="autopsy->*" org="com.optimaize.languagedetector" name="language-detector" rev="0.6"/>
<!-- Exclude the version of cxf-rt-rs-client from Tika 1.20; one of its dependencies breaks Ivy -->
<dependency conf="autopsy->*" org="org.apache.tika" name="tika-parsers" rev="1.20">
<exclude module="cxf-rt-rs-client"/>

View File

@@ -29,6 +29,7 @@ file.reference.jericho-html-3.3.jar=release/modules/ext/jericho-html-3.3.jar
file.reference.joda-time-2.2.jar=release/modules/ext/joda-time-2.2.jar
file.reference.json-simple-1.1.1.jar=release/modules/ext/json-simple-1.1.1.jar
file.reference.juniversalchardet-1.0.3.jar=release/modules/ext/juniversalchardet-1.0.3.jar
file.reference.language-detector-0.6.jar=release/modules/ext/language-detector-0.6.jar
file.reference.libsvm-3.1.jar=release/modules/ext/libsvm-3.1.jar
file.reference.log4j-1.2.17.jar=release/modules/ext/log4j-1.2.17.jar
file.reference.lucene-core-4.0.0.jar=release/modules/ext/lucene-core-4.0.0.jar

View File

@@ -230,10 +230,6 @@
<package>org.codehaus.stax2.validation</package>
<package>org.noggit</package>
<package>org.sleuthkit.autopsy.keywordsearch</package>
<package>org.slf4j</package>
<package>org.slf4j.event</package>
<package>org.slf4j.helpers</package>
<package>org.slf4j.spi</package>
</public-packages>
<class-path-extension>
<runtime-relative-path>ext/commons-digester-1.8.1.jar</runtime-relative-path>
@@ -283,6 +279,10 @@
<runtime-relative-path>ext/guava-17.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/guava-17.0.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/language-detector-0.6.jar</runtime-relative-path>
<binary-origin>release/modules/ext/language-detector-0.6.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/joda-time-2.2.jar</runtime-relative-path>
<binary-origin>release/modules/ext/joda-time-2.2.jar</binary-origin>

View File

@@ -45,7 +45,7 @@
that avoids logging every request
-->
<schema name="Autopsy Keyword Search" version="2.1">
<schema name="Autopsy Keyword Search" version="2.2">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
Applications should change this to reflect the nature of the search collection.
version="1.4" is Solr's version number for the schema syntax and semantics. It should
@@ -62,6 +62,7 @@
2.0 added chunk_size field
2.1 to facilitate case insensitive regex search, no longer copying content into content_str.
content_str will be populated with lowercase content by Autopsy.
2.2 added text_ja type, content_ja and language fields to support Japanese text search
-->
<types>
@@ -243,6 +244,18 @@
</analyzer>
</fieldType>
<fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
<analyzer>
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
<filter class="solr.JapaneseBaseFormFilterFactory"/>
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" />
<filter class="solr.CJKWidthFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" />
<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<!-- A text field with defaults appropriate for English: it
tokenizes with StandardTokenizer, removes English stop words
(stopwords_en.txt), down cases, protects words from protwords.txt, and
@@ -557,6 +570,11 @@
via copyField further on in this schema -->
<field name="text" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>
<!-- Store language detection result. Only parents of text documents have this -->
<field name="language" type="string" indexed="false" stored="true" required="false"/>
<field name="content_ja" type="text_ja" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>
<!-- catchall text field that indexes tokens both normally and in reverse for efficient
leading wildcard queries. -->
<!--<field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/>-->

View File

@@ -38,6 +38,7 @@ import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest.METHOD;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
@@ -346,6 +347,8 @@ class HighlightedText implements IndexedText {
String chunkID = "";
String highlightField = "";
try {
double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
loadPageInfo(); //inits once
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
@@ -359,6 +362,22 @@ class HighlightedText implements IndexedText {
highlightField = LuceneQuery.HIGHLIGHT_FIELD;
if (isLiteral) {
if (2.2 <= indexSchemaVersion) {
//if the query is literal try to get solr to do the highlighting
final String highlightQuery = keywords.stream().map(s ->
LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
.collect(Collectors.joining(" OR "));
q.setQuery(highlightQuery);
for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
q.addField(field.toString());
q.addHighlightField(field.toString());
}
q.addField(Server.Schema.LANGUAGE.toString());
// in case of single term literal query there is only 1 term
LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
q.addFilterQuery(filterQuery);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
} else {
//if the query is literal try to get solr to do the highlighting
final String highlightQuery = keywords.stream()
.map(HighlightedText::constructEscapedSolrQuery)
@@ -369,12 +388,20 @@ class HighlightedText implements IndexedText {
q.addFilterQuery(filterQuery);
q.addHighlightField(highlightField);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
}
//tune the highlighter
if (shouldUseOriginalHighlighter(contentIdStr)) {
// use original highlighter
q.setParam("hl.useFastVectorHighlighter", "off");
q.setParam("hl.simple.pre", HIGHLIGHT_PRE);
q.setParam("hl.simple.post", HIGHLIGHT_POST);
} else {
q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
}
//docs say this makes sense for the original Highlighter only, but not really
q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
@@ -405,6 +432,33 @@ class HighlightedText implements IndexedText {
if (responseHighlightID == null) {
highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
} else {
SolrDocument document = response.getResults().get(0);
Object language = document.getFieldValue(Server.Schema.LANGUAGE.toString());
if (2.2 <= indexSchemaVersion && language != null) {
List<String> contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
if (contentHighlights == null) {
highlightedContent = "";
} else {
int hitCountInMiniChunk = LanguageSpecificContentQueryHelper.queryChunkTermfreq(keywords, MiniChunkHelper.getChunkIdString(contentIdStr));
String s = contentHighlights.get(0).trim();
// If there is a mini-chunk, trim the content so that highlighted text in the mini-chunk is not shown.
if (0 < hitCountInMiniChunk) {
int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
s,
HIGHLIGHT_PRE,
// trim after the last hit in chunk
hitCountInChunk - hitCountInMiniChunk);
if (idx != -1) {
highlightedContent = s.substring(0, idx);
} else {
highlightedContent = s;
}
} else {
highlightedContent = s;
}
}
} else {
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
if (contentHighlights == null) {
@@ -415,6 +469,7 @@ class HighlightedText implements IndexedText {
}
}
}
}
highlightedContent = insertAnchors(highlightedContent);
return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
@@ -551,4 +606,37 @@ class HighlightedText implements IndexedText {
return buf.toString();
}
/**
* Return true if we should use the original highlighter instead of FastVectorHighlighter.
*
* In the case of Japanese text and a phrase query, FastVectorHighlighter does not work well.
*
* Note about highlighters:
* If the query is "雨が降る" (phrase query), Solr divides it into 雨 and 降る. が is a stop word here.
* It seems that the FastVector highlighter does not produce any snippet when there is a stop word between terms.
* On the other hand, the original highlighter produces multiple matches, for example:
* > <em>雨</em>が<em>降っ</em>ています
* The Unified highlighter (from Solr 6.4) handles the case as expected:
* > <em>雨が降っ</em>ています
*/
private boolean shouldUseOriginalHighlighter(String contentID) throws NoOpenCoreException, KeywordSearchModuleException {
final SolrQuery q = new SolrQuery();
q.setQuery("*:*");
q.addFilterQuery(Server.Schema.ID.toString() + ":" + contentID);
q.setFields(Server.Schema.LANGUAGE.toString());
QueryResponse response = solrServer.query(q, METHOD.POST);
SolrDocumentList solrDocuments = response.getResults();
if (!solrDocuments.isEmpty()) {
SolrDocument solrDocument = solrDocuments.get(0);
if (solrDocument != null) {
Object languageField = solrDocument.getFieldValue(Server.Schema.LANGUAGE.toString());
if (languageField != null) {
return languageField.equals("ja");
}
}
}
return false;
}
}

View File

@@ -39,7 +39,7 @@ class IndexFinder {
private static final String KWS_DATA_FOLDER_NAME = "data";
private static final String INDEX_FOLDER_NAME = "index";
private static final String CURRENT_SOLR_VERSION = "4";
private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.1";
private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.2";
static String getCurrentSolrVersion() {
return CURRENT_SOLR_VERSION;

View File

@@ -20,8 +20,10 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.io.BufferedReader;
import java.io.Reader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.client.solrj.SolrServerException;
@@ -59,6 +61,8 @@ class Ingester {
private final Server solrServer = KeywordSearch.getServer();
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
private static Ingester instance;
private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
= new LanguageSpecificContentIndexingHelper();
private Ingester() {
}
@@ -93,7 +97,7 @@
* file, but the Solr server is probably fine.
*/
void indexMetaDataOnly(AbstractFile file) throws IngesterException {
indexChunk("", file.getName().toLowerCase(), getContentFields(file));
indexChunk("", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
}
/**
@@ -107,7 +111,7 @@
* artifact, but the Solr server is probably fine.
*/
void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
indexChunk("", sourceName, getContentFields(artifact));
indexChunk("", sourceName, new HashMap<>(getContentFields(artifact)));
}
/**
@@ -143,21 +147,30 @@
< T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
int numChunks = 0; //unknown until chunking is done
Map<String, String> fields = getContentFields(source);
Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
//Get a reader for the content of the given source
try (BufferedReader reader = new BufferedReader(sourceReader)) {
Chunker chunker = new Chunker(reader);
for (Chunk chunk : chunker) {
while (chunker.hasNext()) {
if (context != null && context.fileIngestIsCancelled()) {
logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
return false;
}
Chunk chunk = chunker.next();
Map<String, Object> fields = new HashMap<>(contentFields);
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
fields.put(Server.Schema.ID.toString(), chunkId);
fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
Optional<Language> language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk);
language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
try {
//add the chunk text to Solr index
indexChunk(chunk.toString(), sourceName, fields);
// add mini chunk when there's a language specific field
if (chunker.hasNext() && language.isPresent()) {
languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
}
numChunks++;
} catch (Ingester.IngesterException ingEx) {
logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
@@ -171,12 +184,13 @@
return false;
}
} catch (Exception ex) {
logger.log(Level.WARNING, "Unexpected error while indexing content from " + sourceID + ": " + sourceName, ex);//NON-NLS
logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false;
} finally {
if (context != null && context.fileIngestIsCancelled()) {
return false;
} else {
Map<String, Object> fields = new HashMap<>(contentFields);
//after all chunks, index just the meta data, including the numChunks, of the parent file
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
//reset id field to base document id
@@ -202,7 +216,7 @@
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
private void indexChunk(String chunk, String sourceName, Map<String, Object> fields) throws IngesterException {
if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
//JMTODO: actually, if we couldn't get the image id it is set to -1,
// but does this really mean we don't want to index it?
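A note on the map changes in this file: the base content fields are now built once and wrapped with Collections.unmodifiableMap(), and each chunk gets its own mutable HashMap copy, because the ID, CHUNK_SIZE, and any language-specific entries differ per chunk. A condensed sketch of the pattern, using the names from the diff:

Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source)); // shared, read-only base
while (chunker.hasNext()) {
    Chunk chunk = chunker.next();
    Map<String, Object> fields = new HashMap<>(contentFields); // fresh per-chunk copy
    fields.put(Server.Schema.ID.toString(), Server.getChunkIdString(sourceID, numChunks + 1));
    // detected-language fields are added here only when detection succeeds for this chunk
}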

View File

@@ -134,6 +134,7 @@ class LuceneQuery implements KeywordSearchQuery {
String cursorMark = CursorMarkParams.CURSOR_MARK_START;
boolean allResultsProcessed = false;
List<KeywordHit> matches = new ArrayList<>();
LanguageSpecificContentQueryHelper.QueryResults languageSpecificQueryResults = new LanguageSpecificContentQueryHelper.QueryResults();
while (!allResultsProcessed) {
solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
QueryResponse response = solrServer.query(solrQuery, SolrRequest.METHOD.POST);
@@ -141,7 +142,18 @@
// objectId_chunk -> "text" -> List of previews
Map<String, Map<String, List<String>>> highlightResponse = response.getHighlighting();
if (2.2 <= indexSchemaVersion) {
languageSpecificQueryResults.highlighting.putAll(response.getHighlighting());
}
for (SolrDocument resultDoc : resultList) {
if (2.2 <= indexSchemaVersion) {
Object language = resultDoc.getFieldValue(Server.Schema.LANGUAGE.toString());
if (language != null) {
LanguageSpecificContentQueryHelper.updateQueryResults(languageSpecificQueryResults, resultDoc);
}
}
try {
/*
* for each result doc, check that the first occurrence of
@@ -153,6 +165,11 @@
final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
final Collection<Object> content = resultDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
// if the document has a language, hits should come from the language-specific content fields, so skip it here.
if (resultDoc.containsKey(Server.Schema.LANGUAGE.toString())) {
continue;
}
if (indexSchemaVersion < 2.0) {
//old schema versions don't support chunk_size or the content_str fields, so just accept hits
matches.add(createKeywordtHit(highlightResponse, docId));
@@ -179,9 +196,16 @@
cursorMark = nextCursorMark;
}
List<KeywordHit> mergedMatches;
if (2.2 <= indexSchemaVersion) {
mergedMatches = LanguageSpecificContentQueryHelper.mergeKeywordHits(matches, originalKeyword, languageSpecificQueryResults);
} else {
mergedMatches = matches;
}
QueryResults results = new QueryResults(this);
//in case of single term literal query there is only 1 term
results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), matches);
results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), mergedMatches);
return results;
}
@@ -262,7 +286,9 @@
*
* @return
*/
private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
private SolrQuery createAndConfigureSolrQuery(boolean snippets) throws NoOpenCoreException, KeywordSearchModuleException {
double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
// Wrap the query string in quotes if this is a literal search term.
@@ -273,8 +299,12 @@
if (field != null) {
//use the optional field
queryStr = field + ":" + queryStr;
}
q.setQuery(queryStr);
} else if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
q.setQuery(LanguageSpecificContentQueryHelper.expandQueryString(queryStr));
} else {
q.setQuery(queryStr);
}
q.setRows(MAX_RESULTS_PER_CURSOR_MARK);
// Setting the sort order is necessary for cursor based paging to work.
q.setSort(SolrQuery.SortClause.asc(Server.Schema.ID.toString()));
@@ -283,6 +313,11 @@
Server.Schema.CHUNK_SIZE.toString(),
Server.Schema.CONTENT_STR.toString());
if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
q.addField(Server.Schema.LANGUAGE.toString());
LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywordStringEscaped);
}
for (KeywordQueryFilter filter : filters) {
q.addFilterQuery(filter.toString());
}
@@ -300,8 +335,16 @@
*
* @param q The SolrQuery to configure.
*/
private static void configurwQueryForHighlighting(SolrQuery q) {
private static void configurwQueryForHighlighting(SolrQuery q) throws NoOpenCoreException {
double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
if (2.2 <= indexSchemaVersion) {
for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
q.addHighlightField(field.toString());
}
} else {
q.addHighlightField(HIGHLIGHT_FIELD);
}
q.setHighlightSnippets(1);
q.setHighlightFragsize(SNIPPET_LENGTH);
@@ -404,7 +447,13 @@
if (responseHighlightID == null) {
return "";
}
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
List<String> contentHighlights;
if (2.2 <= indexSchemaVersion) {
contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
} else {
contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
}
if (contentHighlights == null) {
return "";
} else {
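One detail of the schema-version gate that recurs throughout this commit (here and in HighlightedText): NumberUtils.toDouble() returns 0.0 for null or unparseable input, so a missing or malformed schema version string silently selects the pre-2.2 code path rather than throwing. A condensed sketch:

// NumberUtils.toDouble() defaults to 0.0 on bad input, so the check below
// falls back to the legacy path instead of failing.
double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
if (2.2 <= indexSchemaVersion) {
    // language-aware path: content_ja, language, and termfreq are available
} else {
    // legacy path: highlight on the generic content field only
}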

View File

@@ -130,6 +130,18 @@ public class Server {
return "content_ws"; //NON-NLS
}
},
CONTENT_JA {
@Override
public String toString() {
return "content_ja"; //NON-NLS
}
},
LANGUAGE {
@Override
public String toString() {
return "language"; //NON-NLS
}
},
FILE_NAME {
@Override
public String toString() {
@@ -175,6 +187,17 @@
public String toString() {
return "chunk_size"; //NON-NLS
}
},
/**
* termfreq is a Solr function that returns the number of times a term appears in a field.
* It is not an actual field defined in schema.xml, but its value can be retrieved from
* returned documents in the same way as a field.
*/
TERMFREQ {
@Override
public String toString() {
return "termfreq"; //NON-NLS
}
}
};
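To illustrate how a termfreq pseudo-field can be requested, a SolrJ sketch follows. The exact function query built by LanguageSpecificContentQueryHelper is not shown in this diff, so the alias and term below are assumptions; aliasing the function as "termfreq" would make documents expose it under the key the enum constant above returns:

SolrQuery q = new SolrQuery("content_ja:\"雨\"");
// Alias the function result so each document exposes it under the "termfreq" key (assumed alias).
q.addField("termfreq:termfreq(content_ja,'雨')");
QueryResponse response = solrServer.query(q, SolrRequest.METHOD.POST);
for (SolrDocument doc : response.getResults()) {
    // SolrJ surfaces function results as Float, matching the cast in HighlightedText.
    float tf = (Float) doc.getFieldValue("termfreq");
}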