Mirror of https://github.com/overcuriousity/autopsy-flatpak.git (synced 2025-07-06 21:00:22 +00:00)
Fixing up files that were broken by an assortment of merge and merge reverts.

commit 680ad710e1
parent 5864303f21
@@ -21,6 +21,7 @@
<dependency conf="autopsy->*" org="org.apache.solr" name="solr-solrj" rev="4.9.1"/>
<dependency conf="autopsy->*" org="commons-lang" name="commons-lang" rev="2.4"/>
<dependency conf="autopsy->*" org="commons-validator" name="commons-validator" rev="1.5.1"/>
+<dependency conf="autopsy->*" org="com.optimaize.languagedetector" name="language-detector" rev="0.6"/>
<!-- Exclude the version of cxf-rt-rs-client from Tika 1.20, one of its depedencies breaks Ivy -->
<dependency conf="autopsy->*" org="org.apache.tika" name="tika-parsers" rev="1.20">
<exclude module="cxf-rt-rs-client"/>
@@ -29,6 +29,7 @@ file.reference.jericho-html-3.3.jar=release/modules/ext/jericho-html-3.3.jar
file.reference.joda-time-2.2.jar=release/modules/ext/joda-time-2.2.jar
file.reference.json-simple-1.1.1.jar=release/modules/ext/json-simple-1.1.1.jar
file.reference.juniversalchardet-1.0.3.jar=release/modules/ext/juniversalchardet-1.0.3.jar
+file.reference.language-detector-0.6.jar=release\\modules\\ext\\language-detector-0.6.jar
file.reference.libsvm-3.1.jar=release/modules/ext/libsvm-3.1.jar
file.reference.log4j-1.2.17.jar=release/modules/ext/log4j-1.2.17.jar
file.reference.lucene-core-4.0.0.jar=release/modules/ext/lucene-core-4.0.0.jar
@@ -230,10 +230,6 @@
<package>org.codehaus.stax2.validation</package>
<package>org.noggit</package>
<package>org.sleuthkit.autopsy.keywordsearch</package>
-<package>org.slf4j</package>
-<package>org.slf4j.event</package>
-<package>org.slf4j.helpers</package>
-<package>org.slf4j.spi</package>
</public-packages>
<class-path-extension>
<runtime-relative-path>ext/commons-digester-1.8.1.jar</runtime-relative-path>
@@ -283,6 +279,10 @@
<runtime-relative-path>ext/guava-17.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/guava-17.0.jar</binary-origin>
</class-path-extension>
+<class-path-extension>
+<runtime-relative-path>ext/language-detector-0.6.jar</runtime-relative-path>
+<binary-origin>release\modules\ext\language-detector-0.6.jar</binary-origin>
+</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/joda-time-2.2.jar</runtime-relative-path>
<binary-origin>release/modules/ext/joda-time-2.2.jar</binary-origin>
@@ -45,7 +45,7 @@
that avoids logging every request
-->

-<schema name="Autopsy Keyword Search" version="2.1">
+<schema name="Autopsy Keyword Search" version="2.2">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
Applications should change this to reflect the nature of the search collection.
version="1.4" is Solr's version number for the schema syntax and semantics. It should
@@ -62,6 +62,7 @@
2.0 added chunk_size field
2.1 to facilitate case insensitive regex search,no longer copying content into content_str.
content_str will be populated with lowercase content by Autopsy.
+2.2 added text_ja type, content_ja and language fields to support Japanese text search
-->

<types>
@@ -243,6 +244,18 @@
</analyzer>
</fieldType>

+<fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
+<analyzer>
+<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
+<filter class="solr.JapaneseBaseFormFilterFactory"/>
+<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" />
+<filter class="solr.CJKWidthFilterFactory"/>
+<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" />
+<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
+<filter class="solr.LowerCaseFilterFactory"/>
+</analyzer>
+</fieldType>
+
<!-- A text field with defaults appropriate for English: it
tokenizes with StandardTokenizer, removes English stop words
(stopwords_en.txt), down cases, protects words from protwords.txt, and
@@ -557,6 +570,11 @@
via copyField further on in this schema -->
<field name="text" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>

+<!-- Store language detection result. Only parents of text documents have this -->
+<field name="language" type="string" indexed="false" stored="true" required="false"/>
+
+<field name="content_ja" type="text_ja" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>
+
<!-- catchall text field that indexes tokens both normally and in reverse for efficient
leading wildcard queries. -->
<!--<field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/>-->
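For illustration only (not part of the commit): with the 2.2 schema above, a literal Japanese search can target the new content_ja and language fields directly. A minimal SolrJ sketch; the field names come from this hunk, the no-fragment highlight setting mirrors the highlighting changes later in the diff, and the class and method names are invented for the example.

    import org.apache.solr.client.solrj.SolrQuery;

    class JapaneseFieldQuerySketch {
        // Illustrative only: query the language-specific fields added in schema 2.2.
        // escapedPhrase is assumed to be already escaped for Solr query syntax.
        static SolrQuery japanesePhraseQuery(String escapedPhrase) {
            SolrQuery q = new SolrQuery();
            q.setQuery("content_ja:\"" + escapedPhrase + "\""); // analyzed by the text_ja chain defined above
            q.addField("language");                              // stored language tag on parent documents
            q.addHighlightField("content_ja");
            q.setHighlightFragsize(0);                           // keep the whole highlighted chunk, as this commit does
            return q;
        }
    }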
@@ -38,6 +38,7 @@ import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest.METHOD;
import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
@@ -346,6 +347,8 @@ class HighlightedText implements IndexedText {
String chunkID = "";
String highlightField = "";
try {
+double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
+
loadPageInfo(); //inits once
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
@@ -359,22 +362,46 @@ class HighlightedText implements IndexedText {

highlightField = LuceneQuery.HIGHLIGHT_FIELD;
if (isLiteral) {
-//if the query is literal try to get solr to do the highlighting
-final String highlightQuery = keywords.stream()
-.map(HighlightedText::constructEscapedSolrQuery)
-.collect(Collectors.joining(" "));
+if (2.2 <= indexSchemaVersion) {
+//if the query is literal try to get solr to do the highlighting
+final String highlightQuery = keywords.stream().map(s ->
+LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
+.collect(Collectors.joining(" OR "));
+q.setQuery(highlightQuery);
+for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
+q.addField(field.toString());
+q.addHighlightField(field.toString());
+}
+q.addField(Server.Schema.LANGUAGE.toString());
+// in case of single term literal query there is only 1 term
+LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
+q.addFilterQuery(filterQuery);
+q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
+} else {
+//if the query is literal try to get solr to do the highlighting
+final String highlightQuery = keywords.stream()
+.map(HighlightedText::constructEscapedSolrQuery)
+.collect(Collectors.joining(" "));

q.setQuery(highlightQuery);
q.addField(highlightField);
q.addFilterQuery(filterQuery);
q.addHighlightField(highlightField);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
+}

//tune the highlighter
-q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
-q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
-q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
-q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
+if (shouldUseOriginalHighlighter(contentIdStr)) {
+// use original highlighter
+q.setParam("hl.useFastVectorHighlighter", "off");
+q.setParam("hl.simple.pre", HIGHLIGHT_PRE);
+q.setParam("hl.simple.post", HIGHLIGHT_POST);
+} else {
+q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
+q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
+q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
+q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
+}

//docs says makes sense for the original Highlighter only, but not really
q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
@@ -406,12 +433,40 @@
if (responseHighlightID == null) {
highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
} else {
-List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
-if (contentHighlights == null) {
-highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
+SolrDocument document = response.getResults().get(0);
+Object language = document.getFieldValue(Server.Schema.LANGUAGE.toString());
+if (2.2 <= indexSchemaVersion && language != null) {
+List<String> contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
+if (contentHighlights == null) {
+highlightedContent = "";
+} else {
+int hitCountInMiniChunk = LanguageSpecificContentQueryHelper.queryChunkTermfreq(keywords, MiniChunkHelper.getChunkIdString(contentIdStr));
+String s = contentHighlights.get(0).trim();
+// If there is a mini-chunk, trim the content not to show highlighted text in it.
+if (0 < hitCountInMiniChunk) {
+int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
+int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
+s,
+HIGHLIGHT_PRE,
+// trim after the last hit in chunk
+hitCountInChunk - hitCountInMiniChunk);
+if (idx != -1) {
+highlightedContent = s.substring(0, idx);
+} else {
+highlightedContent = s;
+}
+} else {
+highlightedContent = s;
+}
+}
} else {
-// extracted content (minus highlight tags) is HTML-escaped
-highlightedContent = contentHighlights.get(0).trim();
+List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
+if (contentHighlights == null) {
+highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
+} else {
+// extracted content (minus highlight tags) is HTML-escaped
+highlightedContent = contentHighlights.get(0).trim();
+}
}
}
}
}
@@ -551,4 +606,37 @@
return buf.toString();
}

+/**
+* Return true if we should use original highlighter instead of FastVectorHighlighter.
+*
+* In the case Japanese text and phrase query, FastVectorHighlighter does not work well.
+*
+* Note about highlighters:
+* If the query is "雨が降る" (phrase query), Solr divides it into 雨 and 降る. が is a stop word here.
+* It seems that FastVector highlighter does not produce any snippet when there is a stop word between terms.
+* On the other hand, original highlighter produces multiple matches, for example:
+* > <em>雨</em>が<em>降っ</em>ています
+* Unified highlighter (from Solr 6.4) handles the case as expected:
+* > <em>雨が降っ</em>ています。
+*/
+private boolean shouldUseOriginalHighlighter(String contentID) throws NoOpenCoreException, KeywordSearchModuleException {
+final SolrQuery q = new SolrQuery();
+q.setQuery("*:*");
+q.addFilterQuery(Server.Schema.ID.toString() + ":" + contentID);
+q.setFields(Server.Schema.LANGUAGE.toString());
+
+QueryResponse response = solrServer.query(q, METHOD.POST);
+SolrDocumentList solrDocuments = response.getResults();
+
+if (!solrDocuments.isEmpty()) {
+SolrDocument solrDocument = solrDocuments.get(0);
+if (solrDocument != null) {
+Object languageField = solrDocument.getFieldValue(Server.Schema.LANGUAGE.toString());
+if (languageField != null) {
+return languageField.equals("ja");
+}
+}
+}
+return false;
+}
}
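For illustration only (not part of the commit): the highlighter selection above reduces to two parameter sets on the same SolrQuery, taken directly from the HighlightedText changes earlier in this diff. A minimal sketch; the pre/post tags are passed in rather than read from the class constants, and the class and method names are invented.

    import org.apache.solr.client.solrj.SolrQuery;

    class HighlighterChoiceSketch {
        // Illustrative only: the original highlighter copes with Japanese phrase queries that
        // contain stop words between terms; FastVectorHighlighter scales better otherwise.
        static void configureHighlighting(SolrQuery q, boolean useOriginalHighlighter, String pre, String post) {
            if (useOriginalHighlighter) {
                q.setParam("hl.useFastVectorHighlighter", "off");
                q.setParam("hl.simple.pre", pre);
                q.setParam("hl.simple.post", post);
            } else {
                q.setParam("hl.useFastVectorHighlighter", "on");
                q.setParam("hl.tag.pre", pre);
                q.setParam("hl.tag.post", post);
                q.setParam("hl.fragListBuilder", "single");
            }
        }
    }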
@@ -39,7 +39,7 @@ class IndexFinder {
private static final String KWS_DATA_FOLDER_NAME = "data";
private static final String INDEX_FOLDER_NAME = "index";
private static final String CURRENT_SOLR_VERSION = "4";
-private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.1";
+private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.2";

static String getCurrentSolrVersion() {
return CURRENT_SOLR_VERSION;
@@ -20,8 +20,10 @@ package org.sleuthkit.autopsy.keywordsearch;

import java.io.BufferedReader;
import java.io.Reader;
+import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
+import java.util.Optional;
import java.util.logging.Level;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.client.solrj.SolrServerException;
@@ -59,6 +61,8 @@ class Ingester {
private final Server solrServer = KeywordSearch.getServer();
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
private static Ingester instance;
+private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
+= new LanguageSpecificContentIndexingHelper();

private Ingester() {
}
@@ -93,7 +97,7 @@ class Ingester {
* file, but the Solr server is probably fine.
*/
void indexMetaDataOnly(AbstractFile file) throws IngesterException {
-indexChunk("", file.getName().toLowerCase(), getContentFields(file));
+indexChunk("", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
}

/**
@@ -107,7 +111,7 @@ class Ingester {
* artifact, but the Solr server is probably fine.
*/
void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
-indexChunk("", sourceName, getContentFields(artifact));
+indexChunk("", sourceName, new HashMap<>(getContentFields(artifact)));
}

/**
@@ -143,21 +147,30 @@ class Ingester {
< T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
int numChunks = 0; //unknown until chunking is done

-Map<String, String> fields = getContentFields(source);
+Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
//Get a reader for the content of the given source
try (BufferedReader reader = new BufferedReader(sourceReader)) {
Chunker chunker = new Chunker(reader);
-for (Chunk chunk : chunker) {
+while (chunker.hasNext()) {
if (context != null && context.fileIngestIsCancelled()) {
logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
return false;
}
+
+Chunk chunk = chunker.next();
+Map<String, Object> fields = new HashMap<>(contentFields);
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
fields.put(Server.Schema.ID.toString(), chunkId);
fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
+Optional<Language> language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk);
+language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
try {
//add the chunk text to Solr index
indexChunk(chunk.toString(), sourceName, fields);
+// add mini chunk when there's a language specific field
+if (chunker.hasNext() && language.isPresent()) {
+languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
+}
numChunks++;
} catch (Ingester.IngesterException ingEx) {
logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
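For illustration only (not part of the commit): the switch from a for-each loop to an explicit hasNext()/next() loop in indexText() is what lets the loop body ask whether another chunk follows before deciding to index a mini chunk. A minimal sketch of that pattern with generic types; the indexing call is a stand-in.

    import java.util.Iterator;

    class ChunkLoopSketch {
        // Illustrative only: an explicit iterator exposes "is there another chunk?",
        // which a for-each loop cannot, and that check drives the mini-chunk decision above.
        static void indexAll(Iterator<String> chunker) {
            while (chunker.hasNext()) {
                String chunk = chunker.next();
                boolean moreChunksFollow = chunker.hasNext();
                index(chunk, moreChunksFollow);
            }
        }

        private static void index(String chunk, boolean moreChunksFollow) {
            // stand-in for the Solr indexing and mini-chunk handling in Ingester.indexText(...)
        }
    }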
@@ -171,12 +184,13 @@ class Ingester {
return false;
}
} catch (Exception ex) {
-logger.log(Level.WARNING, "Unexpected error while indexing content from " + sourceID + ": " + sourceName, ex);//NON-NLS
+logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false;
} finally {
if (context != null && context.fileIngestIsCancelled()) {
return false;
} else {
+Map<String, Object> fields = new HashMap<>(contentFields);
//after all chunks, index just the meta data, including the numChunks, of the parent file
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
//reset id field to base document id
@@ -202,7 +216,7 @@ class Ingester {
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
-private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
+private void indexChunk(String chunk, String sourceName, Map<String, Object> fields) throws IngesterException {
if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
//JMTODO: actually if the we couldn't get the image id it is set to -1,
// but does this really mean we don't want to index it?
@@ -134,6 +134,7 @@ class LuceneQuery implements KeywordSearchQuery {
String cursorMark = CursorMarkParams.CURSOR_MARK_START;
boolean allResultsProcessed = false;
List<KeywordHit> matches = new ArrayList<>();
+LanguageSpecificContentQueryHelper.QueryResults languageSpecificQueryResults = new LanguageSpecificContentQueryHelper.QueryResults();
while (!allResultsProcessed) {
solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
QueryResponse response = solrServer.query(solrQuery, SolrRequest.METHOD.POST);
@@ -141,7 +142,18 @@ class LuceneQuery implements KeywordSearchQuery {
// objectId_chunk -> "text" -> List of previews
Map<String, Map<String, List<String>>> highlightResponse = response.getHighlighting();
+
+if (2.2 <= indexSchemaVersion) {
+languageSpecificQueryResults.highlighting.putAll(response.getHighlighting());
+}

for (SolrDocument resultDoc : resultList) {
+if (2.2 <= indexSchemaVersion) {
+Object language = resultDoc.getFieldValue(Server.Schema.LANGUAGE.toString());
+if (language != null) {
+LanguageSpecificContentQueryHelper.updateQueryResults(languageSpecificQueryResults, resultDoc);
+}
+}
+
try {
/*
* for each result doc, check that the first occurence of
@@ -153,6 +165,11 @@ class LuceneQuery implements KeywordSearchQuery {
final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
final Collection<Object> content = resultDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
+
+// if the document has language, it should be hit in language specific content fields. So skip here.
+if (resultDoc.containsKey(Server.Schema.LANGUAGE.toString())) {
+continue;
+}

if (indexSchemaVersion < 2.0) {
//old schema versions don't support chunk_size or the content_str fields, so just accept hits
matches.add(createKeywordtHit(highlightResponse, docId));
@@ -179,9 +196,16 @@ class LuceneQuery implements KeywordSearchQuery {
cursorMark = nextCursorMark;
}

+List<KeywordHit> mergedMatches;
+if (2.2 <= indexSchemaVersion) {
+mergedMatches = LanguageSpecificContentQueryHelper.mergeKeywordHits(matches, originalKeyword, languageSpecificQueryResults);
+} else {
+mergedMatches = matches;
+}
+
QueryResults results = new QueryResults(this);
//in case of single term literal query there is only 1 term
-results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), matches);
+results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), mergedMatches);

return results;
}
@@ -262,19 +286,25 @@ class LuceneQuery implements KeywordSearchQuery {
*
* @return
*/
-private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
+private SolrQuery createAndConfigureSolrQuery(boolean snippets) throws NoOpenCoreException, KeywordSearchModuleException {
+double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
+
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
// Wrap the query string in quotes if this is a literal search term.
String queryStr = originalKeyword.searchTermIsLiteral()
? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped;

// Run the query against an optional alternative field.
if (field != null) {
//use the optional field
queryStr = field + ":" + queryStr;
+q.setQuery(queryStr);
+} else if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
+q.setQuery(LanguageSpecificContentQueryHelper.expandQueryString(queryStr));
+} else {
+q.setQuery(queryStr);
}
-q.setQuery(queryStr);
q.setRows(MAX_RESULTS_PER_CURSOR_MARK);
// Setting the sort order is necessary for cursor based paging to work.
q.setSort(SolrQuery.SortClause.asc(Server.Schema.ID.toString()));
@@ -283,6 +313,11 @@ class LuceneQuery implements KeywordSearchQuery {
Server.Schema.CHUNK_SIZE.toString(),
Server.Schema.CONTENT_STR.toString());

+if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
+q.addField(Server.Schema.LANGUAGE.toString());
+LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywordStringEscaped);
+}
+
for (KeywordQueryFilter filter : filters) {
q.addFilterQuery(filter.toString());
}
@@ -300,8 +335,16 @@ class LuceneQuery implements KeywordSearchQuery {
*
* @param q The SolrQuery to configure.
*/
-private static void configurwQueryForHighlighting(SolrQuery q) {
-q.addHighlightField(HIGHLIGHT_FIELD);
+private static void configurwQueryForHighlighting(SolrQuery q) throws NoOpenCoreException {
+double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
+if (2.2 <= indexSchemaVersion) {
+for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
+q.addHighlightField(field.toString());
+}
+} else {
+q.addHighlightField(HIGHLIGHT_FIELD);
+}
+
q.setHighlightSnippets(1);
q.setHighlightFragsize(SNIPPET_LENGTH);

@@ -404,7 +447,13 @@ class LuceneQuery implements KeywordSearchQuery {
if (responseHighlightID == null) {
return "";
}
-List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
+double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
+List<String> contentHighlights;
+if (2.2 <= indexSchemaVersion) {
+contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
+} else {
+contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
+}
if (contentHighlights == null) {
return "";
} else {
@@ -130,6 +130,18 @@ public class Server {
return "content_ws"; //NON-NLS
}
},
+CONTENT_JA {
+@Override
+public String toString() {
+return "content_ja"; //NON-NLS
+}
+},
+LANGUAGE {
+@Override
+public String toString() {
+return "language"; //NON-NLS
+}
+},
FILE_NAME {
@Override
public String toString() {
@@ -175,7 +187,18 @@ public class Server {
public String toString() {
return "chunk_size"; //NON-NLS
}
-}
+},
+/**
+* termfreq is a function which returns the number of times the term appears.
+* This is not an actual field defined in schema.xml, but can be gotten from returned documents
+* in the same way as fields.
+*/
+TERMFREQ {
+@Override
+public String toString() {
+return "termfreq"; //NON-NLS
+}
+}
};

public static final String HL_ANALYZE_CHARS_UNLIMITED = "500000"; //max 1MB in a chunk. use -1 for unlimited, but -1 option may not be supported (not documented)
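For illustration only (not part of the commit): TERMFREQ above is not a stored field but Solr's termfreq() function requested as a pseudo-field, so each returned document carries the term's hit count, which is how the commit reads Server.Schema.TERMFREQ off result documents. A minimal sketch; the field and term arguments are placeholders, term escaping is omitted, and this is presumably the kind of field the project's configureTermfreqQuery(...) helper adds.

    import org.apache.solr.client.solrj.SolrQuery;

    class TermfreqFieldSketch {
        // Illustrative only: ask Solr to return termfreq(field, 'term') alongside normal fields.
        static SolrQuery withTermfreq(SolrQuery q, String field, String term) {
            q.addField("termfreq(" + field + ",'" + term + "')");
            return q;
        }
    }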