Mirror of https://github.com/overcuriousity/autopsy-flatpak.git, synced 2025-07-06 21:00:22 +00:00.
Fixing up files that were broken by an assortment of merge and merge reverts.
parent 5864303f21
commit 680ad710e1
@@ -21,6 +21,7 @@
<dependency conf="autopsy->*" org="org.apache.solr" name="solr-solrj" rev="4.9.1"/>
<dependency conf="autopsy->*" org="commons-lang" name="commons-lang" rev="2.4"/>
<dependency conf="autopsy->*" org="commons-validator" name="commons-validator" rev="1.5.1"/>
<dependency conf="autopsy->*" org="com.optimaize.languagedetector" name="language-detector" rev="0.6"/>
<!-- Exclude the version of cxf-rt-rs-client from Tika 1.20, one of its depedencies breaks Ivy -->
<dependency conf="autopsy->*" org="org.apache.tika" name="tika-parsers" rev="1.20">
<exclude module="cxf-rt-rs-client"/>
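The new dependency above brings in the Optimaize language-detector 0.6 library that the language-aware indexing below builds on. As a point of reference, this is roughly how that library is driven on its own; Autopsy wraps it behind LanguageSpecificContentIndexingHelper, so everything in this sketch (class name, sample text) is illustrative rather than taken from this commit:

```java
import com.google.common.base.Optional;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObjectFactory;

public class LanguageDetectionSketch {
    public static void main(String[] args) throws java.io.IOException {
        // Build a detector from the n-gram profiles bundled with the library.
        LanguageDetector detector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                .withProfiles(new LanguageProfileReader().readAllBuiltIn())
                .build();

        // The large-text factory suits document chunks better than short queries.
        TextObjectFactory textFactory = CommonTextObjectFactories.forDetectingOnLargeText();

        // detect() returns Guava's Optional; absent means no confident detection.
        Optional<LdLocale> locale = detector.detect(textFactory.forText("今日は雨が降っています。"));
        System.out.println(locale.isPresent() ? locale.get().getLanguage() : "unknown");
    }
}
```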
@@ -29,6 +29,7 @@ file.reference.jericho-html-3.3.jar=release/modules/ext/jericho-html-3.3.jar
file.reference.joda-time-2.2.jar=release/modules/ext/joda-time-2.2.jar
file.reference.json-simple-1.1.1.jar=release/modules/ext/json-simple-1.1.1.jar
file.reference.juniversalchardet-1.0.3.jar=release/modules/ext/juniversalchardet-1.0.3.jar
file.reference.language-detector-0.6.jar=release\\modules\\ext\\language-detector-0.6.jar
file.reference.libsvm-3.1.jar=release/modules/ext/libsvm-3.1.jar
file.reference.log4j-1.2.17.jar=release/modules/ext/log4j-1.2.17.jar
file.reference.lucene-core-4.0.0.jar=release/modules/ext/lucene-core-4.0.0.jar
@@ -230,10 +230,6 @@
<package>org.codehaus.stax2.validation</package>
<package>org.noggit</package>
<package>org.sleuthkit.autopsy.keywordsearch</package>
<package>org.slf4j</package>
<package>org.slf4j.event</package>
<package>org.slf4j.helpers</package>
<package>org.slf4j.spi</package>
</public-packages>
<class-path-extension>
<runtime-relative-path>ext/commons-digester-1.8.1.jar</runtime-relative-path>
@@ -283,6 +279,10 @@
<runtime-relative-path>ext/guava-17.0.jar</runtime-relative-path>
<binary-origin>release/modules/ext/guava-17.0.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/language-detector-0.6.jar</runtime-relative-path>
<binary-origin>release\modules\ext\language-detector-0.6.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/joda-time-2.2.jar</runtime-relative-path>
<binary-origin>release/modules/ext/joda-time-2.2.jar</binary-origin>
@@ -45,7 +45,7 @@
that avoids logging every request
-->

<schema name="Autopsy Keyword Search" version="2.1">
<schema name="Autopsy Keyword Search" version="2.2">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
Applications should change this to reflect the nature of the search collection.
version="1.4" is Solr's version number for the schema syntax and semantics. It should
@@ -62,6 +62,7 @@
2.0 added chunk_size field
2.1 to facilitate case insensitive regex search,no longer copying content into content_str.
content_str will be populated with lowercase content by Autopsy.
2.2 added text_ja type, content_ja and language fields to support Japanese text search
-->

<types>
@@ -243,6 +244,18 @@
</analyzer>
</fieldType>

<fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
<analyzer>
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
<filter class="solr.JapaneseBaseFormFilterFactory"/>
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" />
<filter class="solr.CJKWidthFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" />
<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>

<!-- A text field with defaults appropriate for English: it
tokenizes with StandardTokenizer, removes English stop words
(stopwords_en.txt), down cases, protects words from protwords.txt, and
@@ -557,6 +570,11 @@
via copyField further on in this schema -->
<field name="text" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>

<!-- Store language detection result. Only parents of text documents have this -->
<field name="language" type="string" indexed="false" stored="true" required="false"/>

<field name="content_ja" type="text_ja" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>

<!-- catchall text field that indexes tokens both normally and in reverse for efficient
leading wildcard queries. -->
<!--<field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/>-->
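Taken together, the schema additions above give chunks an optional content_ja copy (analyzed by the text_ja chain) and documents a stored-only language field. A minimal SolrJ 4.x sketch of a document that carries those fields, matching the solr-solrj 4.9.1 dependency in this module; the core URL, id format, and sample text are illustrative, since Autopsy actually builds its documents through Ingester:

```java
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.SolrInputDocument;

public class JapaneseFieldsIndexingSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical core URL; Autopsy talks to its own managed Solr server instead.
        HttpSolrServer solr = new HttpSolrServer("http://localhost:8983/solr/keywordsearch_sketch");

        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", "12345_1");                        // illustrative chunk id
        doc.addField("language", "ja");                       // stored detection result
        doc.addField("content_ja", "今日は雨が降っています。");   // analyzed by text_ja
        doc.addField("text", "今日は雨が降っています。");         // regular catch-all field

        solr.add(doc);
        solr.commit();
        solr.shutdown();
    }
}
```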
@@ -38,6 +38,7 @@ import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest.METHOD;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
@@ -346,6 +347,8 @@ class HighlightedText implements IndexedText {
String chunkID = "";
String highlightField = "";
try {
double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());

loadPageInfo(); //inits once
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
@@ -359,22 +362,46 @@ class HighlightedText implements IndexedText {

highlightField = LuceneQuery.HIGHLIGHT_FIELD;
if (isLiteral) {
//if the query is literal try to get solr to do the highlighting
final String highlightQuery = keywords.stream()
.map(HighlightedText::constructEscapedSolrQuery)
.collect(Collectors.joining(" "));
if (2.2 <= indexSchemaVersion) {
//if the query is literal try to get solr to do the highlighting
final String highlightQuery = keywords.stream().map(s ->
LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
.collect(Collectors.joining(" OR "));
q.setQuery(highlightQuery);
for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
q.addField(field.toString());
q.addHighlightField(field.toString());
}
q.addField(Server.Schema.LANGUAGE.toString());
// in case of single term literal query there is only 1 term
LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
q.addFilterQuery(filterQuery);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
} else {
//if the query is literal try to get solr to do the highlighting
final String highlightQuery = keywords.stream()
.map(HighlightedText::constructEscapedSolrQuery)
.collect(Collectors.joining(" "));

q.setQuery(highlightQuery);
q.addField(highlightField);
q.addFilterQuery(filterQuery);
q.addHighlightField(highlightField);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
q.setQuery(highlightQuery);
q.addField(highlightField);
q.addFilterQuery(filterQuery);
q.addHighlightField(highlightField);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
}

//tune the highlighter
q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
if (shouldUseOriginalHighlighter(contentIdStr)) {
// use original highlighter
q.setParam("hl.useFastVectorHighlighter", "off");
q.setParam("hl.simple.pre", HIGHLIGHT_PRE);
q.setParam("hl.simple.post", HIGHLIGHT_POST);
} else {
q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
}

//docs says makes sense for the original Highlighter only, but not really
q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
@@ -406,12 +433,40 @@ class HighlightedText implements IndexedText {
if (responseHighlightID == null) {
highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
} else {
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
if (contentHighlights == null) {
highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
SolrDocument document = response.getResults().get(0);
Object language = document.getFieldValue(Server.Schema.LANGUAGE.toString());
if (2.2 <= indexSchemaVersion && language != null) {
List<String> contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
if (contentHighlights == null) {
highlightedContent = "";
} else {
int hitCountInMiniChunk = LanguageSpecificContentQueryHelper.queryChunkTermfreq(keywords, MiniChunkHelper.getChunkIdString(contentIdStr));
String s = contentHighlights.get(0).trim();
// If there is a mini-chunk, trim the content not to show highlighted text in it.
if (0 < hitCountInMiniChunk) {
int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
s,
HIGHLIGHT_PRE,
// trim after the last hit in chunk
hitCountInChunk - hitCountInMiniChunk);
if (idx != -1) {
highlightedContent = s.substring(0, idx);
} else {
highlightedContent = s;
}
} else {
highlightedContent = s;
}
}
} else {
// extracted content (minus highlight tags) is HTML-escaped
highlightedContent = contentHighlights.get(0).trim();
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
if (contentHighlights == null) {
highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
} else {
// extracted content (minus highlight tags) is HTML-escaped
highlightedContent = contentHighlights.get(0).trim();
}
}
}
}
@@ -551,4 +606,37 @@ class HighlightedText implements IndexedText {
return buf.toString();
}

/**
* Return true if we should use original highlighter instead of FastVectorHighlighter.
*
* In the case Japanese text and phrase query, FastVectorHighlighter does not work well.
*
* Note about highlighters:
* If the query is "雨が降る" (phrase query), Solr divides it into 雨 and 降る. が is a stop word here.
* It seems that FastVector highlighter does not produce any snippet when there is a stop word between terms.
* On the other hand, original highlighter produces multiple matches, for example:
* > <em>雨</em>が<em>降っ</em>ています
* Unified highlighter (from Solr 6.4) handles the case as expected:
* > <em>雨が降っ</em>ています。
*/
private boolean shouldUseOriginalHighlighter(String contentID) throws NoOpenCoreException, KeywordSearchModuleException {
final SolrQuery q = new SolrQuery();
q.setQuery("*:*");
q.addFilterQuery(Server.Schema.ID.toString() + ":" + contentID);
q.setFields(Server.Schema.LANGUAGE.toString());

QueryResponse response = solrServer.query(q, METHOD.POST);
SolrDocumentList solrDocuments = response.getResults();

if (!solrDocuments.isEmpty()) {
SolrDocument solrDocument = solrDocuments.get(0);
if (solrDocument != null) {
Object languageField = solrDocument.getFieldValue(Server.Schema.LANGUAGE.toString());
if (languageField != null) {
return languageField.equals("ja");
}
}
}
return false;
}
}
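Distilled, the highlighting path above now makes one extra decision: if the matched document carries language "ja", switch from Solr's FastVectorHighlighter to the original highlighter, whose parameters use the hl.simple.* prefix instead of hl.tag.*. A compressed sketch of just that switch (parameter values mirror the constants used above; the helper class and boolean flag are illustrative):

```java
import org.apache.solr.client.solrj.SolrQuery;

final class HighlighterSelectionSketch {
    static void configureHighlighting(SolrQuery q, boolean documentIsJapanese, String pre, String post) {
        if (documentIsJapanese) {
            // Original highlighter: FVH can return no snippet for Japanese phrase queries
            // when a stop word (e.g. が) sits between the matched terms.
            q.setParam("hl.useFastVectorHighlighter", "off");
            q.setParam("hl.simple.pre", pre);
            q.setParam("hl.simple.post", post);
        } else {
            // FastVectorHighlighter scales better on large chunks.
            q.setParam("hl.useFastVectorHighlighter", "on");
            q.setParam("hl.tag.pre", pre);
            q.setParam("hl.tag.post", post);
            q.setParam("hl.fragListBuilder", "single");
        }
        q.setHighlightFragsize(0); // return the whole highlighted chunk, not fragments
    }
}
```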
@@ -39,7 +39,7 @@ class IndexFinder {
private static final String KWS_DATA_FOLDER_NAME = "data";
private static final String INDEX_FOLDER_NAME = "index";
private static final String CURRENT_SOLR_VERSION = "4";
private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.1";
private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.2";

static String getCurrentSolrVersion() {
return CURRENT_SOLR_VERSION;
@@ -20,8 +20,10 @@ package org.sleuthkit.autopsy.keywordsearch;

import java.io.BufferedReader;
import java.io.Reader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.client.solrj.SolrServerException;
@@ -59,6 +61,8 @@ class Ingester {
private final Server solrServer = KeywordSearch.getServer();
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
private static Ingester instance;
private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
= new LanguageSpecificContentIndexingHelper();

private Ingester() {
}
@@ -93,7 +97,7 @@
* file, but the Solr server is probably fine.
*/
void indexMetaDataOnly(AbstractFile file) throws IngesterException {
indexChunk("", file.getName().toLowerCase(), getContentFields(file));
indexChunk("", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
}

/**
@@ -107,7 +111,7 @@
* artifact, but the Solr server is probably fine.
*/
void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
indexChunk("", sourceName, getContentFields(artifact));
indexChunk("", sourceName, new HashMap<>(getContentFields(artifact)));
}

/**
@@ -143,21 +147,30 @@
< T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
int numChunks = 0; //unknown until chunking is done

Map<String, String> fields = getContentFields(source);
Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
//Get a reader for the content of the given source
try (BufferedReader reader = new BufferedReader(sourceReader)) {
Chunker chunker = new Chunker(reader);
for (Chunk chunk : chunker) {
while (chunker.hasNext()) {
if (context != null && context.fileIngestIsCancelled()) {
logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
return false;
}

Chunk chunk = chunker.next();
Map<String, Object> fields = new HashMap<>(contentFields);
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
fields.put(Server.Schema.ID.toString(), chunkId);
fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
Optional<Language> language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk);
language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
try {
//add the chunk text to Solr index
indexChunk(chunk.toString(), sourceName, fields);
// add mini chunk when there's a language specific field
if (chunker.hasNext() && language.isPresent()) {
languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
}
numChunks++;
} catch (Ingester.IngesterException ingEx) {
logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
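Two changes drive the rewritten loop above: the base field map becomes an unmodifiable snapshot that is copied into a fresh HashMap for every chunk, so per-chunk values such as the language fields cannot leak into the next chunk, and the for-each is replaced with an explicit hasNext()/next() pair so the code can still ask whether more chunks follow before deciding to index a mini chunk. A stripped-down sketch of that shape, with a plain Iterator standing in for Autopsy's Chunker (names and field keys are illustrative):

```java
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

final class ChunkLoopSketch {
    static int indexAll(Iterator<String> chunker, Map<String, String> baseFields) {
        // Immutable snapshot: per-chunk writes can never bleed into later chunks.
        final Map<String, String> contentFields = Collections.unmodifiableMap(baseFields);
        int numChunks = 0;
        while (chunker.hasNext()) {
            String chunk = chunker.next();
            Map<String, Object> fields = new HashMap<>(contentFields); // fresh copy per chunk
            fields.put("chunk_size", Integer.toString(chunk.length()));
            // Unlike a for-each loop, hasNext() is still usable mid-iteration,
            // which is what lets the real code index a trailing mini chunk
            // only when another chunk follows.
            if (chunker.hasNext()) {
                fields.put("has_following_chunk", "true"); // illustrative marker only
            }
            numChunks++;
        }
        return numChunks;
    }
}
```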
@@ -171,12 +184,13 @@
return false;
}
} catch (Exception ex) {
logger.log(Level.WARNING, "Unexpected error while indexing content from " + sourceID + ": " + sourceName, ex);//NON-NLS
logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false;
} finally {
if (context != null && context.fileIngestIsCancelled()) {
return false;
} else {
Map<String, Object> fields = new HashMap<>(contentFields);
//after all chunks, index just the meta data, including the numChunks, of the parent file
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
//reset id field to base document id
@@ -202,7 +216,7 @@
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
private void indexChunk(String chunk, String sourceName, Map<String, Object> fields) throws IngesterException {
if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
//JMTODO: actually if the we couldn't get the image id it is set to -1,
// but does this really mean we don't want to index it?
@@ -134,6 +134,7 @@ class LuceneQuery implements KeywordSearchQuery {
String cursorMark = CursorMarkParams.CURSOR_MARK_START;
boolean allResultsProcessed = false;
List<KeywordHit> matches = new ArrayList<>();
LanguageSpecificContentQueryHelper.QueryResults languageSpecificQueryResults = new LanguageSpecificContentQueryHelper.QueryResults();
while (!allResultsProcessed) {
solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
QueryResponse response = solrServer.query(solrQuery, SolrRequest.METHOD.POST);
@@ -141,7 +142,18 @@ class LuceneQuery implements KeywordSearchQuery {
// objectId_chunk -> "text" -> List of previews
Map<String, Map<String, List<String>>> highlightResponse = response.getHighlighting();

if (2.2 <= indexSchemaVersion) {
languageSpecificQueryResults.highlighting.putAll(response.getHighlighting());
}

for (SolrDocument resultDoc : resultList) {
if (2.2 <= indexSchemaVersion) {
Object language = resultDoc.getFieldValue(Server.Schema.LANGUAGE.toString());
if (language != null) {
LanguageSpecificContentQueryHelper.updateQueryResults(languageSpecificQueryResults, resultDoc);
}
}

try {
/*
* for each result doc, check that the first occurence of
@@ -153,6 +165,11 @@ class LuceneQuery implements KeywordSearchQuery {
final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
final Collection<Object> content = resultDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());

// if the document has language, it should be hit in language specific content fields. So skip here.
if (resultDoc.containsKey(Server.Schema.LANGUAGE.toString())) {
continue;
}

if (indexSchemaVersion < 2.0) {
//old schema versions don't support chunk_size or the content_str fields, so just accept hits
matches.add(createKeywordtHit(highlightResponse, docId));
@@ -179,9 +196,16 @@ class LuceneQuery implements KeywordSearchQuery {
cursorMark = nextCursorMark;
}

List<KeywordHit> mergedMatches;
if (2.2 <= indexSchemaVersion) {
mergedMatches = LanguageSpecificContentQueryHelper.mergeKeywordHits(matches, originalKeyword, languageSpecificQueryResults);
} else {
mergedMatches = matches;
}

QueryResults results = new QueryResults(this);
//in case of single term literal query there is only 1 term
results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), matches);
results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), mergedMatches);

return results;
}
@@ -262,19 +286,25 @@ class LuceneQuery implements KeywordSearchQuery {
*
* @return
*/
private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
private SolrQuery createAndConfigureSolrQuery(boolean snippets) throws NoOpenCoreException, KeywordSearchModuleException {
double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());

SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
// Wrap the query string in quotes if this is a literal search term.
String queryStr = originalKeyword.searchTermIsLiteral()
? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped;
? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped;

// Run the query against an optional alternative field.
if (field != null) {
//use the optional field
queryStr = field + ":" + queryStr;
q.setQuery(queryStr);
} else if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
q.setQuery(LanguageSpecificContentQueryHelper.expandQueryString(queryStr));
} else {
q.setQuery(queryStr);
}
q.setQuery(queryStr);
q.setRows(MAX_RESULTS_PER_CURSOR_MARK);
// Setting the sort order is necessary for cursor based paging to work.
q.setSort(SolrQuery.SortClause.asc(Server.Schema.ID.toString()));
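LanguageSpecificContentQueryHelper.expandQueryString() itself is not part of this commit view. Judging only from the call sites above, it presumably rewrites a literal query so that it also hits the language-specific content field; a purely hypothetical sketch of that kind of expansion (field names assumed, not taken from the actual helper):

```java
final class QueryExpansionSketch {
    /**
     * Hypothetical expansion: turn a quoted, escaped term such as "\"rainy day\""
     * into a query that matches both the default text field and content_ja.
     */
    static String expandQueryString(String quotedEscapedTerm) {
        return "text:" + quotedEscapedTerm + " OR content_ja:" + quotedEscapedTerm;
    }
}
```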
@@ -283,6 +313,11 @@ class LuceneQuery implements KeywordSearchQuery {
Server.Schema.CHUNK_SIZE.toString(),
Server.Schema.CONTENT_STR.toString());

if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
q.addField(Server.Schema.LANGUAGE.toString());
LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywordStringEscaped);
}

for (KeywordQueryFilter filter : filters) {
q.addFilterQuery(filter.toString());
}
@@ -300,8 +335,16 @@ class LuceneQuery implements KeywordSearchQuery {
*
* @param q The SolrQuery to configure.
*/
private static void configurwQueryForHighlighting(SolrQuery q) {
q.addHighlightField(HIGHLIGHT_FIELD);
private static void configurwQueryForHighlighting(SolrQuery q) throws NoOpenCoreException {
double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
if (2.2 <= indexSchemaVersion) {
for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
q.addHighlightField(field.toString());
}
} else {
q.addHighlightField(HIGHLIGHT_FIELD);
}

q.setHighlightSnippets(1);
q.setHighlightFragsize(SNIPPET_LENGTH);
@@ -404,7 +447,13 @@ class LuceneQuery implements KeywordSearchQuery {
if (responseHighlightID == null) {
return "";
}
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
List<String> contentHighlights;
if (2.2 <= indexSchemaVersion) {
contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
} else {
contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
}
if (contentHighlights == null) {
return "";
} else {
@@ -130,6 +130,18 @@ public class Server {
return "content_ws"; //NON-NLS
}
},
CONTENT_JA {
@Override
public String toString() {
return "content_ja"; //NON-NLS
}
},
LANGUAGE {
@Override
public String toString() {
return "language"; //NON-NLS
}
},
FILE_NAME {
@Override
public String toString() {
@@ -175,7 +187,18 @@ public class Server {
public String toString() {
return "chunk_size"; //NON-NLS
}
}
},
/**
* termfreq is a function which returns the number of times the term appears.
* This is not an actual field defined in schema.xml, but can be gotten from returned documents
* in the same way as fields.
*/
TERMFREQ {
@Override
public String toString() {
return "termfreq"; //NON-NLS
}
}
};

public static final String HL_ANALYZE_CHARS_UNLIMITED = "500000"; //max 1MB in a chunk. use -1 for unlimited, but -1 option may not be supported (not documented)
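The TERMFREQ constant documented above leans on Solr's termfreq() function query: requested through the field list, it comes back on each result document just like a stored field, which is why the HighlightedText change earlier in this commit can read it with getFieldValue("termfreq") and cast it to a Float. A small SolrJ sketch of such a request; the field, term, and aliasing here are assumptions about what configureTermfreqQuery() likely builds, not code from this commit:

```java
import org.apache.solr.client.solrj.SolrQuery;

final class TermfreqQuerySketch {
    static SolrQuery build(String field, String escapedTerm) {
        SolrQuery q = new SolrQuery();
        q.setQuery(field + ":\"" + escapedTerm + "\"");
        // Alias the function result so it can be read back as getFieldValue("termfreq").
        q.addField("termfreq:termfreq(" + field + ",'" + escapedTerm + "')");
        return q;
    }
}
```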