Japanese search feature

Akira Ueda 2019-08-26 18:57:01 +09:00
parent 478f735a4b
commit 6491c0fe52
19 changed files with 1258 additions and 28 deletions

View File

@ -21,6 +21,7 @@
<dependency conf="autopsy->*" org="org.apache.solr" name="solr-solrj" rev="4.9.1"/>
<dependency conf="autopsy->*" org="commons-lang" name="commons-lang" rev="2.4"/>
<dependency conf="autopsy->*" org="commons-validator" name="commons-validator" rev="1.5.1"/>
<dependency conf="autopsy->*" org="com.optimaize.languagedetector" name="language-detector" rev="0.6"/>
<!-- Exclude the version of cxf-rt-rs-client from Tika 1.20, one of its dependencies breaks Ivy -->
<dependency conf="autopsy->*" org="org.apache.tika" name="tika-parsers" rev="1.20">
<exclude module="cxf-rt-rs-client"/>

View File

@ -29,6 +29,7 @@ file.reference.jericho-html-3.3.jar=release/modules/ext/jericho-html-3.3.jar
file.reference.joda-time-2.2.jar=release/modules/ext/joda-time-2.2.jar
file.reference.json-simple-1.1.1.jar=release/modules/ext/json-simple-1.1.1.jar
file.reference.juniversalchardet-1.0.3.jar=release/modules/ext/juniversalchardet-1.0.3.jar
file.reference.language-detector-0.6.jar=release/modules/ext/language-detector-0.6.jar
file.reference.libsvm-3.1.jar=release/modules/ext/libsvm-3.1.jar
file.reference.log4j-1.2.17.jar=release/modules/ext/log4j-1.2.17.jar
file.reference.lucene-core-4.0.0.jar=release/modules/ext/lucene-core-4.0.0.jar

View File

@ -467,6 +467,10 @@
<runtime-relative-path>ext/vorbis-java-tika-0.8.jar</runtime-relative-path>
<binary-origin>release/modules/ext/vorbis-java-tika-0.8.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/language-detector-0.6.jar</runtime-relative-path>
<binary-origin>release/modules/ext/language-detector-0.6.jar</binary-origin>
</class-path-extension>
</data>
</configuration>
</project>

View File

@ -0,0 +1,420 @@
#
# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
#
# Any token with a part-of-speech tag that exactly matches those defined in this
# file is removed from the token stream.
#
# Set your own stoptags by uncommenting the lines below. Note that comments are
# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists,
# etc. that can be useful for building your own stoptag set.
#
# The entire possible tagset is provided below for convenience.
#
#####
# noun: unclassified nouns
#名詞
#
# noun-common: Common nouns or nouns where the sub-classification is undefined
#名詞-一般
#
# noun-proper: Proper nouns where the sub-classification is undefined
#名詞-固有名詞
#
# noun-proper-misc: miscellaneous proper nouns
#名詞-固有名詞-一般
#
# noun-proper-person: Personal names where the sub-classification is undefined
#名詞-固有名詞-人名
#
# noun-proper-person-misc: names that cannot be divided into surname and
# given name; foreign names; names where the surname or given name is unknown.
# e.g. お市の方
#名詞-固有名詞-人名-一般
#
# noun-proper-person-surname: Mainly Japanese surnames.
# e.g. 山田
#名詞-固有名詞-人名-姓
#
# noun-proper-person-given_name: Mainly Japanese given names.
# e.g. 太郎
#名詞-固有名詞-人名-名
#
# noun-proper-organization: Names representing organizations.
# e.g. 通産省, NHK
#名詞-固有名詞-組織
#
# noun-proper-place: Place names where the sub-classification is undefined
#名詞-固有名詞-地域
#
# noun-proper-place-misc: Place names excluding countries.
# e.g. アジア, バルセロナ, 京都
#名詞-固有名詞-地域-一般
#
# noun-proper-place-country: Country names.
# e.g. 日本, オーストラリア
#名詞-固有名詞-地域-国
#
# noun-pronoun: Pronouns where the sub-classification is undefined
#名詞-代名詞
#
# noun-pronoun-misc: miscellaneous pronouns:
# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ
#名詞-代名詞-一般
#
# noun-pronoun-contraction: Spoken language contraction made by combining a
# pronoun and the particle 'wa'.
# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ
#名詞-代名詞-縮約
#
# noun-adverbial: Temporal nouns such as names of days or months that behave
# like adverbs. Nouns that represent amount or ratios and can be used adverbially,
# e.g. 金曜, 一月, 午後, 少量
#名詞-副詞可能
#
# noun-verbal: Nouns that take arguments with case and can appear followed by
# 'suru' and related verbs (する, できる, なさる, くださる)
# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り
#名詞-サ変接続
#
# noun-adjective-base: The base form of adjectives, words that appear before な ("na")
# e.g. 健康, 安易, 駄目, だめ
#名詞-形容動詞語幹
#
# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数.
# e.g. 0, 1, 2, 何, 数, 幾
#名詞-数
#
# noun-affix: noun affixes where the sub-classification is undefined
#名詞-非自立
#
# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that
# attach to the base form of inflectional words, words that cannot be classified
# into any of the other categories below. This category includes indefinite nouns.
# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第,
# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み,
# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳,
# わり, 割り, 割, ん-口語/, もん-口語/
#名詞-非自立-一般
#
# noun-affix-adverbial: noun affixes that can behave as adverbs.
# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ,
# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか,
# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所,
# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま,
# 儘, 侭, みぎり, 矢先
#名詞-非自立-副詞可能
#
# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars
# with the stem よう(だ) ("you(da)").
# e.g. よう, やう, 様 (よう)
#名詞-非自立-助動詞語幹
#
# noun-affix-adjective-base: noun affixes that can connect to the indeclinable
# connection form な (aux "da").
# e.g. みたい, ふう
#名詞-非自立-形容動詞語幹
#
# noun-special: special nouns where the sub-classification is undefined.
#名詞-特殊
#
# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is
# treated as 助動詞 ("auxiliary verb") in school grammars, and attaches to the base
# form of inflectional words.
# e.g. そう
#名詞-特殊-助動詞語幹
#
# noun-suffix: noun suffixes where the sub-classification is undefined.
#名詞-接尾
#
# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect
# to ガル or タイ and can combine into compound nouns, words that cannot be classified into
# any of the other categories below. In general, this category is more inclusive than
# 接尾語 ("suffix") and is usually the last element in a compound noun.
# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み,
# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用
#名詞-接尾-一般
#
# noun-suffix-person: Suffixes that form nouns and attach to person names more often
# than other nouns.
# e.g. 君, 様, 著
#名詞-接尾-人名
#
# noun-suffix-place: Suffixes that form nouns and attach to place names more often
# than other nouns.
# e.g. 町, 市, 県
#名詞-接尾-地域
#
# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that
# can appear before スル ("suru").
# e.g. 化, 視, 分け, 入り, 落ち, 買い
#名詞-接尾-サ変接続
#
# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions,
# is treated as 助動詞 ("auxiliary verb") in school grammars, and attaches to the
# conjunctive form of inflectional words.
# e.g. そう
#名詞-接尾-助動詞語幹
#
# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive
# form of inflectional words and appear before the copula だ ("da").
# e.g. 的, げ, がち
#名詞-接尾-形容動詞語幹
#
# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs.
# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ)
#名詞-接尾-副詞可能
#
# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category
# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach
# to numbers.
# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半
#名詞-接尾-助数詞
#
# noun-suffix-special: Special suffixes that mainly attach to inflecting words.
# e.g. (楽し) さ, (考え) 方
#名詞-接尾-特殊
#
# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words
# together.
# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦)
#名詞-接続詞的
#
# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are
# semantically verb-like.
# e.g. ごらん, ご覧, 御覧, 頂戴
#名詞-動詞非自立的
#
# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry,
# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation")
# is いわく ("iwaku").
#名詞-引用文字列
#
# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and
# behave like an adjective.
# e.g. 申し訳, 仕方, とんでも, 違い
#名詞-ナイ形容詞語幹
#
#####
# prefix: unclassified prefixes
#接頭詞
#
# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms)
# excluding numerical expressions.
# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派)
#接頭詞-名詞接続
#
# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
# in conjunctive form followed by なる/なさる/くださる.
# e.g. お (読みなさい), お (座り)
#接頭詞-動詞接続
#
# prefix-adjectival: Prefixes that attach to adjectives.
# e.g. お (寒いですねえ), バカ (でかい)
#接頭詞-形容詞接続
#
# prefix-numerical: Prefixes that attach to numerical expressions.
# e.g. 約, およそ, 毎時
#接頭詞-数接続
#
#####
# verb: unclassified verbs
#動詞
#
# verb-main:
#動詞-自立
#
# verb-auxiliary:
#動詞-非自立
#
# verb-suffix:
#動詞-接尾
#
#####
# adjective: unclassified adjectives
#形容詞
#
# adjective-main:
#形容詞-自立
#
# adjective-auxiliary:
#形容詞-非自立
#
# adjective-suffix:
#形容詞-接尾
#
#####
# adverb: unclassified adverbs
#副詞
#
# adverb-misc: Words that can be segmented into one unit and where adnominal
# modification is not possible.
# e.g. あいかわらず, 多分
#副詞-一般
#
# adverb-particle_conjunction: Adverbs that can be followed by の, は, に,
# な, する, だ, etc.
# e.g. こんなに, そんなに, あんなに, なにか, なんでも
#副詞-助詞類接続
#
#####
# adnominal: Words that only have noun-modifying forms.
# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう,
# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした,
# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き
#連体詞
#
#####
# conjunction: Conjunctions that can occur independently.
# e.g. が, けれども, そして, じゃあ, それどころか
接続詞
#
#####
# particle: unclassified particles.
助詞
#
# particle-case: case particles where the subclassification is undefined.
助詞-格助詞
#
# particle-case-misc: Case particles.
# e.g. から, が, で, と, に, へ, より, を, の, にて
助詞-格助詞-一般
#
# particle-case-quote: the "to" that appears after nouns, a person's speech,
# quotation marks, expressions of decisions from a meeting, reasons, judgements,
# conjectures, etc.
# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...)
助詞-格助詞-引用
#
# particle-case-compound: Compounds of particles and verbs that mainly behave
# like case particles.
# e.g. という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って,
# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける,
# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し,
# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして,
# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって,
# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る,
# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる,
# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ
助詞-格助詞-連語
#
# particle-conjunctive:
# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども,
# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/,
# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/
助詞-接続助詞
#
# particle-dependency:
# e.g. こそ, さえ, しか, すら, は, も, ぞ
助詞-係助詞
#
# particle-adverbial:
# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/,
# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/,
# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに,
# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/,
# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」)
助詞-副助詞
#
# particle-interjective: particles with interjective grammatical roles.
# e.g. (松島) や
助詞-間投助詞
#
# particle-coordinate:
# e.g. と, たり, だの, だり, とか, なり, や, やら
助詞-並立助詞
#
# particle-final:
# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ,
# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/
助詞-終助詞
#
# particle-adverbial/conjunctive/final: The particle "ka" when it is unknown whether it is
# adverbial, conjunctive, or sentence final. For example:
# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」
# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」
# 「(祈りが届いたせい) か (, 試験に合格した.)」
# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」
# e.g. か
助詞-副助詞/並立助詞/終助詞
#
# particle-adnominalizer: The "no" that attaches to nouns and modifies
# non-inflectional words.
助詞-連体化
#
# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs
# that are giongo, giseigo, or gitaigo.
# e.g. に, と
助詞-副詞化
#
# particle-special: A particle that does not fit into one of the above classifications.
# This includes particles that are used in Tanka, Haiku, and other poetry.
# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家)
助詞-特殊
#
#####
# auxiliary-verb:
助動詞
#
#####
# interjection: Greetings and other exclamations.
# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます,
# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい
#感動詞
#
#####
# symbol: unclassified Symbols.
記号
#
# symbol-misc: A general symbol not in one of the categories below.
# e.g. [○◎@$〒→+]
記号-一般
#
# symbol-comma: Commas
# e.g. [,、]
記号-読点
#
# symbol-period: Periods and full stops.
# e.g. [．.。]
記号-句点
#
# symbol-space: Full-width whitespace.
記号-空白
#
# symbol-open_bracket:
# e.g. [({‘“『【]
記号-括弧開
#
# symbol-close_bracket:
# e.g. [)}’”』」】]
記号-括弧閉
#
# symbol-alphabetic:
#記号-アルファベット
#
#####
# other: unclassified other
#その他
#
# other-interjection: Words that are hard to classify as noun-suffixes or
# sentence-final particles.
# e.g. (だ)ァ
その他-間投
#
#####
# filler: Aizuchi that occurs during a conversation or sounds inserted as filler.
# e.g. あの, うんと, えと
フィラー
#
#####
# non-verbal: non-verbal sound.
非言語音
#
#####
# fragment:
#語断片
#
#####
# unknown: unknown part of speech.
#未知語
#
##### End of file
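
Of the tags above, only the uncommented entries (接続詞, the 助詞 particle sub-tags, 助動詞, most 記号 symbol tags, その他-間投, フィラー, and 非言語音) are actually removed by JapanesePartOfSpeechStopFilter; the commented lines document the rest of the tagset. For illustration only (not part of the commit), a minimal stand-alone sketch using Lucene's JapaneseAnalyzer, which chains the Kuromoji tokenizer with a part-of-speech stop filter like this one; the exact tokens depend on the dictionary and Lucene version:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StopTagDemo {
    public static void main(String[] args) throws IOException {
        // JapaneseAnalyzer bundles the Kuromoji tokenizer, a part-of-speech stop filter,
        // and a Japanese stopword set similar to the files added in this commit.
        Analyzer analyzer = new JapaneseAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("content_ja", "雨が降る")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // the particle が is dropped by the stop-tag filter; 雨 and 降る remain
                System.out.println(term.toString());
            }
            ts.end();
        }
    }
}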

View File

@ -0,0 +1,127 @@
#
# This file defines a stopword set for Japanese.
#
# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia.
# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745
# for frequency lists, etc. that can be useful for making your own set (if desired)
#
# Note that there is an overlap between these stopwords and the terms stopped when used
# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
# that comments are not allowed on the same line as stopwords.
#
# Also note that stopping is done in a case-insensitive manner. Change your StopFilter
# configuration if you need case-sensitive stopping. Lastly, note that stopping is done
# using the same character width as the entries in this file. Since this StopFilter is
# normally done after a CJKWidthFilter in your chain, you would usually want your romaji
# entries to be in half-width and your kana entries to be in full-width.
#
ある
いる
する
から
こと
として
れる
など
なっ
ない
この
ため
その
あっ
よう
また
もの
という
あり
まで
られ
なる
これ
によって
により
おり
より
による
なり
られる
において
なかっ
なく
しかし
について
だっ
その後
できる
それ
ので
なお
のみ
でき
における
および
いう
さらに
でも
たり
その他
に関する
たち
ます
なら
に対して
特に
せる
及び
これら
とき
では
にて
ほか
ながら
うち
そして
とともに
ただし
かつて
それぞれ
または
ほど
ものの
に対する
ほとんど
と共に
といった
です
とも
ところ
ここ
##### End of file

View File

@ -243,6 +243,18 @@
</analyzer>
</fieldType>
<fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
<analyzer>
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
<filter class="solr.JapaneseBaseFormFilterFactory"/>
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" />
<filter class="solr.CJKWidthFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" />
<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<!-- A text field with defaults appropriate for English: it
tokenizes with StandardTokenizer, removes English stop words
(stopwords_en.txt), down cases, protects words from protwords.txt, and
@ -557,6 +569,11 @@
via copyField further on in this schema -->
<field name="text" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>
<!-- Store language detection result. Only parents of text documents have this -->
<field name="language" type="string" indexed="false" stored="true" required="false"/>
<field name="content_ja" type="text_ja" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>
<!-- catchall text field that indexes tokens both normally and in reverse for efficient
leading wildcard queries. -->
<!--<field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/>-->
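
With these schema additions, each chunk whose language is detected as Japanese is stored with a language field and is additionally indexed into content_ja through the text_ja analyzer chain above. A small SolrJ sketch, for illustration only (the field names come from this schema; the query values are hypothetical), of querying those fields directly:

import org.apache.solr.client.solrj.SolrQuery;

class JapaneseFieldQueryExample {
    static SolrQuery build() {
        SolrQuery q = new SolrQuery();
        q.setQuery("content_ja:雨");        // analyzed by the text_ja (Kuromoji) chain at query time
        q.addFilterQuery("language:ja");    // only chunks the language detector tagged as Japanese
        q.setFields("id", "language");
        q.setHighlight(true);
        q.addHighlightField("content_ja");  // highlight snippets come back under the content_ja key
        return q;
    }
}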

View File

@ -38,6 +38,7 @@ import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest.METHOD;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
@ -346,6 +347,8 @@ class HighlightedText implements IndexedText {
String chunkID = "";
String highlightField = "";
try {
double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
loadPageInfo(); //inits once
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
@ -359,16 +362,33 @@ class HighlightedText implements IndexedText {
highlightField = LuceneQuery.HIGHLIGHT_FIELD;
if (isLiteral) {
//if the query is literal try to get solr to do the highlighting
final String highlightQuery = keywords.stream()
.map(HighlightedText::constructEscapedSolrQuery)
.collect(Collectors.joining(" "));
if (2.2 <= indexSchemaVersion) {
//if the query is literal try to get solr to do the highlighting
final String highlightQuery = keywords.stream().map(s ->
LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.escapeLuceneQuery(s)))
.collect(Collectors.joining(" OR "));
q.setQuery(highlightQuery);
for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
q.addField(field.toString());
q.addHighlightField(field.toString());
}
q.addField(Server.Schema.LANGUAGE.toString());
// in case of single term literal query there is only 1 term
LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
q.addFilterQuery(filterQuery);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
} else {
//if the query is literal try to get solr to do the highlighting
final String highlightQuery = keywords.stream()
.map(HighlightedText::constructEscapedSolrQuery)
.collect(Collectors.joining(" "));
q.setQuery(highlightQuery);
q.addField(highlightField);
q.addFilterQuery(filterQuery);
q.addHighlightField(highlightField);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
q.setQuery(highlightQuery);
q.addField(highlightField);
q.addFilterQuery(filterQuery);
q.addHighlightField(highlightField);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
}
//tune the highlighter
q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
@ -406,12 +426,40 @@ class HighlightedText implements IndexedText {
if (responseHighlightID == null) {
highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
} else {
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
if (contentHighlights == null) {
highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
SolrDocument document = response.getResults().get(0);
Object language = document.getFieldValue(Server.Schema.LANGUAGE.toString());
if (2.2 <= indexSchemaVersion && language != null) {
List<String> contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
if (contentHighlights == null) {
highlightedContent = "";
} else {
int hitCountInMiniChunk = LanguageSpecificContentQueryHelper.queryChunkTermfreq(keywords, MiniChunks.getChunkIdString(contentIdStr));
String s = contentHighlights.get(0).trim();
// If there is a mini-chunk, trim the content not to show highlighted text in it.
if (0 < hitCountInMiniChunk) {
int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
s,
HIGHLIGHT_PRE,
// trim after the last hit in chunk
hitCountInChunk - hitCountInMiniChunk);
if (idx != -1) {
highlightedContent = s.substring(0, idx);
} else {
highlightedContent = s;
}
} else {
highlightedContent = s;
}
}
} else {
// extracted content (minus highlight tags) is HTML-escaped
highlightedContent = contentHighlights.get(0).trim();
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
if (contentHighlights == null) {
highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
} else {
// extracted content (minus highlight tags) is HTML-escaped
highlightedContent = contentHighlights.get(0).trim();
}
}
}
}
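
The trimming above relies on per-document term frequencies: the chunk's hit count minus the mini chunk's hit count is the number of hits that belong to the base part of the chunk. A worked example with hypothetical numbers (highlightedHtml stands in for the snippet string s in the code above; HIGHLIGHT_PRE is the existing highlight-tag constant):

// Hypothetical counts for one chunk and its "_mini" overlap document:
int hitCountInChunk = 3;      // termfreq over the whole chunk (base text plus overlap window)
int hitCountInMiniChunk = 1;  // termfreq over the overlap window only
// 3 - 1 = 2 hits belong to the base part, so findNthIndexOf locates the 3rd highlight tag
// (occurrence index 2) and the highlighted HTML is cut just before it:
int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
        highlightedHtml, HIGHLIGHT_PRE, hitCountInChunk - hitCountInMiniChunk);
String shown = (idx != -1) ? highlightedHtml.substring(0, idx) : highlightedHtml;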

View File

@ -39,7 +39,7 @@ class IndexFinder {
private static final String KWS_DATA_FOLDER_NAME = "data";
private static final String INDEX_FOLDER_NAME = "index";
private static final String CURRENT_SOLR_VERSION = "4";
private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.1";
private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.2";
static String getCurrentSolrVersion() {
return CURRENT_SOLR_VERSION;
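
Bumping CURRENT_SOLR_SCHEMA_VERSION to 2.2 is the switch for all of the new behavior: throughout the commit, the language-specific code paths are guarded by the same check, for example:

double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
if (2.2 <= indexSchemaVersion) {
    // language detection, content_ja indexing, and per-field highlighting are active
}

Cases opened with an older schema keep the pre-2.2 behavior.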

View File

@ -20,8 +20,10 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.io.BufferedReader;
import java.io.Reader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.client.solrj.SolrServerException;
@ -59,6 +61,8 @@ class Ingester {
private final Server solrServer = KeywordSearch.getServer();
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
private static Ingester instance;
private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
= new LanguageSpecificContentIndexingHelper();
private Ingester() {
}
@ -93,7 +97,7 @@ class Ingester {
* file, but the Solr server is probably fine.
*/
void indexMetaDataOnly(AbstractFile file) throws IngesterException {
indexChunk("", file.getName().toLowerCase(), getContentFields(file));
indexChunk("", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
}
/**
@ -107,7 +111,7 @@ class Ingester {
* artifact, but the Solr server is probably fine.
*/
void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
indexChunk("", sourceName, getContentFields(artifact));
indexChunk("", sourceName, new HashMap<>(getContentFields(artifact)));
}
/**
@ -143,21 +147,30 @@ class Ingester {
<T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
int numChunks = 0; //unknown until chunking is done
Map<String, String> fields = getContentFields(source);
Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
//Get a reader for the content of the given source
try (BufferedReader reader = new BufferedReader(sourceReader)) {
Chunker chunker = new Chunker(reader);
for (Chunk chunk : chunker) {
while (chunker.hasNext()) {
Chunk chunk = chunker.next();
if (context != null && context.fileIngestIsCancelled()) {
logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
return false;
}
Map<String, Object> fields = new HashMap<>(contentFields);
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
fields.put(Server.Schema.ID.toString(), chunkId);
fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
Optional<LanguageSpecificContentIndexingHelper.Language> language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk);
language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
try {
//add the chunk text to Solr index
indexChunk(chunk.toString(), sourceName, fields);
// add mini chunk when there's a language specific field
if (chunker.hasNext() && language.isPresent()) {
languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
}
numChunks++;
} catch (Ingester.IngesterException ingEx) {
logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
@ -177,6 +190,7 @@ class Ingester {
if (context != null && context.fileIngestIsCancelled()) {
return false;
} else {
Map<String, Object> fields = new HashMap<>(contentFields);
//after all chunks, index just the meta data, including the numChunks, of the parent file
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
//reset id field to base document id
@ -202,7 +216,7 @@ class Ingester {
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
private void indexChunk(String chunk, String sourceName, Map<String, Object> fields) throws IngesterException {
if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
//JMTODO: actually, if we couldn't get the image id it is set to -1,
// but does this really mean we don't want to index it?
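
Each chunk produced by the Chunker ends with a window that overlaps the start of the next chunk. When a language was detected and another chunk follows, that overlap is indexed again as a separate "mini chunk" document, so the query side can later tell hits in the overlap apart from hits in the base text (see LanguageSpecificContentQueryHelper below). A sketch with hypothetical values of what goes into the mini chunk:

// Hypothetical values; at indexing time these come from Chunker.Chunk and Server.
String baseChunkId = "12345_1";                                   // <objectId>_<chunk number>
String chunkText   = "...base text...overlap window text";
int baseLength     = 15;                                          // chunk.getBaseChunkLength()

String miniChunkId   = MiniChunks.getChunkIdString(baseChunkId);  // "12345_1_mini"
String miniChunkText = chunkText.substring(baseLength);           // only the overlap window
// The mini chunk document carries the same content_ja and language fields as its base chunk.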

View File

@ -0,0 +1,55 @@
package org.sleuthkit.autopsy.keywordsearch;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
class LanguageDetector {
public enum Language {
JAPANESE,
ENGLISH,
}
private List<LanguageProfile> languageProfiles;
LanguageDetector() {
try {
languageProfiles = Arrays.asList(
new LanguageProfileReader().readBuiltIn(LdLocale.fromString("en")),
new LanguageProfileReader().readBuiltIn(LdLocale.fromString("ja"))
);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
Optional<Language> detect(String text) {
com.optimaize.langdetect.LanguageDetector languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
.withProfiles(languageProfiles)
.build();
TextObjectFactory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
TextObject textObject = textObjectFactory.forText(text);
return languageDetector.detect(textObject).transform(Optional::of).or(Optional.empty()).map(LdLocale::getLanguage).flatMap(LanguageDetector::toLanguage);
}
private static Optional<Language> toLanguage(String s) {
switch (s) {
case "ja": return Optional.of(Language.JAPANESE);
case "en": return Optional.of(Language.ENGLISH);
default: return Optional.empty();
}
}
}
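
A usage sketch for the new class (the input strings are illustrative; very short or mixed-language text may come back empty because the n-gram detector has nothing reliable to work with):

LanguageDetector detector = new LanguageDetector();

Optional<LanguageDetector.Language> ja = detector.detect("今日は東京で雨が降っています。");
// typically Optional.of(Language.JAPANESE)

Optional<LanguageDetector.Language> en = detector.detect("The quick brown fox jumps over the lazy dog.");
// typically Optional.of(Language.ENGLISH)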

View File

@ -0,0 +1,90 @@
package org.sleuthkit.autopsy.keywordsearch;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.common.SolrInputDocument;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.healthmonitor.HealthMonitor;
import org.sleuthkit.autopsy.healthmonitor.TimingMetric;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
class LanguageSpecificContentIndexingHelper {
enum Language {
JAPANESE
}
private final LanguageDetector languageDetector = new LanguageDetector();
Optional<Language> detectLanguageIfNeeded(Chunker.Chunk chunk) throws NoOpenCoreException {
double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
if (2.2 <= indexSchemaVersion) {
return languageDetector.detect(chunk.toString()).flatMap(lang -> Optional.ofNullable(toLanguage(lang)));
} else {
return Optional.empty();
}
}
void updateLanguageSpecificFields(Map<String, Object> fields, Chunker.Chunk chunk, Language language) {
List<String> values = new ArrayList<>();
values.add(chunk.toString());
if (fields.containsKey(Server.Schema.FILE_NAME.toString())) {
values.add(fields.get(Server.Schema.FILE_NAME.toString()).toString());
}
// index the chunk to a language specific field
fields.put(Server.Schema.CONTENT_JA.toString(), values);
fields.put(Server.Schema.LANGUAGE.toString(), toFieldValue(language));
}
void indexMiniChunk(Chunker.Chunk chunk, String sourceName, Map<String, Object> fields, String baseChunkID, Language language)
throws Ingester.IngesterException {
//Make a SolrInputDocument out of the field map
SolrInputDocument updateDoc = new SolrInputDocument();
for (String key : fields.keySet()) {
updateDoc.addField(key, fields.get(key));
}
try {
updateDoc.setField(Server.Schema.ID.toString(), MiniChunks.getChunkIdString(baseChunkID));
// index the chunk to a language specific field
updateDoc.addField(Server.Schema.CONTENT_JA.toString(), chunk.toString().substring(chunk.getBaseChunkLength()));
updateDoc.addField(Server.Schema.LANGUAGE.toString(), toFieldValue(language));
TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");
KeywordSearch.getServer().addDocument(updateDoc);
HealthMonitor.submitTimingMetric(metric);
} catch (KeywordSearchModuleException | NoOpenCoreException ex) {
throw new Ingester.IngesterException(
NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
}
}
private static String toFieldValue(Language language) {
if (language == null) {
return null;
}
switch (language) {
case JAPANESE: return "ja";
default:
throw new IllegalStateException("Unknown language: " + language);
}
}
private Language toLanguage(LanguageDetector.Language language) {
if (language == null) {
return null;
}
switch (language) {
case JAPANESE: return Language.JAPANESE;
default:
return null;
}
}
}
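
In the Ingester loop shown earlier, this helper is applied to each chunk's field map; a sketch of what it adds when Japanese is detected (helper stands for the languageSpecificContentIndexingHelper field, the other names are as in Ingester, values illustrative):

Map<String, Object> fields = new HashMap<>(contentFields);   // ID, CHUNK_SIZE, FILE_NAME, ...
helper.detectLanguageIfNeeded(chunk)
        .ifPresent(lang -> helper.updateLanguageSpecificFields(fields, chunk, lang));
// For a Japanese chunk, fields now also contains:
//   content_ja -> [chunk text, file name]   (indexed through the text_ja analyzer)
//   language   -> "ja"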

View File

@ -0,0 +1,218 @@
package org.sleuthkit.autopsy.keywordsearch;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
import org.sleuthkit.autopsy.coreutils.Version;
import org.sleuthkit.datamodel.TskException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
class LanguageSpecificContentQueryHelper {
private static final List<Server.Schema> QUERY_FIELDS = new ArrayList<>();
private static final List<Server.Schema> LANGUAGE_SPECIFIC_CONTENT_FIELDS
= Collections.singletonList(Server.Schema.CONTENT_JA);
private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
static {
QUERY_FIELDS.add(Server.Schema.TEXT);
QUERY_FIELDS.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS);
}
static class QueryResults {
List<SolrDocument> chunks = new ArrayList<>();
Map</* ID */ String, SolrDocument> miniChunks = new HashMap<>();
// objectId_chunk -> "text" -> List of previews
Map<String, Map<String, List<String>>> highlighting = new HashMap<>();
}
/**
* Make a query string from the given one by applying it to each of the query fields
* @param queryStr escaped query string
* @return query string
*/
static String expandQueryString(final String queryStr) {
List<String> fieldQueries = new ArrayList<>();
fieldQueries.add(Server.Schema.TEXT.toString() + ":" + KeywordSearchUtil.quoteQuery(queryStr));
fieldQueries.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS.stream().map(field -> field.toString() + ":" + queryStr).collect(Collectors.toList()));
return String.join(" OR ", fieldQueries);
}
static List<Server.Schema> getQueryFields() {
return QUERY_FIELDS;
}
static void updateQueryResults(QueryResults results, SolrDocument document) {
String id = (String) document.getFieldValue(Server.Schema.ID.toString());
if (MiniChunks.isMiniChunkID(id)) {
results.miniChunks.put(MiniChunks.getBaseChunkID(id), document);
} else {
results.chunks.add(document);
}
}
/**
* Get snippets
*
* @param highlight field ID -> snippets
* @return snippets of appropriate fields.
* Note that this method returns {@code Optional.empty} if the result is empty, for convenience when interacting with the existing code.
*/
static Optional<List<String>> getHighlights(Map<String, List<String>> highlight) {
for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
if (highlight.containsKey(field.toString())) {
return Optional.of(highlight.get(field.toString()));
}
}
return Optional.empty();
}
/**
* Merge KeywordHits from TEXT field and a language specific field
*
* Replace KeywordHits in the given {@code matches} if its chunk ID is same.
*/
static List<KeywordHit> mergeKeywordHits(List<KeywordHit> matches, Keyword originalKeyword, QueryResults queryResults) throws KeywordSearchModuleException {
Map<String, KeywordHit> map = findMatches(originalKeyword, queryResults).stream().collect(Collectors.toMap(KeywordHit::getSolrDocumentId, x -> x));
List<KeywordHit> merged = new ArrayList<>();
// first, replace KeywordHit in matches
for (KeywordHit match : matches) {
String key = match.getSolrDocumentId();
if (map.containsKey(key)) {
merged.add(map.get(key));
map.remove(key);
} else {
merged.add(match);
}
}
// second, add rest of KeywordHits from queryResults
merged.addAll(map.values());
return merged;
}
static void configureTermfreqQuery(SolrQuery query, String keyword) throws KeywordSearchModuleException, NoOpenCoreException {
// make a request to Solr to parse query.
QueryParser.Result queryParserResult = QueryParser.parse(keyword, LANGUAGE_SPECIFIC_CONTENT_FIELDS);
query.addField(buildTermfreqQuery(keyword, queryParserResult));
}
static String buildTermfreqQuery(String keyword, QueryParser.Result result) {
List<String> termfreqs = new ArrayList<>();
for (Map.Entry<String, List<String>> e : result.fieldTermsMap.entrySet()) {
String field = e.getKey();
for (String term : e.getValue()) {
termfreqs.add(String.format("termfreq(\"%s\",\"%s\")", field, KeywordSearchUtil.escapeLuceneQuery(term)));
}
}
// sum of all language specific query fields.
// at most one of these fields can be non-zero for a given document.
return String.format("termfreq:sum(%s)", String.join(",", termfreqs));
}
static int queryChunkTermfreq(Set<String> keywords, String contentID) throws KeywordSearchModuleException, NoOpenCoreException {
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG);
final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentID);
final String highlightQuery = keywords.stream()
.map(s -> LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.escapeLuceneQuery(s)))
.collect(Collectors.joining(" "));
q.addFilterQuery(filterQuery);
q.setQuery(highlightQuery);
LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
QueryResponse response = KeywordSearch.getServer().query(q, SolrRequest.METHOD.POST);
SolrDocumentList results = response.getResults();
if (results.isEmpty()) {
return 0;
}
SolrDocument document = results.get(0);
return ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
}
static int findNthIndexOf(String s, String pattern, int n) {
int found = 0;
int idx = -1;
int len = s.length();
while (idx < len && found <= n) {
idx = s.indexOf(pattern, idx + 1);
if (idx == -1) {
break;
}
found++;
}
return idx;
}
private static List<KeywordHit> findMatches(Keyword originalKeyword, QueryResults queryResults) throws KeywordSearchModuleException {
List<KeywordHit> matches = new ArrayList<>();
for (SolrDocument document : queryResults.chunks) {
String docId = (String) document.getFieldValue(Server.Schema.ID.toString());
try {
int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
SolrDocument miniChunk = queryResults.miniChunks.get(docId);
if (miniChunk == null) {
// the last chunk does not have a mini chunk because there is no overlap region with the next one
matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
} else {
int hitCountInMiniChunk = ((Float) miniChunk.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
if (hitCountInMiniChunk < hitCountInChunk) {
// there is at least one hit in the base chunk
matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
}
}
} catch (TskException ex) {
throw new KeywordSearchModuleException(ex);
}
}
return matches;
}
/** copied from LuceneQuery and modified to use getHighlightFieldValue */
private static KeywordHit createKeywordHit(Keyword originalKeyword, Map<String, Map<String, List<String>>> highlightResponse, String docId) throws TskException {
/**
* Get the first snippet from the document if keyword search is
* configured to use snippets.
*/
String snippet = "";
if (KeywordSearchSettings.getShowSnippets()) {
List<String> snippetList = getHighlightFieldValue(highlightResponse.get(docId)).orElse(null);
// list is null if there wasn't a snippet
if (snippetList != null) {
snippet = EscapeUtil.unEscapeHtml(snippetList.get(0)).trim();
}
}
return new KeywordHit(docId, snippet, originalKeyword.getSearchTerm());
}
/**
* @return Optional.empty if empty
*/
private static Optional<List<String>> getHighlightFieldValue(Map<String, List<String>> highlight) {
for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
if (highlight.containsKey(field.toString())) {
return Optional.of(highlight.get(field.toString()));
}
}
return Optional.empty();
}
}
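
Two concrete outputs help tie the pieces together. expandQueryString applies a plain, already escaped term to every query field, and buildTermfreqQuery asks Solr to return a per-document hit count as a pseudo-field:

String expanded = LanguageSpecificContentQueryHelper.expandQueryString("query");
// -> "text:query OR content_ja:query"   (per the unit test further below)

// buildTermfreqQuery with fieldTermsMap = { content_ja: [雨, 降る] } produces:
//   termfreq:sum(termfreq("content_ja","雨"),termfreq("content_ja","降る"))
// Solr evaluates the sum per document; it is read back via Server.Schema.TERMFREQ,
// both for full chunks and for their "_mini" overlap documents.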

View File

@ -134,6 +134,7 @@ class LuceneQuery implements KeywordSearchQuery {
String cursorMark = CursorMarkParams.CURSOR_MARK_START;
boolean allResultsProcessed = false;
List<KeywordHit> matches = new ArrayList<>();
LanguageSpecificContentQueryHelper.QueryResults languageSpecificQueryResults = new LanguageSpecificContentQueryHelper.QueryResults();
while (!allResultsProcessed) {
solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
QueryResponse response = solrServer.query(solrQuery, SolrRequest.METHOD.POST);
@ -141,7 +142,18 @@ class LuceneQuery implements KeywordSearchQuery {
// objectId_chunk -> "text" -> List of previews
Map<String, Map<String, List<String>>> highlightResponse = response.getHighlighting();
if (2.2 <= indexSchemaVersion) {
languageSpecificQueryResults.highlighting.putAll(response.getHighlighting());
}
for (SolrDocument resultDoc : resultList) {
if (2.2 <= indexSchemaVersion) {
Object language = resultDoc.getFieldValue(Server.Schema.LANGUAGE.toString());
if (language != null) {
LanguageSpecificContentQueryHelper.updateQueryResults(languageSpecificQueryResults, resultDoc);
}
}
try {
/*
* for each result doc, check that the first occurrence of
@ -153,6 +165,11 @@ class LuceneQuery implements KeywordSearchQuery {
final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
final Collection<Object> content = resultDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
// if the document has language, it should be hit in language specific content fields. So skip here.
if (resultDoc.containsKey(Server.Schema.LANGUAGE.toString())) {
continue;
}
if (indexSchemaVersion < 2.0) {
//old schema versions don't support chunk_size or the content_str fields, so just accept hits
matches.add(createKeywordtHit(highlightResponse, docId));
@ -179,9 +196,16 @@ class LuceneQuery implements KeywordSearchQuery {
cursorMark = nextCursorMark;
}
List<KeywordHit> mergedMatches;
if (2.2 <= indexSchemaVersion) {
mergedMatches = LanguageSpecificContentQueryHelper.mergeKeywordHits(matches, originalKeyword, languageSpecificQueryResults);
} else {
mergedMatches = matches;
}
QueryResults results = new QueryResults(this);
//in case of single term literal query there is only 1 term
results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), matches);
results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), mergedMatches);
return results;
}
@ -262,19 +286,25 @@ class LuceneQuery implements KeywordSearchQuery {
*
* @return
*/
private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
private SolrQuery createAndConfigureSolrQuery(boolean snippets) throws NoOpenCoreException, KeywordSearchModuleException {
double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
// Wrap the query string in quotes if this is a literal search term.
String queryStr = originalKeyword.searchTermIsLiteral()
? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped;
? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped;
// Run the query against an optional alternative field.
if (field != null) {
//use the optional field
queryStr = field + ":" + queryStr;
q.setQuery(queryStr);
} else if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
q.setQuery(LanguageSpecificContentQueryHelper.expandQueryString(keywordStringEscaped));
} else {
q.setQuery(queryStr);
}
q.setQuery(queryStr);
q.setRows(MAX_RESULTS_PER_CURSOR_MARK);
// Setting the sort order is necessary for cursor based paging to work.
q.setSort(SolrQuery.SortClause.asc(Server.Schema.ID.toString()));
@ -283,6 +313,11 @@ class LuceneQuery implements KeywordSearchQuery {
Server.Schema.CHUNK_SIZE.toString(),
Server.Schema.CONTENT_STR.toString());
if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
q.addField(Server.Schema.LANGUAGE.toString());
LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywordStringEscaped);
}
for (KeywordQueryFilter filter : filters) {
q.addFilterQuery(filter.toString());
}
@ -300,8 +335,16 @@ class LuceneQuery implements KeywordSearchQuery {
*
* @param q The SolrQuery to configure.
*/
private static void configurwQueryForHighlighting(SolrQuery q) {
q.addHighlightField(HIGHLIGHT_FIELD);
private static void configurwQueryForHighlighting(SolrQuery q) throws NoOpenCoreException {
double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
if (2.2 <= indexSchemaVersion) {
for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
q.addHighlightField(field.toString());
}
} else {
q.addHighlightField(HIGHLIGHT_FIELD);
}
q.setHighlightSnippets(1);
q.setHighlightFragsize(SNIPPET_LENGTH);
@ -404,7 +447,13 @@ class LuceneQuery implements KeywordSearchQuery {
if (responseHighlightID == null) {
return "";
}
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
List<String> contentHighlights;
if (2.2 <= indexSchemaVersion) {
contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
} else {
contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
}
if (contentHighlights == null) {
return "";
} else {

View File

@ -0,0 +1,18 @@
package org.sleuthkit.autopsy.keywordsearch;
class MiniChunks {
static String SUFFIX = "_mini";
static String getChunkIdString(String baseChunkID) {
return baseChunkID + SUFFIX;
}
static boolean isMiniChunkID(String chunkID) {
return chunkID.endsWith(SUFFIX);
}
static String getBaseChunkID(String miniChunkID) {
return miniChunkID.replaceFirst(SUFFIX + "$", "");
}
}
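
A quick usage sketch of the ID helpers (the same round trip is exercised by the unit tests further below):

String base = "12345_7";
String mini = MiniChunks.getChunkIdString(base);   // "12345_7_mini"
boolean isMini = MiniChunks.isMiniChunkID(mini);   // true
String back = MiniChunks.getBaseChunkID(mini);     // "12345_7"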

View File

@ -0,0 +1,63 @@
package org.sleuthkit.autopsy.keywordsearch;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
* Parse query using Solr
*/
class QueryParser {
static class Result {
/**
* field name -> [term]
*/
Map<String, List<String>> fieldTermsMap;
}
/**
* Parse the given query string on Solr and return the result
*/
static Result parse(String query, List<Server.Schema> fields) throws KeywordSearchModuleException, NoOpenCoreException {
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(true);
q.setQuery(fields.stream().map(f -> String.format("%s:%s", f, KeywordSearchUtil.escapeLuceneQuery(query))).collect(Collectors.joining(" OR ")));
q.setRows(0);
QueryResponse response = KeywordSearch.getServer().query(q, SolrRequest.METHOD.POST);
Map<String, Object> debugMap = response.getDebugMap();
String parsedQuery = debugMap.getOrDefault("parsedquery", "").toString();
Result result = new Result();
result.fieldTermsMap = getFieldTermsMap(parsedQuery);
return result;
}
static Map<String, List<String>> getFieldTermsMap(String parsedQuery) {
Map<String, List<String>> map = new HashMap<>();
for (String fieldTermStr : parsedQuery.split(" ")) {
String[] fieldTerm = fieldTermStr.split(":");
if (fieldTerm.length != 2) {
continue;
}
String field = fieldTerm[0];
String term = fieldTerm[1];
List<String> terms = map.getOrDefault(field, new ArrayList<>());
terms.add(term);
map.put(field, terms);
}
return map;
}
}

View File

@ -39,6 +39,7 @@ import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
@ -130,6 +131,18 @@ public class Server {
return "content_ws"; //NON-NLS
}
},
CONTENT_JA {
@Override
public String toString() {
return "content_ja"; //NON-NLS
}
},
LANGUAGE {
@Override
public String toString() {
return "language"; //NON-NLS
}
},
FILE_NAME {
@Override
public String toString() {
@ -175,6 +188,12 @@ public class Server {
public String toString() {
return "chunk_size"; //NON-NLS
}
},
TERMFREQ {
@Override
public String toString() {
return "termfreq"; //NON-NLS
}
}
};
@ -1635,7 +1654,8 @@ public class Server {
private int queryNumFileChunks(long contentID) throws SolrServerException, IOException {
String id = KeywordSearchUtil.escapeLuceneQuery(Long.toString(contentID));
final SolrQuery q
= new SolrQuery(Server.Schema.ID + ":" + id + Server.CHUNK_ID_SEPARATOR + "*");
= new SolrQuery(Server.Schema.ID + ":" + id + Server.CHUNK_ID_SEPARATOR + "*"
+ " NOT " + Server.Schema.ID + ":*" + MiniChunks.SUFFIX);
q.setRows(0);
return (int) query(q).getResults().getNumFound();
}
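
With mini chunk documents in the index, the chunk-count query has to exclude them or every overlapping chunk would be counted twice. For illustration, for a content object with ID 12345 the query built above reads:

String id = "12345";
String queryStr = Server.Schema.ID + ":" + id + Server.CHUNK_ID_SEPARATOR + "*"
        + " NOT " + Server.Schema.ID + ":*" + MiniChunks.SUFFIX;
// queryStr == "id:12345_* NOT id:*_mini", so "_mini" documents do not inflate the count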

View File

@ -0,0 +1,40 @@
package org.sleuthkit.autopsy.keywordsearch;
import org.junit.Test;
import java.util.Arrays;
import java.util.HashMap;
import static org.junit.Assert.assertEquals;
public class LanguageSpecificContentQueryHelperTest {
@Test
public void makeQueryString() {
assertEquals("text:query OR content_ja:query", LanguageSpecificContentQueryHelper.expandQueryString("query"));
}
@Test
public void findNthIndexOf() {
assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "_", 0));
assertEquals(0, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 0));
assertEquals(2, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 1));
assertEquals(3, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 2));
assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 3));
assertEquals(0, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "", 0));
assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("", "A", 0));
assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", -1));
assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 999));
}
@Test
public void buildTermfreqQuery() {
QueryParser.Result result = new QueryParser.Result();
result.fieldTermsMap = new HashMap<>();
result.fieldTermsMap.put("field1", Arrays.asList("term1"));
result.fieldTermsMap.put("field2", Arrays.asList("term1", "term2"));
assertEquals(
"termfreq:sum(termfreq(\"field1\",\"term1\"),termfreq(\"field1\",\"term1\"),termfreq(\"field1\",\"term1\"))",
LanguageSpecificContentQueryHelper.buildTermfreqQuery("query", result));
}
}

View File

@ -0,0 +1,25 @@
package org.sleuthkit.autopsy.keywordsearch;
import org.junit.Assert;
import org.junit.Test;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
public class MiniChunksTest {
@Test
public void isMiniChunkID() {
assertTrue(MiniChunks.isMiniChunkID("1_1_mini"));
assertFalse(MiniChunks.isMiniChunkID("1_1"));
assertFalse(MiniChunks.isMiniChunkID("1"));
}
@Test
public void getBaseChunkID() {
Assert.assertEquals("1_1", MiniChunks.getBaseChunkID("1_1_mini"));
Assert.assertEquals("1_1", MiniChunks.getBaseChunkID("1_1"));
Assert.assertEquals("1", MiniChunks.getBaseChunkID("1"));
}
}

View File

@ -0,0 +1,20 @@
package org.sleuthkit.autopsy.keywordsearch;
import org.junit.Test;
import java.util.List;
import java.util.Map;
import static org.junit.Assert.assertEquals;
public class QueryParserTest {
@Test
public void getFieldTermsMap() {
Map<String, List<String>> map = QueryParser.getFieldTermsMap("content_ja:雨 content_ja:降る");
List<String> terms = map.get("content_ja");
assertEquals(2, terms.size());
assertEquals("", terms.get(0));
assertEquals("降る", terms.get(1));
}
}