Mirror of https://github.com/overcuriousity/autopsy-flatpak.git, synced 2025-07-06 21:00:22 +00:00

Restore keyword search files

This commit is contained in:
parent 2d47246c8d
commit bbaf25453f

420  KeywordSearch/solr/solr/configsets/AutopsyConfig/conf/lang/stoptags_ja.txt  (new executable file)
@@ -0,0 +1,420 @@
#
# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
#
# Any token with a part-of-speech tag that exactly matches those defined in this
# file is removed from the token stream.
#
# Set your own stoptags by uncommenting the lines below. Note that comments are
# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists,
# etc. that can be useful for building your own stoptag set.
#
# The entire possible tagset is provided below for convenience.
#
#####
# noun: unclassified nouns
#名詞
#
# noun-common: Common nouns or nouns where the sub-classification is undefined
#名詞-一般
#
# noun-proper: Proper nouns where the sub-classification is undefined
#名詞-固有名詞
#
# noun-proper-misc: miscellaneous proper nouns
#名詞-固有名詞-一般
#
# noun-proper-person: Personal names where the sub-classification is undefined
#名詞-固有名詞-人名
#
# noun-proper-person-misc: names that cannot be divided into surname and
# given name; foreign names; names where the surname or given name is unknown.
# e.g. お市の方
#名詞-固有名詞-人名-一般
#
# noun-proper-person-surname: Mainly Japanese surnames.
# e.g. 山田
#名詞-固有名詞-人名-姓
#
# noun-proper-person-given_name: Mainly Japanese given names.
# e.g. 太郎
#名詞-固有名詞-人名-名
#
# noun-proper-organization: Names representing organizations.
# e.g. 通産省, NHK
#名詞-固有名詞-組織
#
# noun-proper-place: Place names where the sub-classification is undefined
#名詞-固有名詞-地域
#
# noun-proper-place-misc: Place names excluding countries.
# e.g. アジア, バルセロナ, 京都
#名詞-固有名詞-地域-一般
#
# noun-proper-place-country: Country names.
# e.g. 日本, オーストラリア
#名詞-固有名詞-地域-国
#
# noun-pronoun: Pronouns where the sub-classification is undefined
#名詞-代名詞
#
# noun-pronoun-misc: miscellaneous pronouns:
# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ
#名詞-代名詞-一般
#
# noun-pronoun-contraction: Spoken language contraction made by combining a
# pronoun and the particle 'wa'.
# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ
#名詞-代名詞-縮約
#
# noun-adverbial: Temporal nouns such as names of days or months that behave
# like adverbs. Nouns that represent amount or ratios and can be used adverbially,
# e.g. 金曜, 一月, 午後, 少量
#名詞-副詞可能
#
# noun-verbal: Nouns that take arguments with case and can appear followed by
# 'suru' and related verbs (する, できる, なさる, くださる)
# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り
#名詞-サ変接続
#
# noun-adjective-base: The base form of adjectives, words that appear before な ("na")
# e.g. 健康, 安易, 駄目, だめ
#名詞-形容動詞語幹
#
# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数.
# e.g. 0, 1, 2, 何, 数, 幾
#名詞-数
#
# noun-affix: noun affixes where the sub-classification is undefined
#名詞-非自立
#
# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that
# attach to the base form of inflectional words, words that cannot be classified
# into any of the other categories below. This category includes indefinite nouns.
# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第,
# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み,
# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳,
# わり, 割り, 割, ん-口語/, もん-口語/
#名詞-非自立-一般
#
# noun-affix-adverbial: noun affixes that can behave as adverbs.
# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ,
# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか,
# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所,
# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま,
# 儘, 侭, みぎり, 矢先
#名詞-非自立-副詞可能
#
# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars
# with the stem よう(だ) ("you(da)").
# e.g. よう, やう, 様 (よう)
#名詞-非自立-助動詞語幹
#
# noun-affix-adjective-base: noun affixes that can connect to the indeclinable
# connection form な (aux "da").
# e.g. みたい, ふう
#名詞-非自立-形容動詞語幹
#
# noun-special: special nouns where the sub-classification is undefined.
#名詞-特殊
#
# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news is
# treated as 助動詞 ("auxiliary verb") in school grammars, and attaches to the base
# form of inflectional words.
# e.g. そう
#名詞-特殊-助動詞語幹
#
# noun-suffix: noun suffixes where the sub-classification is undefined.
#名詞-接尾
#
# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect
# to ガル or タイ and can combine into compound nouns, words that cannot be classified into
# any of the other categories below. In general, this category is more inclusive than
# 接尾語 ("suffix") and is usually the last element in a compound noun.
# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み,
# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用
#名詞-接尾-一般
#
# noun-suffix-person: Suffixes that form nouns and attach to person names more often
# than other nouns.
# e.g. 君, 様, 著
#名詞-接尾-人名
#
# noun-suffix-place: Suffixes that form nouns and attach to place names more often
# than other nouns.
# e.g. 町, 市, 県
#名詞-接尾-地域
#
# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that
# can appear before スル ("suru").
# e.g. 化, 視, 分け, 入り, 落ち, 買い
#名詞-接尾-サ変接続
#
# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions is
# treated as 助動詞 ("auxiliary verb") in school grammars, and attaches to the
# conjunctive form of inflectional words.
# e.g. そう
#名詞-接尾-助動詞語幹
#
# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive
# form of inflectional words and appear before the copula だ ("da").
# e.g. 的, げ, がち
#名詞-接尾-形容動詞語幹
#
# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs.
# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ)
#名詞-接尾-副詞可能
#
# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category
# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach
# to numbers.
# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半
#名詞-接尾-助数詞
#
# noun-suffix-special: Special suffixes that mainly attach to inflecting words.
# e.g. (楽し) さ, (考え) 方
#名詞-接尾-特殊
#
# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words
# together.
# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦)
#名詞-接続詞的
#
# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are
# semantically verb-like.
# e.g. ごらん, ご覧, 御覧, 頂戴
#名詞-動詞非自立的
#
# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry,
# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation")
# is いわく ("iwaku").
#名詞-引用文字列
#
# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and
# behave like an adjective.
# e.g. 申し訳, 仕方, とんでも, 違い
#名詞-ナイ形容詞語幹
#
#####
# prefix: unclassified prefixes
#接頭詞
#
# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms)
# excluding numerical expressions.
# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派)
#接頭詞-名詞接続
#
# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
# in conjunctive form followed by なる/なさる/くださる.
# e.g. お (読みなさい), お (座り)
#接頭詞-動詞接続
#
# prefix-adjectival: Prefixes that attach to adjectives.
# e.g. お (寒いですねえ), バカ (でかい)
#接頭詞-形容詞接続
#
# prefix-numerical: Prefixes that attach to numerical expressions.
# e.g. 約, およそ, 毎時
#接頭詞-数接続
#
#####
# verb: unclassified verbs
#動詞
#
# verb-main:
#動詞-自立
#
# verb-auxiliary:
#動詞-非自立
#
# verb-suffix:
#動詞-接尾
#
#####
# adjective: unclassified adjectives
#形容詞
#
# adjective-main:
#形容詞-自立
#
# adjective-auxiliary:
#形容詞-非自立
#
# adjective-suffix:
#形容詞-接尾
#
#####
# adverb: unclassified adverbs
#副詞
#
# adverb-misc: Words that can be segmented into one unit and where adnominal
# modification is not possible.
# e.g. あいかわらず, 多分
#副詞-一般
#
# adverb-particle_conjunction: Adverbs that can be followed by の, は, に,
# な, する, だ, etc.
# e.g. こんなに, そんなに, あんなに, なにか, なんでも
#副詞-助詞類接続
#
#####
# adnominal: Words that only have noun-modifying forms.
# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう,
# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした,
# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き
#連体詞
#
#####
# conjunction: Conjunctions that can occur independently.
# e.g. が, けれども, そして, じゃあ, それどころか
接続詞
#
#####
# particle: unclassified particles.
助詞
#
# particle-case: case particles where the subclassification is undefined.
助詞-格助詞
#
# particle-case-misc: Case particles.
# e.g. から, が, で, と, に, へ, より, を, の, にて
助詞-格助詞-一般
#
# particle-case-quote: the "to" that appears after nouns, a person’s speech,
# quotation marks, expressions of decisions from a meeting, reasons, judgements,
# conjectures, etc.
# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...)
助詞-格助詞-引用
#
# particle-case-compound: Compounds of particles and verbs that mainly behave
# like case particles.
# e.g. という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って,
# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける,
# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し,
# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして,
# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって,
# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る,
# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる,
# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ
助詞-格助詞-連語
#
# particle-conjunctive:
# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども,
# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/,
# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/
助詞-接続助詞
#
# particle-dependency:
# e.g. こそ, さえ, しか, すら, は, も, ぞ
助詞-係助詞
#
# particle-adverbial:
# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/,
# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/,
# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに,
# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/,
# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」)
助詞-副助詞
#
# particle-interjective: particles with interjective grammatical roles.
# e.g. (松島) や
助詞-間投助詞
#
# particle-coordinate:
# e.g. と, たり, だの, だり, とか, なり, や, やら
助詞-並立助詞
#
# particle-final:
# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ,
# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/
助詞-終助詞
#
# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is
# adverbial, conjunctive, or sentence final. For example:
# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」
# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」
# 「(祈りが届いたせい) か (, 試験に合格した.)」
# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」
# e.g. か
助詞-副助詞/並立助詞/終助詞
#
# particle-adnominalizer: The "no" that attaches to nouns and modifies
# non-inflectional words.
助詞-連体化
#
# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs
# that are giongo, giseigo, or gitaigo.
# e.g. に, と
助詞-副詞化
#
# particle-special: A particle that does not fit into one of the above classifications.
# This includes particles that are used in Tanka, Haiku, and other poetry.
# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家)
助詞-特殊
#
#####
# auxiliary-verb:
助動詞
#
#####
# interjection: Greetings and other exclamations.
# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます,
# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい
#感動詞
#
#####
# symbol: unclassified Symbols.
記号
#
# symbol-misc: A general symbol not in one of the categories below.
# e.g. [○◎@$〒→+]
記号-一般
#
# symbol-comma: Commas
# e.g. [,、]
記号-読点
#
# symbol-period: Periods and full stops.
# e.g. [..。]
記号-句点
#
# symbol-space: Full-width whitespace.
記号-空白
#
# symbol-open_bracket:
# e.g. [({‘“『【]
記号-括弧開
#
# symbol-close_bracket:
# e.g. [)}’”』」】]
記号-括弧閉
#
# symbol-alphabetic:
#記号-アルファベット
#
#####
# other: unclassified other
#その他
#
# other-interjection: Words that are hard to classify as noun-suffixes or
# sentence-final particles.
# e.g. (だ)ァ
その他-間投
#
#####
# filler: Aizuchi that occurs during a conversation or sounds inserted as filler.
# e.g. あの, うんと, えと
フィラー
#
#####
# non-verbal: non-verbal sound.
非言語音
#
#####
# fragment:
#語断片
#
#####
# unknown: unknown part of speech.
#未知語
#
##### End of file
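To make the effect of the uncommented tags concrete: with 助詞-格助詞-一般 enabled above, the case particle を in 本を読む is dropped from the token stream at index time, while 本 and 読む are kept, since the 名詞 and 動詞 tag lines remain commented out.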
127  KeywordSearch/solr/solr/configsets/AutopsyConfig/conf/lang/stopwords_ja.txt  (new executable file)
@@ -0,0 +1,127 @@
#
# This file defines a stopword set for Japanese.
#
# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia.
# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745
# for frequency lists, etc. that can be useful for making your own set (if desired)
#
# Note that there is an overlap between these stopwords and the terms stopped when used
# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
# that comments are not allowed on the same line as stopwords.
#
# Also note that stopping is done in a case-insensitive manner. Change your StopFilter
# configuration if you need case-sensitive stopping. Lastly, note that stopping is done
# using the same character width as the entries in this file. Since this StopFilter is
# normally done after a CJKWidthFilter in your chain, you would usually want your romaji
# entries to be in half-width and your kana entries to be in full-width.
#
の
に
は
を
た
が
で
て
と
し
れ
さ
ある
いる
も
する
から
な
こと
として
い
や
れる
など
なっ
ない
この
ため
その
あっ
よう
また
もの
という
あり
まで
られ
なる
へ
か
だ
これ
によって
により
おり
より
による
ず
なり
られる
において
ば
なかっ
なく
しかし
について
せ
だっ
その後
できる
それ
う
ので
なお
のみ
でき
き
つ
における
および
いう
さらに
でも
ら
たり
その他
に関する
たち
ます
ん
なら
に対して
特に
せる
及び
これら
とき
では
にて
ほか
ながら
うち
そして
とともに
ただし
かつて
それぞれ
または
お
ほど
ものの
に対する
ほとんど
と共に
といった
です
とも
ところ
ここ
##### End of file
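Both language files above are consumed by Solr's Japanese analysis chain. For reference, a stock Solr text_ja field type wires them in roughly as follows; this is a sketch of the standard Solr configuration, and the actual AutopsyConfig schema.xml is not part of this commit excerpt:

    <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
      <analyzer>
        <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
        <filter class="solr.JapaneseBaseFormFilterFactory"/>
        <!-- drops tokens whose part-of-speech tag is uncommented in stoptags_ja.txt -->
        <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt"/>
        <filter class="solr.CJKWidthFilterFactory"/>
        <!-- drops the hand-picked stopwords from stopwords_ja.txt -->
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt"/>
        <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>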
46  KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Language.java  (new executable file)
@@ -0,0 +1,46 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.util.Arrays;
import java.util.Optional;

/**
 * Language.
 *
 * Contents which are detected to have these languages should be indexed to a
 * corresponding language-specific field such as content_ja.
 */
public enum Language {
    JAPANESE("ja");

    private String value;

    String getValue() {
        return value;
    }

    static Optional<Language> fromValue(String value) {
        return Arrays.stream(Language.values()).filter(x -> x.value.equals(value)).findFirst();
    }

    Language(String value) {
        this.value = value;
    }
}
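A minimal usage sketch of the enum above (hypothetical caller code in the same package):

    Optional<Language> ja = Language.fromValue("ja");   // Optional[JAPANESE]
    Optional<Language> en = Language.fromValue("en");   // Optional.empty(): no language-specific field for English
    String fieldSuffix = Language.JAPANESE.getValue();  // "ja", as in the content_ja field name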
60  KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageDetector.java  (new executable file)
@@ -0,0 +1,60 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Optional;

/**
 * Detects the language of the given contents. Only languages which should be indexed to a
 * corresponding language-specific field are detected.
 */
class LanguageDetector {

    private com.optimaize.langdetect.LanguageDetector impl;
    private TextObjectFactory textObjectFactory;

    LanguageDetector() {
        try {
            impl = LanguageDetectorBuilder.create(NgramExtractors.standard())
                .withProfiles(new LanguageProfileReader().readAllBuiltIn())
                .build();
            textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
        } catch (IOException e) {
            // The IOException here could occur when failing to read the language profiles from the classpath.
            // That can be considered a severe IO problem. Nothing can be done here.
            throw new UncheckedIOException(e);
        }
    }

    Optional<Language> detect(String text) {
        TextObject textObject = textObjectFactory.forText(text);
        Optional<LdLocale> localeOpt = impl.detect(textObject).transform(Optional::of).or(Optional.empty());
        return localeOpt.map(LdLocale::getLanguage).flatMap(Language::fromValue);
    }
}
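A usage sketch (hypothetical caller in the same package); note that detection over short or ambiguous text may come back empty:

    LanguageDetector detector = new LanguageDetector();
    Optional<Language> detected = detector.detect("これは日本語のテキストです。");
    detected.ifPresent(lang -> System.out.println(lang.getValue())); // "ja" when Japanese is detected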
85  KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentIndexingHelper.java  (new executable file)
@@ -0,0 +1,85 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.common.SolrInputDocument;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.healthmonitor.HealthMonitor;
import org.sleuthkit.autopsy.healthmonitor.TimingMetric;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;

/**
 * A helper class to support indexing language-specific fields.
 */
class LanguageSpecificContentIndexingHelper {

    private final LanguageDetector languageDetector = new LanguageDetector();

    Optional<Language> detectLanguageIfNeeded(Chunker.Chunk chunk) throws NoOpenCoreException {
        // Language-specific fields are only available with index schema 2.2 and later;
        // skip detection on older indexes.
        double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
        if (2.2 <= indexSchemaVersion) {
            return languageDetector.detect(chunk.toString());
        } else {
            return Optional.empty();
        }
    }

    void updateLanguageSpecificFields(Map<String, Object> fields, Chunker.Chunk chunk, Language language) {
        List<String> values = new ArrayList<>();
        values.add(chunk.toString());
        if (fields.containsKey(Server.Schema.FILE_NAME.toString())) {
            values.add(fields.get(Server.Schema.FILE_NAME.toString()).toString());
        }

        // index the chunk to a language specific field
        fields.put(Server.Schema.CONTENT_JA.toString(), values);
        fields.put(Server.Schema.LANGUAGE.toString(), language.getValue());
    }

    void indexMiniChunk(Chunker.Chunk chunk, String sourceName, Map<String, Object> fields, String baseChunkID, Language language)
        throws Ingester.IngesterException {
        // Make a SolrInputDocument out of the field map
        SolrInputDocument updateDoc = new SolrInputDocument();
        for (String key : fields.keySet()) {
            updateDoc.addField(key, fields.get(key));
        }

        try {
            updateDoc.setField(Server.Schema.ID.toString(), MiniChunkHelper.getChunkIdString(baseChunkID));

            // index the overlapping tail of the chunk to a language specific field
            updateDoc.addField(Server.Schema.CONTENT_JA.toString(), chunk.toString().substring(chunk.getBaseChunkLength()));
            updateDoc.addField(Server.Schema.LANGUAGE.toString(), language.getValue());

            TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");

            KeywordSearch.getServer().addDocument(updateDoc);
            HealthMonitor.submitTimingMetric(metric);

        } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
            throw new Ingester.IngesterException(
                NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
        }
    }
}
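A sketch of how this helper could be driven while indexing one chunk. This is a hypothetical caller; the real call sites live in Autopsy's Ingester, which is not part of this commit excerpt, and they decide when a mini chunk is actually needed (the last chunk of a file has no overlapping tail):

    void indexChunkWithLanguage(Chunker.Chunk chunk, Map<String, Object> fields, String baseChunkID, String sourceName)
            throws NoOpenCoreException, Ingester.IngesterException {
        LanguageSpecificContentIndexingHelper helper = new LanguageSpecificContentIndexingHelper();
        Optional<Language> language = helper.detectLanguageIfNeeded(chunk);
        if (language.isPresent()) {
            // add content_ja and language to the main chunk document ...
            helper.updateLanguageSpecificFields(fields, chunk, language.get());
            // ... and index the overlapping tail of the chunk as a separate mini chunk
            helper.indexMiniChunk(chunk, sourceName, fields, baseChunkID, language.get());
        }
    }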
248  KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentQueryHelper.java  (new executable file)
@@ -0,0 +1,248 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
import org.sleuthkit.autopsy.coreutils.Version;
import org.sleuthkit.datamodel.TskException;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * A helper class to support querying documents which have language-specific fields.
 */
final class LanguageSpecificContentQueryHelper {

    private LanguageSpecificContentQueryHelper() {}

    private static final List<Server.Schema> QUERY_FIELDS = new ArrayList<>();
    private static final List<Server.Schema> LANGUAGE_SPECIFIC_CONTENT_FIELDS
        = Collections.singletonList(Server.Schema.CONTENT_JA);
    private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);

    static {
        QUERY_FIELDS.add(Server.Schema.TEXT);
        QUERY_FIELDS.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS);
    }

    /**
     * Holds the query response for later processing related to language-specific fields
     */
    static class QueryResults {
        List<SolrDocument> chunks = new ArrayList<>();
        Map</* ID */ String, SolrDocument> miniChunks = new HashMap<>();
        // objectId_chunk -> "text" -> List of previews
        Map<String, Map<String, List<String>>> highlighting = new HashMap<>();
    }

    /**
     * Make a query string from the given one by applying it to the multiple query fields
     *
     * @param queryStr escaped query string
     * @return query string
     */
    static String expandQueryString(final String queryStr) {
        List<String> fieldQueries = new ArrayList<>();
        fieldQueries.add(Server.Schema.TEXT.toString() + ":" + queryStr);
        fieldQueries.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS.stream().map(field -> field.toString() + ":" + queryStr).collect(Collectors.toList()));
        return String.join(" OR ", fieldQueries);
    }

    static List<Server.Schema> getQueryFields() {
        return QUERY_FIELDS;
    }

    static void updateQueryResults(QueryResults results, SolrDocument document) {
        String id = (String) document.getFieldValue(Server.Schema.ID.toString());
        if (MiniChunkHelper.isMiniChunkID(id)) {
            results.miniChunks.put(MiniChunkHelper.getBaseChunkID(id), document);
        } else {
            results.chunks.add(document);
        }
    }

    /**
     * Get snippets
     *
     * @param highlight field ID -> snippets
     * @return snippets of appropriate fields.
     * Note that this method returns {@code Optional.empty} if the result is empty for convenience
     * to interact with the existing code.
     */
    static Optional<List<String>> getHighlights(Map<String, List<String>> highlight) {
        for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
            if (highlight.containsKey(field.toString())) {
                return Optional.of(highlight.get(field.toString()));
            }
        }
        return Optional.empty();
    }

    /**
     * Merge KeywordHits from the TEXT field and a language specific field.
     *
     * Replaces a KeywordHit in the given {@code matches} if its chunk ID is the same.
     */
    static List<KeywordHit> mergeKeywordHits(List<KeywordHit> matches, Keyword originalKeyword, QueryResults queryResults) throws KeywordSearchModuleException {
        Map<String, KeywordHit> map = findMatches(originalKeyword, queryResults).stream().collect(Collectors.toMap(KeywordHit::getSolrDocumentId, x -> x));
        List<KeywordHit> merged = new ArrayList<>();

        // first, replace KeywordHit in matches
        for (KeywordHit match : matches) {
            String key = match.getSolrDocumentId();
            if (map.containsKey(key)) {
                merged.add(map.get(key));
                map.remove(key);
            } else {
                merged.add(match);
            }
        }
        // second, add rest of KeywordHits from queryResults
        merged.addAll(map.values());

        return merged;
    }

    static void configureTermfreqQuery(SolrQuery query, String keyword) throws KeywordSearchModuleException, NoOpenCoreException {
        // make a request to Solr to parse query.
        QueryTermHelper.Result queryParserResult = QueryTermHelper.parse(keyword, LANGUAGE_SPECIFIC_CONTENT_FIELDS);
        query.addField(buildTermfreqQuery(keyword, queryParserResult));
    }

    static String buildTermfreqQuery(String keyword, QueryTermHelper.Result result) {
        List<String> termfreqs = new ArrayList<>();
        for (Map.Entry<String, List<String>> e : result.fieldTermsMap.entrySet()) {
            String field = e.getKey();
            for (String term : e.getValue()) {
                termfreqs.add(String.format("termfreq(\"%s\",\"%s\")", field, KeywordSearchUtil.escapeLuceneQuery(term)));
            }
        }

        // sum of all language specific query fields.
        // only one of these fields could be non-zero.
        return String.format("termfreq:sum(%s)", String.join(",", termfreqs));
    }

    static int queryChunkTermfreq(Set<String> keywords, String contentID) throws KeywordSearchModuleException, NoOpenCoreException {
        SolrQuery q = new SolrQuery();
        q.setShowDebugInfo(DEBUG);

        final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentID);
        final String highlightQuery = keywords.stream()
            .map(s -> LanguageSpecificContentQueryHelper.expandQueryString(
                KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
            .collect(Collectors.joining(" "));

        q.addFilterQuery(filterQuery);
        q.setQuery(highlightQuery);
        LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());

        QueryResponse response = KeywordSearch.getServer().query(q, SolrRequest.METHOD.POST);
        SolrDocumentList results = response.getResults();
        if (results.isEmpty()) {
            return 0;
        }

        SolrDocument document = results.get(0);
        return ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
    }

    static int findNthIndexOf(String s, String pattern, int n) {
        int found = 0;
        int idx = -1;
        int len = s.length();
        while (idx < len && found <= n) {
            idx = s.indexOf(pattern, idx + 1);
            if (idx == -1) {
                break;
            }
            found++;
        }

        return idx;
    }

    private static List<KeywordHit> findMatches(Keyword originalKeyword, QueryResults queryResults) throws KeywordSearchModuleException {
        List<KeywordHit> matches = new ArrayList<>();
        for (SolrDocument document : queryResults.chunks) {
            String docId = (String) document.getFieldValue(Server.Schema.ID.toString());

            try {
                int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
                SolrDocument miniChunk = queryResults.miniChunks.get(docId);
                if (miniChunk == null) {
                    // the last chunk does not have a mini chunk because there is no overlapping region with the next one
                    matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
                } else {
                    int hitCountInMiniChunk = ((Float) miniChunk.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
                    if (hitCountInMiniChunk < hitCountInChunk) {
                        // there is at least one hit in the base chunk
                        matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
                    }
                }
            } catch (TskException ex) {
                throw new KeywordSearchModuleException(ex);
            }
        }
        return matches;
    }

    /**
     * copied from LuceneQuery and modified to use getHighlightFieldValue
     */
    private static KeywordHit createKeywordHit(Keyword originalKeyword, Map<String, Map<String, List<String>>> highlightResponse, String docId) throws TskException {
        /**
         * Get the first snippet from the document if keyword search is
         * configured to use snippets.
         */
        String snippet = "";
        if (KeywordSearchSettings.getShowSnippets()) {
            List<String> snippetList = getHighlightFieldValue(highlightResponse.get(docId)).orElse(null);
            // list is null if there wasn't a snippet
            if (snippetList != null) {
                snippet = EscapeUtil.unEscapeHtml(snippetList.get(0)).trim();
            }
        }

        return new KeywordHit(docId, snippet, originalKeyword.getSearchTerm());
    }

    /**
     * @return Optional.empty if empty
     */
    private static Optional<List<String>> getHighlightFieldValue(Map<String, List<String>> highlight) {
        for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
            if (highlight.containsKey(field.toString())) {
                return Optional.of(highlight.get(field.toString()));
            }
        }
        return Optional.empty();
    }
}
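A worked example of the mini-chunk bookkeeping in findMatches above, as the code reads: each chunk such as 1_5 also indexes its overlapping tail as mini chunk 1_5_mini (the region shared with chunk 1_6). If a term occurs twice in 1_5 and twice in 1_5_mini, every hit lies in the overlap, so no KeywordHit is created for 1_5 and the same text is instead counted in 1_6. If the mini-chunk count is lower than the chunk count, at least one hit falls inside the base chunk and a KeywordHit is recorded for 1_5. This avoids double-counting hits that sit entirely in the overlap between adjacent chunks.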
41  KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/MiniChunkHelper.java  (new executable file)
@@ -0,0 +1,41 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

/**
 * Mini-chunk related methods.
 */
final class MiniChunkHelper {

    private MiniChunkHelper() {}

    static String SUFFIX = "_mini";

    // e.g. "1_1" -> "1_1_mini"
    static String getChunkIdString(String baseChunkID) {
        return baseChunkID + SUFFIX;
    }

    static boolean isMiniChunkID(String chunkID) {
        return chunkID.endsWith(SUFFIX);
    }

    // e.g. "1_1_mini" -> "1_1"; IDs without the suffix are returned unchanged
    static String getBaseChunkID(String miniChunkID) {
        return miniChunkID.replaceFirst(SUFFIX + "$", "");
    }
}
95  KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/QueryTermHelper.java  (new executable file)
@@ -0,0 +1,95 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.FieldAnalysisRequest;
import org.apache.solr.client.solrj.response.AnalysisResponseBase;
import org.apache.solr.client.solrj.response.FieldAnalysisResponse;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * Get terms from a query using Solr.
 *
 * This class is used to find matched terms from query results.
 */
final class QueryTermHelper {

    private QueryTermHelper() {}

    /**
     * Result of the {@link #parse} method
     */
    static class Result {
        /**
         * field name -> [term]
         */
        final Map<String, List<String>> fieldTermsMap = new HashMap<>();
    }

    /**
     * Parse the given query string on Solr and return the result
     *
     * @param query  query to parse
     * @param fields field names to use for parsing
     */
    static Result parse(String query, List<Server.Schema> fields) throws KeywordSearchModuleException, NoOpenCoreException {
        Server server = KeywordSearch.getServer();

        FieldAnalysisRequest request = new FieldAnalysisRequest();
        for (Server.Schema field : fields) {
            request.addFieldName(field.toString());
        }
        // FieldAnalysisRequest requires its field value property to be set, even though
        // the corresponding analysis.fieldvalue parameter is not needed for this request.
        // Setting an empty value does not affect the result.
        request.setFieldValue("");
        request.setQuery(query);

        FieldAnalysisResponse response = new FieldAnalysisResponse();
        try {
            response.setResponse(server.request(request));
        } catch (SolrServerException e) {
            throw new KeywordSearchModuleException(e);
        }

        Result result = new Result();
        for (Map.Entry<String, FieldAnalysisResponse.Analysis> entry : response.getAllFieldNameAnalysis()) {
            Iterator<AnalysisResponseBase.AnalysisPhase> it = entry.getValue().getQueryPhases().iterator();

            // The last phase is the one which is used in the search process.
            AnalysisResponseBase.AnalysisPhase lastPhase = null;
            while (it.hasNext()) {
                lastPhase = it.next();
            }

            if (lastPhase != null) {
                List<String> tokens = lastPhase.getTokens().stream().map(AnalysisResponseBase.TokenInfo::getText).collect(Collectors.toList());
                result.fieldTermsMap.put(entry.getKey(), tokens);
            }
        }

        return result;
    }
}
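A usage sketch of the parser above (hypothetical caller in the same package, assuming the usual java.util imports): it asks Solr how a keyword tokenizes under each field's analysis chain.

    QueryTermHelper.Result result = QueryTermHelper.parse("検索キーワード",
            Collections.singletonList(Server.Schema.CONTENT_JA));
    List<String> terms = result.fieldTermsMap.get(Server.Schema.CONTENT_JA.toString());
    // terms now holds the analyzed tokens, e.g. ["検索", "キーワード"] under a Japanese analysis chain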
59  LanguageSpecificContentQueryHelperTest.java  (new executable file)
@@ -0,0 +1,59 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import org.junit.Test;

import java.util.Arrays;

import static org.junit.Assert.assertEquals;

/**
 * tests for LanguageSpecificContentQueryHelper
 */
public class LanguageSpecificContentQueryHelperTest {

    @Test
    public void makeQueryString() {
        assertEquals("text:query OR content_ja:query", LanguageSpecificContentQueryHelper.expandQueryString("query"));
    }

    @Test
    public void findNthIndexOf() {
        assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "_", 0));
        assertEquals(0, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 0));
        assertEquals(2, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 1));
        assertEquals(3, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 2));
        assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 3));
        assertEquals(0, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "", 0));
        assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("", "A", 0));
        assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", -1));
        assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 999));
    }

    @Test
    public void buildTermfreqQuery() {
        QueryTermHelper.Result result = new QueryTermHelper.Result();
        result.fieldTermsMap.put("field1", Arrays.asList("term1"));
        result.fieldTermsMap.put("field2", Arrays.asList("term1", "term2"));
        assertEquals(
            "termfreq:sum(termfreq(\"field1\",\"term1\"),termfreq(\"field2\",\"term1\"),termfreq(\"field2\",\"term2\"))",
            LanguageSpecificContentQueryHelper.buildTermfreqQuery("query", result));
    }
}
46  MiniChunkHelperTest.java  (new executable file)
@@ -0,0 +1,46 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import org.junit.Assert;
import org.junit.Test;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

/**
 * tests for MiniChunkHelper
 */
public class MiniChunkHelperTest {

    @Test
    public void isMiniChunkID() {
        assertTrue(MiniChunkHelper.isMiniChunkID("1_1_mini"));
        assertFalse(MiniChunkHelper.isMiniChunkID("1_1"));
        assertFalse(MiniChunkHelper.isMiniChunkID("1"));
    }

    @Test
    public void getBaseChunkID() {
        Assert.assertEquals("1_1", MiniChunkHelper.getBaseChunkID("1_1_mini"));
        Assert.assertEquals("1_1", MiniChunkHelper.getBaseChunkID("1_1"));
        Assert.assertEquals("1", MiniChunkHelper.getBaseChunkID("1"));
    }

}