Japanese search feature

Akira Ueda 2019-08-26 18:57:01 +09:00
parent 478f735a4b
commit 6491c0fe52
19 changed files with 1258 additions and 28 deletions

View File

@ -21,6 +21,7 @@
<dependency conf="autopsy->*" org="org.apache.solr" name="solr-solrj" rev="4.9.1"/>
<dependency conf="autopsy->*" org="commons-lang" name="commons-lang" rev="2.4"/>
<dependency conf="autopsy->*" org="commons-validator" name="commons-validator" rev="1.5.1"/>
<dependency conf="autopsy->*" org="com.optimaize.languagedetector" name="language-detector" rev="0.6"/>
<!-- Exclude the version of cxf-rt-rs-client from Tika 1.20, one of its dependencies breaks Ivy -->
<dependency conf="autopsy->*" org="org.apache.tika" name="tika-parsers" rev="1.20">
<exclude module="cxf-rt-rs-client"/>

View File

@ -29,6 +29,7 @@ file.reference.jericho-html-3.3.jar=release/modules/ext/jericho-html-3.3.jar
file.reference.joda-time-2.2.jar=release/modules/ext/joda-time-2.2.jar
file.reference.json-simple-1.1.1.jar=release/modules/ext/json-simple-1.1.1.jar
file.reference.juniversalchardet-1.0.3.jar=release/modules/ext/juniversalchardet-1.0.3.jar
file.reference.language-detector-0.6.jar=release/modules/ext/language-detector-0.6.jar
file.reference.libsvm-3.1.jar=release/modules/ext/libsvm-3.1.jar
file.reference.log4j-1.2.17.jar=release/modules/ext/log4j-1.2.17.jar
file.reference.lucene-core-4.0.0.jar=release/modules/ext/lucene-core-4.0.0.jar

View File

@ -467,6 +467,10 @@
<runtime-relative-path>ext/vorbis-java-tika-0.8.jar</runtime-relative-path>
<binary-origin>release/modules/ext/vorbis-java-tika-0.8.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/language-detector-0.6.jar</runtime-relative-path>
<binary-origin>release/modules/ext/language-detector-0.6.jar</binary-origin>
</class-path-extension>
</data>
</configuration>
</project>

View File

@ -0,0 +1,420 @@
#
# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
#
# Any token with a part-of-speech tag that exactly matches those defined in this
# file is removed from the token stream.
#
# Set your own stoptags by uncommenting the lines below. Note that comments are
# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists,
# etc. that can be useful for building your own stoptag set.
#
# The entire possible tagset is provided below for convenience.
#
#####
# noun: unclassified nouns
#名詞
#
# noun-common: Common nouns or nouns where the sub-classification is undefined
#名詞-一般
#
# noun-proper: Proper nouns where the sub-classification is undefined
#名詞-固有名詞
#
# noun-proper-misc: miscellaneous proper nouns
#名詞-固有名詞-一般
#
# noun-proper-person: Personal names where the sub-classification is undefined
#名詞-固有名詞-人名
#
# noun-proper-person-misc: names that cannot be divided into surname and
# given name; foreign names; names where the surname or given name is unknown.
# e.g. お市の方
#名詞-固有名詞-人名-一般
#
# noun-proper-person-surname: Mainly Japanese surnames.
# e.g. 山田
#名詞-固有名詞-人名-姓
#
# noun-proper-person-given_name: Mainly Japanese given names.
# e.g. 太郎
#名詞-固有名詞-人名-名
#
# noun-proper-organization: Names representing organizations.
# e.g. 通産省, NHK
#名詞-固有名詞-組織
#
# noun-proper-place: Place names where the sub-classification is undefined
#名詞-固有名詞-地域
#
# noun-proper-place-misc: Place names excluding countries.
# e.g. アジア, バルセロナ, 京都
#名詞-固有名詞-地域-一般
#
# noun-proper-place-country: Country names.
# e.g. 日本, オーストラリア
#名詞-固有名詞-地域-国
#
# noun-pronoun: Pronouns where the sub-classification is undefined
#名詞-代名詞
#
# noun-pronoun-misc: miscellaneous pronouns:
# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ
#名詞-代名詞-一般
#
# noun-pronoun-contraction: Spoken language contraction made by combining a
# pronoun and the particle 'wa'.
# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ
#名詞-代名詞-縮約
#
# noun-adverbial: Temporal nouns such as names of days or months that behave
# like adverbs. Nouns that represent amount or ratios and can be used adverbially,
# e.g. 金曜, 一月, 午後, 少量
#名詞-副詞可能
#
# noun-verbal: Nouns that take arguments with case and can appear followed by
# 'suru' and related verbs (する, できる, なさる, くださる)
# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り
#名詞-サ変接続
#
# noun-adjective-base: The base form of adjectives, words that appear before な ("na")
# e.g. 健康, 安易, 駄目, だめ
#名詞-形容動詞語幹
#
# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数.
# e.g. 0, 1, 2, 何, 数, 幾
#名詞-数
#
# noun-affix: noun affixes where the sub-classification is undefined
#名詞-非自立
#
# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that
# attach to the base form of inflectional words, words that cannot be classified
# into any of the other categories below. This category includes indefinite nouns.
# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第,
# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み,
# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳,
# わり, 割り, 割, ん-口語/, もん-口語/
#名詞-非自立-一般
#
# noun-affix-adverbial: noun affixes that can behave as adverbs.
# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ,
# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか,
# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所,
# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま,
# 儘, 侭, みぎり, 矢先
#名詞-非自立-副詞可能
#
# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars
# with the stem よう(だ) ("you(da)").
# e.g. よう, やう, 様 (よう)
#名詞-非自立-助動詞語幹
#
# noun-affix-adjective-base: noun affixes that can connect to the indeclinable
# connection form な (aux "da").
# e.g. みたい, ふう
#名詞-非自立-形容動詞語幹
#
# noun-special: special nouns where the sub-classification is undefined.
#名詞-特殊
#
# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is
# treated as 助動詞 ("auxiliary verb") in school grammars, and attaches to the base
# form of inflectional words.
# e.g. そう
#名詞-特殊-助動詞語幹
#
# noun-suffix: noun suffixes where the sub-classification is undefined.
#名詞-接尾
#
# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect
# to ガル or タイ and can combine into compound nouns, words that cannot be classified into
# any of the other categories below. In general, this category is more inclusive than
# 接尾語 ("suffix") and is usually the last element in a compound noun.
# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み,
# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用
#名詞-接尾-一般
#
# noun-suffix-person: Suffixes that form nouns and attach to person names more often
# than other nouns.
# e.g. 君, 様, 著
#名詞-接尾-人名
#
# noun-suffix-place: Suffixes that form nouns and attach to place names more often
# than other nouns.
# e.g. 町, 市, 県
#名詞-接尾-地域
#
# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that
# can appear before スル ("suru").
# e.g. 化, 視, 分け, 入り, 落ち, 買い
#名詞-接尾-サ変接続
#
# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions,
# is treated as 助動詞 ("auxiliary verb") in school grammars, and attaches to the
# conjunctive form of inflectional words.
# e.g. そう
#名詞-接尾-助動詞語幹
#
# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive
# form of inflectional words and appear before the copula だ ("da").
# e.g. 的, げ, がち
#名詞-接尾-形容動詞語幹
#
# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs.
# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ)
#名詞-接尾-副詞可能
#
# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category
# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach
# to numbers.
# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半
#名詞-接尾-助数詞
#
# noun-suffix-special: Special suffixes that mainly attach to inflecting words.
# e.g. (楽し) さ, (考え) 方
#名詞-接尾-特殊
#
# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words
# together.
# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦)
#名詞-接続詞的
#
# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are
# semantically verb-like.
# e.g. ごらん, ご覧, 御覧, 頂戴
#名詞-動詞非自立的
#
# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry,
# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation")
# is いわく ("iwaku").
#名詞-引用文字列
#
# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and
# behave like an adjective.
# e.g. 申し訳, 仕方, とんでも, 違い
#名詞-ナイ形容詞語幹
#
#####
# prefix: unclassified prefixes
#接頭詞
#
# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms)
# excluding numerical expressions.
# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派)
#接頭詞-名詞接続
#
# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
# in conjunctive form followed by なる/なさる/くださる.
# e.g. お (読みなさい), お (座り)
#接頭詞-動詞接続
#
# prefix-adjectival: Prefixes that attach to adjectives.
# e.g. お (寒いですねえ), バカ (でかい)
#接頭詞-形容詞接続
#
# prefix-numerical: Prefixes that attach to numerical expressions.
# e.g. 約, およそ, 毎時
#接頭詞-数接続
#
#####
# verb: unclassified verbs
#動詞
#
# verb-main:
#動詞-自立
#
# verb-auxiliary:
#動詞-非自立
#
# verb-suffix:
#動詞-接尾
#
#####
# adjective: unclassified adjectives
#形容詞
#
# adjective-main:
#形容詞-自立
#
# adjective-auxiliary:
#形容詞-非自立
#
# adjective-suffix:
#形容詞-接尾
#
#####
# adverb: unclassified adverbs
#副詞
#
# adverb-misc: Words that can be segmented into one unit and where adnominal
# modification is not possible.
# e.g. あいかわらず, 多分
#副詞-一般
#
# adverb-particle_conjunction: Adverbs that can be followed by の, は, に,
# な, する, だ, etc.
# e.g. こんなに, そんなに, あんなに, なにか, なんでも
#副詞-助詞類接続
#
#####
# adnominal: Words that only have noun-modifying forms.
# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう,
# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした,
# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き
#連体詞
#
#####
# conjunction: Conjunctions that can occur independently.
# e.g. が, けれども, そして, じゃあ, それどころか
接続詞
#
#####
# particle: unclassified particles.
助詞
#
# particle-case: case particles where the subclassification is undefined.
助詞-格助詞
#
# particle-case-misc: Case particles.
# e.g. から, が, で, と, に, へ, より, を, の, にて
助詞-格助詞-一般
#
# particle-case-quote: the "to" that appears after nouns, a person's speech,
# quotation marks, expressions of decisions from a meeting, reasons, judgements,
# conjectures, etc.
# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...)
助詞-格助詞-引用
#
# particle-case-compound: Compounds of particles and verbs that mainly behave
# like case particles.
# e.g. という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って,
# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける,
# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し,
# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして,
# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって,
# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る,
# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる,
# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ
助詞-格助詞-連語
#
# particle-conjunctive:
# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども,
# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/,
# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/
助詞-接続助詞
#
# particle-dependency:
# e.g. こそ, さえ, しか, すら, は, も, ぞ
助詞-係助詞
#
# particle-adverbial:
# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/,
# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/,
# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに,
# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/,
# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」)
助詞-副助詞
#
# particle-interjective: particles with interjective grammatical roles.
# e.g. (松島) や
助詞-間投助詞
#
# particle-coordinate:
# e.g. と, たり, だの, だり, とか, なり, や, やら
助詞-並立助詞
#
# particle-final:
# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ,
# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/
助詞-終助詞
#
# particle-adverbial/conjunctive/final: The particle "ka" when it is unknown whether it is
# adverbial, conjunctive, or sentence final. For example:
# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」
# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」
# 「(祈りが届いたせい) か (, 試験に合格した.)」
# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」
# e.g. か
助詞-副助詞/並立助詞/終助詞
#
# particle-adnominalizer: The "no" that attaches to nouns and modifies
# non-inflectional words.
助詞-連体化
#
# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs
# that are giongo, giseigo, or gitaigo.
# e.g. に, と
助詞-副詞化
#
# particle-special: A particle that does not fit into one of the above classifications.
# This includes particles that are used in Tanka, Haiku, and other poetry.
# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家)
助詞-特殊
#
#####
# auxiliary-verb:
助動詞
#
#####
# interjection: Greetings and other exclamations.
# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます,
# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい
#感動詞
#
#####
# symbol: unclassified Symbols.
記号
#
# symbol-misc: A general symbol not in one of the categories below.
# e.g. [○◎@$〒→+]
記号-一般
#
# symbol-comma: Commas
# e.g. [,、]
記号-読点
#
# symbol-period: Periods and full stops.
# e.g. [．.。]
記号-句点
#
# symbol-space: Full-width whitespace.
記号-空白
#
# symbol-open_bracket:
# e.g. [({‘“『【]
記号-括弧開
#
# symbol-close_bracket:
# e.g. [)}’”』」】]
記号-括弧閉
#
# symbol-alphabetic:
#記号-アルファベット
#
#####
# other: unclassified other
#その他
#
# other-interjection: Words that are hard to classify as noun-suffixes or
# sentence-final particles.
# e.g. (だ)ァ
その他-間投
#
#####
# filler: Aizuchi that occurs during a conversation or sounds inserted as filler.
# e.g. あの, うんと, えと
フィラー
#
#####
# non-verbal: non-verbal sound.
非言語音
#
#####
# fragment:
#語断片
#
#####
# unknown: unknown part of speech.
#未知語
#
##### End of file
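
Of the tags above, only the uncommented entries (接続詞, the 助詞 particle sub-tags, 助動詞, most 記号 symbol tags, その他-間投, フィラー, and 非言語音) are actually removed by JapanesePartOfSpeechStopFilter; the commented lines document the rest of the tagset. For illustration only (not part of the commit), a minimal stand-alone sketch using Lucene's JapaneseAnalyzer, which chains the Kuromoji tokenizer with a part-of-speech stop filter like this one; the exact tokens depend on the dictionary and Lucene version:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StopTagDemo {
    public static void main(String[] args) throws IOException {
        // JapaneseAnalyzer bundles the Kuromoji tokenizer, a part-of-speech stop filter,
        // and a Japanese stopword set similar to the files added in this commit.
        Analyzer analyzer = new JapaneseAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("content_ja", "雨が降る")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // the particle が is dropped by the stop-tag filter; 雨 and 降る remain
                System.out.println(term.toString());
            }
            ts.end();
        }
    }
}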

View File

@ -0,0 +1,127 @@
#
# This file defines a stopword set for Japanese.
#
# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia.
# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745
# for frequency lists, etc. that can be useful for making your own set (if desired)
#
# Note that there is an overlap between these stopwords and the terms stopped when used
# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
# that comments are not allowed on the same line as stopwords.
#
# Also note that stopping is done in a case-insensitive manner. Change your StopFilter
# configuration if you need case-sensitive stopping. Lastly, note that stopping is done
# using the same character width as the entries in this file. Since this StopFilter is
# normally done after a CJKWidthFilter in your chain, you would usually want your romaji
# entries to be in half-width and your kana entries to be in full-width.
#
ある
いる
する
から
こと
として
れる
など
なっ
ない
この
ため
その
あっ
よう
また
もの
という
あり
まで
られ
なる
これ
によって
により
おり
より
による
なり
られる
において
なかっ
なく
しかし
について
だっ
その後
できる
それ
ので
なお
のみ
でき
における
および
いう
さらに
でも
たり
その他
に関する
たち
ます
なら
に対して
特に
せる
及び
これら
とき
では
にて
ほか
ながら
うち
そして
とともに
ただし
かつて
それぞれ
または
ほど
ものの
に対する
ほとんど
と共に
といった
です
とも
ところ
ここ
##### End of file

View File

@ -243,6 +243,18 @@
</analyzer>
</fieldType>
<fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
<analyzer>
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
<filter class="solr.JapaneseBaseFormFilterFactory"/>
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" />
<filter class="solr.CJKWidthFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" />
<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<!-- A text field with defaults appropriate for English: it
tokenizes with StandardTokenizer, removes English stop words
(stopwords_en.txt), down cases, protects words from protwords.txt, and
@ -557,6 +569,11 @@
via copyField further on in this schema -->
<field name="text" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>
<!-- Store language detection result. Only parents of text documents have this -->
<field name="language" type="string" indexed="false" stored="true" required="false"/>
<field name="content_ja" type="text_ja" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>
<!-- catchall text field that indexes tokens both normally and in reverse for efficient
leading wildcard queries. -->
<!--<field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/>-->
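
With these schema additions, each chunk whose language is detected as Japanese is stored with a language field and is additionally indexed into content_ja through the text_ja analyzer chain above. A small SolrJ sketch, for illustration only (the field names come from this schema; the query values are hypothetical), of querying those fields directly:

import org.apache.solr.client.solrj.SolrQuery;

class JapaneseFieldQueryExample {
    static SolrQuery build() {
        SolrQuery q = new SolrQuery();
        q.setQuery("content_ja:雨");        // analyzed by the text_ja (Kuromoji) chain at query time
        q.addFilterQuery("language:ja");    // only chunks the language detector tagged as Japanese
        q.setFields("id", "language");
        q.setHighlight(true);
        q.addHighlightField("content_ja");  // highlight snippets come back under the content_ja key
        return q;
    }
}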

View File

@ -38,6 +38,7 @@ import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest.METHOD;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
@ -346,6 +347,8 @@ class HighlightedText implements IndexedText {
String chunkID = "";
String highlightField = "";
try {
double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
loadPageInfo(); //inits once
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
@ -359,16 +362,33 @@ class HighlightedText implements IndexedText {
highlightField = LuceneQuery.HIGHLIGHT_FIELD;
if (isLiteral) {
//if the query is literal try to get solr to do the highlighting
final String highlightQuery = keywords.stream()
.map(HighlightedText::constructEscapedSolrQuery)
.collect(Collectors.joining(" "));
if (2.2 <= indexSchemaVersion) {
//if the query is literal try to get solr to do the highlighting
final String highlightQuery = keywords.stream().map(s ->
LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.escapeLuceneQuery(s)))
.collect(Collectors.joining(" OR "));
q.setQuery(highlightQuery);
for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
q.addField(field.toString());
q.addHighlightField(field.toString());
}
q.addField(Server.Schema.LANGUAGE.toString());
// in case of single term literal query there is only 1 term
LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
q.addFilterQuery(filterQuery);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
} else {
//if the query is literal try to get solr to do the highlighting
final String highlightQuery = keywords.stream()
.map(HighlightedText::constructEscapedSolrQuery)
.collect(Collectors.joining(" "));
q.setQuery(highlightQuery);
q.addField(highlightField);
q.addFilterQuery(filterQuery);
q.addHighlightField(highlightField);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
q.setQuery(highlightQuery);
q.addField(highlightField);
q.addFilterQuery(filterQuery);
q.addHighlightField(highlightField);
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
}
//tune the highlighter
q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
@ -406,12 +426,40 @@ class HighlightedText implements IndexedText {
if (responseHighlightID == null) {
highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
} else {
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
if (contentHighlights == null) {
highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
SolrDocument document = response.getResults().get(0);
Object language = document.getFieldValue(Server.Schema.LANGUAGE.toString());
if (2.2 <= indexSchemaVersion && language != null) {
List<String> contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
if (contentHighlights == null) {
highlightedContent = "";
} else {
int hitCountInMiniChunk = LanguageSpecificContentQueryHelper.queryChunkTermfreq(keywords, MiniChunks.getChunkIdString(contentIdStr));
String s = contentHighlights.get(0).trim();
// If there is a mini-chunk, trim the content not to show highlighted text in it.
if (0 < hitCountInMiniChunk) {
int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
s,
HIGHLIGHT_PRE,
// trim after the last hit in chunk
hitCountInChunk - hitCountInMiniChunk);
if (idx != -1) {
highlightedContent = s.substring(0, idx);
} else {
highlightedContent = s;
}
} else {
highlightedContent = s;
}
}
} else {
// extracted content (minus highlight tags) is HTML-escaped
highlightedContent = contentHighlights.get(0).trim();
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
if (contentHighlights == null) {
highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
} else {
// extracted content (minus highlight tags) is HTML-escaped
highlightedContent = contentHighlights.get(0).trim();
}
}
}
}
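
The trimming above relies on per-document term frequencies: the chunk's hit count minus the mini chunk's hit count is the number of hits that belong to the base part of the chunk. A worked example with hypothetical numbers (highlightedHtml stands in for the snippet string s in the code above; HIGHLIGHT_PRE is the existing highlight-tag constant):

// Hypothetical counts for one chunk and its "_mini" overlap document:
int hitCountInChunk = 3;      // termfreq over the whole chunk (base text plus overlap window)
int hitCountInMiniChunk = 1;  // termfreq over the overlap window only
// 3 - 1 = 2 hits belong to the base part, so findNthIndexOf locates the 3rd highlight tag
// (occurrence index 2) and the highlighted HTML is cut just before it:
int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
        highlightedHtml, HIGHLIGHT_PRE, hitCountInChunk - hitCountInMiniChunk);
String shown = (idx != -1) ? highlightedHtml.substring(0, idx) : highlightedHtml;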

View File

@ -39,7 +39,7 @@ class IndexFinder {
private static final String KWS_DATA_FOLDER_NAME = "data";
private static final String INDEX_FOLDER_NAME = "index";
private static final String CURRENT_SOLR_VERSION = "4";
private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.1";
private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.2";
static String getCurrentSolrVersion() {
return CURRENT_SOLR_VERSION;
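
Bumping CURRENT_SOLR_SCHEMA_VERSION to 2.2 is the switch for all of the new behavior: throughout the commit, the language-specific code paths are guarded by the same check, for example:

double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
if (2.2 <= indexSchemaVersion) {
    // language detection, content_ja indexing, and per-field highlighting are active
}

Cases opened with an older schema keep the pre-2.2 behavior.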

View File

@ -20,8 +20,10 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.io.BufferedReader;
import java.io.Reader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.client.solrj.SolrServerException;
@ -59,6 +61,8 @@ class Ingester {
private final Server solrServer = KeywordSearch.getServer();
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
private static Ingester instance;
private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
= new LanguageSpecificContentIndexingHelper();
private Ingester() {
}
@ -93,7 +97,7 @@ class Ingester {
* file, but the Solr server is probably fine.
*/
void indexMetaDataOnly(AbstractFile file) throws IngesterException {
indexChunk("", file.getName().toLowerCase(), getContentFields(file));
indexChunk("", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
}
/**
@ -107,7 +111,7 @@ class Ingester {
* artifact, but the Solr server is probably fine.
*/
void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
indexChunk("", sourceName, getContentFields(artifact));
indexChunk("", sourceName, new HashMap<>(getContentFields(artifact)));
}
/**
@ -143,21 +147,30 @@ class Ingester {
<T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
int numChunks = 0; //unknown until chunking is done
Map<String, String> fields = getContentFields(source);
Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
//Get a reader for the content of the given source
try (BufferedReader reader = new BufferedReader(sourceReader)) {
Chunker chunker = new Chunker(reader);
for (Chunk chunk : chunker) {
while (chunker.hasNext()) {
Chunk chunk = chunker.next();
if (context != null && context.fileIngestIsCancelled()) {
logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
return false;
}
Map<String, Object> fields = new HashMap<>(contentFields);
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
fields.put(Server.Schema.ID.toString(), chunkId);
fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
Optional<LanguageSpecificContentIndexingHelper.Language> language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk);
language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
try {
//add the chunk text to Solr index
indexChunk(chunk.toString(), sourceName, fields);
// add mini chunk when there's a language specific field
if (chunker.hasNext() && language.isPresent()) {
languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
}
numChunks++;
} catch (Ingester.IngesterException ingEx) {
logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
@ -177,6 +190,7 @@ class Ingester {
if (context != null && context.fileIngestIsCancelled()) {
return false;
} else {
Map<String, Object> fields = new HashMap<>(contentFields);
//after all chunks, index just the meta data, including the numChunks, of the parent file
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
//reset id field to base document id
@ -202,7 +216,7 @@ class Ingester {
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
private void indexChunk(String chunk, String sourceName, Map<String, Object> fields) throws IngesterException {
if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
//JMTODO: actually, if we couldn't get the image id it is set to -1,
// but does this really mean we don't want to index it?
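
Each chunk produced by the Chunker ends with a window that overlaps the start of the next chunk. When a language was detected and another chunk follows, that overlap is indexed again as a separate "mini chunk" document, so the query side can later tell hits in the overlap apart from hits in the base text (see LanguageSpecificContentQueryHelper below). A sketch with hypothetical values of what goes into the mini chunk:

// Hypothetical values; at indexing time these come from Chunker.Chunk and Server.
String baseChunkId = "12345_1";                                   // <objectId>_<chunk number>
String chunkText   = "...base text...overlap window text";
int baseLength     = 15;                                          // chunk.getBaseChunkLength()

String miniChunkId   = MiniChunks.getChunkIdString(baseChunkId);  // "12345_1_mini"
String miniChunkText = chunkText.substring(baseLength);           // only the overlap window
// The mini chunk document carries the same content_ja and language fields as its base chunk.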

View File

@ -0,0 +1,55 @@
package org.sleuthkit.autopsy.keywordsearch;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
class LanguageDetector {
public enum Language {
JAPANESE,
ENGLISH,
}
private List<LanguageProfile> languageProfiles;
LanguageDetector() {
try {
languageProfiles = Arrays.asList(
new LanguageProfileReader().readBuiltIn(LdLocale.fromString("en")),
new LanguageProfileReader().readBuiltIn(LdLocale.fromString("ja"))
);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
Optional<Language> detect(String text) {
com.optimaize.langdetect.LanguageDetector languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
.withProfiles(languageProfiles)
.build();
TextObjectFactory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
TextObject textObject = textObjectFactory.forText(text);
return languageDetector.detect(textObject).transform(Optional::of).or(Optional.empty()).map(LdLocale::getLanguage).flatMap(LanguageDetector::toLanguage);
}
private static Optional<Language> toLanguage(String s) {
switch (s) {
case "ja": return Optional.of(Language.JAPANESE);
case "en": return Optional.of(Language.ENGLISH);
default: return Optional.empty();
}
}
}
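
A usage sketch for the new class (the input strings are illustrative; very short or mixed-language text may come back empty because the n-gram detector has nothing reliable to work with):

LanguageDetector detector = new LanguageDetector();

Optional<LanguageDetector.Language> ja = detector.detect("今日は東京で雨が降っています。");
// typically Optional.of(Language.JAPANESE)

Optional<LanguageDetector.Language> en = detector.detect("The quick brown fox jumps over the lazy dog.");
// typically Optional.of(Language.ENGLISH)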

View File

@ -0,0 +1,90 @@
package org.sleuthkit.autopsy.keywordsearch;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.common.SolrInputDocument;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.healthmonitor.HealthMonitor;
import org.sleuthkit.autopsy.healthmonitor.TimingMetric;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
class LanguageSpecificContentIndexingHelper {
enum Language {
JAPANESE
}
private final LanguageDetector languageDetector = new LanguageDetector();
Optional<Language> detectLanguageIfNeeded(Chunker.Chunk chunk) throws NoOpenCoreException {
double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
if (2.2 <= indexSchemaVersion) {
return languageDetector.detect(chunk.toString()).flatMap(lang -> Optional.ofNullable(toLanguage(lang)));
} else {
return Optional.empty();
}
}
void updateLanguageSpecificFields(Map<String, Object> fields, Chunker.Chunk chunk, Language language) {
List<String> values = new ArrayList<>();
values.add(chunk.toString());
if (fields.containsKey(Server.Schema.FILE_NAME.toString())) {
values.add(fields.get(Server.Schema.FILE_NAME.toString()).toString());
}
// index the chunk to a language specific field
fields.put(Server.Schema.CONTENT_JA.toString(), values);
fields.put(Server.Schema.LANGUAGE.toString(), toFieldValue(language));
}
void indexMiniChunk(Chunker.Chunk chunk, String sourceName, Map<String, Object> fields, String baseChunkID, Language language)
throws Ingester.IngesterException {
//Make a SolrInputDocument out of the field map
SolrInputDocument updateDoc = new SolrInputDocument();
for (String key : fields.keySet()) {
updateDoc.addField(key, fields.get(key));
}
try {
updateDoc.setField(Server.Schema.ID.toString(), MiniChunks.getChunkIdString(baseChunkID));
// index the chunk to a language specific field
updateDoc.addField(Server.Schema.CONTENT_JA.toString(), chunk.toString().substring(chunk.getBaseChunkLength()));
updateDoc.addField(Server.Schema.LANGUAGE.toString(), toFieldValue(language));
TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");
KeywordSearch.getServer().addDocument(updateDoc);
HealthMonitor.submitTimingMetric(metric);
} catch (KeywordSearchModuleException | NoOpenCoreException ex) {
throw new Ingester.IngesterException(
NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
}
}
private static String toFieldValue(Language language) {
if (language == null) {
return null;
}
switch (language) {
case JAPANESE: return "ja";
default:
throw new IllegalStateException("Unknown language: " + language);
}
}
private Language toLanguage(LanguageDetector.Language language) {
if (language == null) {
return null;
}
switch (language) {
case JAPANESE: return Language.JAPANESE;
default:
return null;
}
}
}
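
In the Ingester loop shown earlier, this helper is applied to each chunk's field map; a sketch of what it adds when Japanese is detected (helper stands for the languageSpecificContentIndexingHelper field, the other names are as in Ingester, values illustrative):

Map<String, Object> fields = new HashMap<>(contentFields);   // ID, CHUNK_SIZE, FILE_NAME, ...
helper.detectLanguageIfNeeded(chunk)
        .ifPresent(lang -> helper.updateLanguageSpecificFields(fields, chunk, lang));
// For a Japanese chunk, fields now also contains:
//   content_ja -> [chunk text, file name]   (indexed through the text_ja analyzer)
//   language   -> "ja"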

View File

@ -0,0 +1,218 @@
package org.sleuthkit.autopsy.keywordsearch;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
import org.sleuthkit.autopsy.coreutils.Version;
import org.sleuthkit.datamodel.TskException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
class LanguageSpecificContentQueryHelper {
private static final List<Server.Schema> QUERY_FIELDS = new ArrayList<>();
private static final List<Server.Schema> LANGUAGE_SPECIFIC_CONTENT_FIELDS
= Collections.singletonList(Server.Schema.CONTENT_JA);
private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
static {
QUERY_FIELDS.add(Server.Schema.TEXT);
QUERY_FIELDS.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS);
}
static class QueryResults {
List<SolrDocument> chunks = new ArrayList<>();
Map</* ID */ String, SolrDocument> miniChunks = new HashMap<>();
// objectId_chunk -> "text" -> List of previews
Map<String, Map<String, List<String>>> highlighting = new HashMap<>();
}
/**
* Make a query string from the given one by applying it to each of the query fields
* @param queryStr escaped query string
* @return query string
*/
static String expandQueryString(final String queryStr) {
List<String> fieldQueries = new ArrayList<>();
fieldQueries.add(Server.Schema.TEXT.toString() + ":" + KeywordSearchUtil.quoteQuery(queryStr));
fieldQueries.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS.stream().map(field -> field.toString() + ":" + queryStr).collect(Collectors.toList()));
return String.join(" OR ", fieldQueries);
}
static List<Server.Schema> getQueryFields() {
return QUERY_FIELDS;
}
static void updateQueryResults(QueryResults results, SolrDocument document) {
String id = (String) document.getFieldValue(Server.Schema.ID.toString());
if (MiniChunks.isMiniChunkID(id)) {
results.miniChunks.put(MiniChunks.getBaseChunkID(id), document);
} else {
results.chunks.add(document);
}
}
/**
* Get snippets
*
* @param highlight field ID -> snippets
* @return snippets of appropriate fields.
* Note that this method returns {@code Optional.empty} if the result is empty, for convenience when interacting with the existing code.
*/
static Optional<List<String>> getHighlights(Map<String, List<String>> highlight) {
for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
if (highlight.containsKey(field.toString())) {
return Optional.of(highlight.get(field.toString()));
}
}
return Optional.empty();
}
/**
* Merge KeywordHits from TEXT field and a language specific field
*
* Replace KeywordHits in the given {@code matches} if its chunk ID is same.
*/
static List<KeywordHit> mergeKeywordHits(List<KeywordHit> matches, Keyword originalKeyword, QueryResults queryResults) throws KeywordSearchModuleException {
Map<String, KeywordHit> map = findMatches(originalKeyword, queryResults).stream().collect(Collectors.toMap(KeywordHit::getSolrDocumentId, x -> x));
List<KeywordHit> merged = new ArrayList<>();
// first, replace KeywordHit in matches
for (KeywordHit match : matches) {
String key = match.getSolrDocumentId();
if (map.containsKey(key)) {
merged.add(map.get(key));
map.remove(key);
} else {
merged.add(match);
}
}
// second, add rest of KeywordHits from queryResults
merged.addAll(map.values());
return merged;
}
static void configureTermfreqQuery(SolrQuery query, String keyword) throws KeywordSearchModuleException, NoOpenCoreException {
// make a request to Solr to parse query.
QueryParser.Result queryParserResult = QueryParser.parse(keyword, LANGUAGE_SPECIFIC_CONTENT_FIELDS);
query.addField(buildTermfreqQuery(keyword, queryParserResult));
}
static String buildTermfreqQuery(String keyword, QueryParser.Result result) {
List<String> termfreqs = new ArrayList<>();
for (Map.Entry<String, List<String>> e : result.fieldTermsMap.entrySet()) {
String field = e.getKey();
for (String term : e.getValue()) {
termfreqs.add(String.format("termfreq(\"%s\",\"%s\")", field, KeywordSearchUtil.escapeLuceneQuery(term)));
}
}
// sum of all language specific query fields.
// at most one of these fields can be non-zero for a given document.
return String.format("termfreq:sum(%s)", String.join(",", termfreqs));
}
static int queryChunkTermfreq(Set<String> keywords, String contentID) throws KeywordSearchModuleException, NoOpenCoreException {
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG);
final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentID);
final String highlightQuery = keywords.stream()
.map(s -> LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.escapeLuceneQuery(s)))
.collect(Collectors.joining(" "));
q.addFilterQuery(filterQuery);
q.setQuery(highlightQuery);
LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
QueryResponse response = KeywordSearch.getServer().query(q, SolrRequest.METHOD.POST);
SolrDocumentList results = response.getResults();
if (results.isEmpty()) {
return 0;
}
SolrDocument document = results.get(0);
return ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
}
static int findNthIndexOf(String s, String pattern, int n) {
int found = 0;
int idx = -1;
int len = s.length();
while (idx < len && found <= n) {
idx = s.indexOf(pattern, idx + 1);
if (idx == -1) {
break;
}
found++;
}
return idx;
}
private static List<KeywordHit> findMatches(Keyword originalKeyword, QueryResults queryResults) throws KeywordSearchModuleException {
List<KeywordHit> matches = new ArrayList<>();
for (SolrDocument document : queryResults.chunks) {
String docId = (String) document.getFieldValue(Server.Schema.ID.toString());
try {
int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
SolrDocument miniChunk = queryResults.miniChunks.get(docId);
if (miniChunk == null) {
// the last chunk does not have a mini chunk because there is no overlap region with the next one
matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
} else {
int hitCountInMiniChunk = ((Float) miniChunk.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
if (hitCountInMiniChunk < hitCountInChunk) {
// there is at least one hit in the base chunk
matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
}
}
} catch (TskException ex) {
throw new KeywordSearchModuleException(ex);
}
}
return matches;
}
/** copied from LuceneQuery and modified to use getHighlightFieldValue */
private static KeywordHit createKeywordHit(Keyword originalKeyword, Map<String, Map<String, List<String>>> highlightResponse, String docId) throws TskException {
/**
* Get the first snippet from the document if keyword search is
* configured to use snippets.
*/
String snippet = "";
if (KeywordSearchSettings.getShowSnippets()) {
List<String> snippetList = getHighlightFieldValue(highlightResponse.get(docId)).orElse(null);
// list is null if there wasn't a snippet
if (snippetList != null) {
snippet = EscapeUtil.unEscapeHtml(snippetList.get(0)).trim();
}
}
return new KeywordHit(docId, snippet, originalKeyword.getSearchTerm());
}
/**
* @return Optional.empty if empty
*/
private static Optional<List<String>> getHighlightFieldValue(Map<String, List<String>> highlight) {
for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
if (highlight.containsKey(field.toString())) {
return Optional.of(highlight.get(field.toString()));
}
}
return Optional.empty();
}
}
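
Two concrete outputs help tie the pieces together. expandQueryString applies a plain, already escaped term to every query field, and buildTermfreqQuery asks Solr to return a per-document hit count as a pseudo-field:

String expanded = LanguageSpecificContentQueryHelper.expandQueryString("query");
// -> "text:query OR content_ja:query"   (per the unit test further below)

// buildTermfreqQuery with fieldTermsMap = { content_ja: [雨, 降る] } produces:
//   termfreq:sum(termfreq("content_ja","雨"),termfreq("content_ja","降る"))
// Solr evaluates the sum per document; it is read back via Server.Schema.TERMFREQ,
// both for full chunks and for their "_mini" overlap documents.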

View File

@ -134,6 +134,7 @@ class LuceneQuery implements KeywordSearchQuery {
String cursorMark = CursorMarkParams.CURSOR_MARK_START;
boolean allResultsProcessed = false;
List<KeywordHit> matches = new ArrayList<>();
LanguageSpecificContentQueryHelper.QueryResults languageSpecificQueryResults = new LanguageSpecificContentQueryHelper.QueryResults();
while (!allResultsProcessed) {
solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
QueryResponse response = solrServer.query(solrQuery, SolrRequest.METHOD.POST);
@ -141,7 +142,18 @@ class LuceneQuery implements KeywordSearchQuery {
// objectId_chunk -> "text" -> List of previews
Map<String, Map<String, List<String>>> highlightResponse = response.getHighlighting();
if (2.2 <= indexSchemaVersion) {
languageSpecificQueryResults.highlighting.putAll(response.getHighlighting());
}
for (SolrDocument resultDoc : resultList) {
if (2.2 <= indexSchemaVersion) {
Object language = resultDoc.getFieldValue(Server.Schema.LANGUAGE.toString());
if (language != null) {
LanguageSpecificContentQueryHelper.updateQueryResults(languageSpecificQueryResults, resultDoc);
}
}
try {
/*
* for each result doc, check that the first occurrence of
@ -153,6 +165,11 @@ class LuceneQuery implements KeywordSearchQuery {
final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
final Collection<Object> content = resultDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
// if the document has language, it should be hit in language specific content fields. So skip here.
if (resultDoc.containsKey(Server.Schema.LANGUAGE.toString())) {
continue;
}
if (indexSchemaVersion < 2.0) {
//old schema versions don't support chunk_size or the content_str fields, so just accept hits
matches.add(createKeywordtHit(highlightResponse, docId));
@ -179,9 +196,16 @@ class LuceneQuery implements KeywordSearchQuery {
cursorMark = nextCursorMark;
}
List<KeywordHit> mergedMatches;
if (2.2 <= indexSchemaVersion) {
mergedMatches = LanguageSpecificContentQueryHelper.mergeKeywordHits(matches, originalKeyword, languageSpecificQueryResults);
} else {
mergedMatches = matches;
}
QueryResults results = new QueryResults(this);
//in case of single term literal query there is only 1 term
results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), matches);
results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), mergedMatches);
return results;
}
@ -262,19 +286,25 @@ class LuceneQuery implements KeywordSearchQuery {
*
* @return
*/
private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
private SolrQuery createAndConfigureSolrQuery(boolean snippets) throws NoOpenCoreException, KeywordSearchModuleException {
double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(DEBUG); //debug
// Wrap the query string in quotes if this is a literal search term.
String queryStr = originalKeyword.searchTermIsLiteral()
? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped;
? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped;
// Run the query against an optional alternative field.
if (field != null) {
//use the optional field
queryStr = field + ":" + queryStr;
q.setQuery(queryStr);
} else if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
q.setQuery(LanguageSpecificContentQueryHelper.expandQueryString(keywordStringEscaped));
} else {
q.setQuery(queryStr);
}
q.setQuery(queryStr);
q.setRows(MAX_RESULTS_PER_CURSOR_MARK);
// Setting the sort order is necessary for cursor based paging to work.
q.setSort(SolrQuery.SortClause.asc(Server.Schema.ID.toString()));
@ -283,6 +313,11 @@ class LuceneQuery implements KeywordSearchQuery {
Server.Schema.CHUNK_SIZE.toString(),
Server.Schema.CONTENT_STR.toString());
if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
q.addField(Server.Schema.LANGUAGE.toString());
LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywordStringEscaped);
}
for (KeywordQueryFilter filter : filters) {
q.addFilterQuery(filter.toString());
}
@ -300,8 +335,16 @@ class LuceneQuery implements KeywordSearchQuery {
*
* @param q The SolrQuery to configure.
*/
private static void configurwQueryForHighlighting(SolrQuery q) {
q.addHighlightField(HIGHLIGHT_FIELD);
private static void configurwQueryForHighlighting(SolrQuery q) throws NoOpenCoreException {
double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
if (2.2 <= indexSchemaVersion) {
for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
q.addHighlightField(field.toString());
}
} else {
q.addHighlightField(HIGHLIGHT_FIELD);
}
q.setHighlightSnippets(1);
q.setHighlightFragsize(SNIPPET_LENGTH);
@ -404,7 +447,13 @@ class LuceneQuery implements KeywordSearchQuery {
if (responseHighlightID == null) {
return "";
}
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
List<String> contentHighlights;
if (2.2 <= indexSchemaVersion) {
contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
} else {
contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
}
if (contentHighlights == null) {
return "";
} else {

View File

@ -0,0 +1,18 @@
package org.sleuthkit.autopsy.keywordsearch;
class MiniChunks {
static String SUFFIX = "_mini";
static String getChunkIdString(String baseChunkID) {
return baseChunkID + SUFFIX;
}
static boolean isMiniChunkID(String chunkID) {
return chunkID.endsWith(SUFFIX);
}
static String getBaseChunkID(String miniChunkID) {
return miniChunkID.replaceFirst(SUFFIX + "$", "");
}
}
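
A quick usage sketch of the ID helpers (the same round trip is exercised by the unit tests further below):

String base = "12345_7";
String mini = MiniChunks.getChunkIdString(base);   // "12345_7_mini"
boolean isMini = MiniChunks.isMiniChunkID(mini);   // true
String back = MiniChunks.getBaseChunkID(mini);     // "12345_7"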

View File

@ -0,0 +1,63 @@
package org.sleuthkit.autopsy.keywordsearch;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
* Parse query using Solr
*/
class QueryParser {
static class Result {
/**
* field name -> [term]
*/
Map<String, List<String>> fieldTermsMap;
}
/**
* Parse the given query string on Solr and return the result
*/
static Result parse(String query, List<Server.Schema> fields) throws KeywordSearchModuleException, NoOpenCoreException {
SolrQuery q = new SolrQuery();
q.setShowDebugInfo(true);
q.setQuery(fields.stream().map(f -> String.format("%s:%s", f, KeywordSearchUtil.escapeLuceneQuery(query))).collect(Collectors.joining(" OR ")));
q.setRows(0);
QueryResponse response = KeywordSearch.getServer().query(q, SolrRequest.METHOD.POST);
Map<String, Object> debugMap = response.getDebugMap();
String parsedQuery = debugMap.getOrDefault("parsedquery", "").toString();
Result result = new Result();
result.fieldTermsMap = getFieldTermsMap(parsedQuery);
return result;
}
static Map<String, List<String>> getFieldTermsMap(String parsedQuery) {
Map<String, List<String>> map = new HashMap<>();
for (String fieldTermStr : parsedQuery.split(" ")) {
String[] fieldTerm = fieldTermStr.split(":");
if (fieldTerm.length != 2) {
continue;
}
String field = fieldTerm[0];
String term = fieldTerm[1];
List<String> terms = map.getOrDefault(field, new ArrayList<>());
terms.add(term);
map.put(field, terms);
}
return map;
}
}

View File

@ -39,6 +39,7 @@ import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
@ -130,6 +131,18 @@ public class Server {
return "content_ws"; //NON-NLS
}
},
CONTENT_JA {
@Override
public String toString() {
return "content_ja"; //NON-NLS
}
},
LANGUAGE {
@Override
public String toString() {
return "language"; //NON-NLS
}
},
FILE_NAME {
@Override
public String toString() {
@ -175,6 +188,12 @@ public class Server {
public String toString() {
return "chunk_size"; //NON-NLS
}
},
TERMFREQ {
@Override
public String toString() {
return "termfreq"; //NON-NLS
}
}
};
@ -1635,7 +1654,8 @@ public class Server {
private int queryNumFileChunks(long contentID) throws SolrServerException, IOException {
String id = KeywordSearchUtil.escapeLuceneQuery(Long.toString(contentID));
final SolrQuery q
= new SolrQuery(Server.Schema.ID + ":" + id + Server.CHUNK_ID_SEPARATOR + "*");
= new SolrQuery(Server.Schema.ID + ":" + id + Server.CHUNK_ID_SEPARATOR + "*"
+ " NOT " + Server.Schema.ID + ":*" + MiniChunks.SUFFIX);
q.setRows(0);
return (int) query(q).getResults().getNumFound();
}
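
With mini chunk documents in the index, the chunk-count query has to exclude them or every overlapping chunk would be counted twice. For illustration, for a content object with ID 12345 the query built above reads:

String id = "12345";
String queryStr = Server.Schema.ID + ":" + id + Server.CHUNK_ID_SEPARATOR + "*"
        + " NOT " + Server.Schema.ID + ":*" + MiniChunks.SUFFIX;
// queryStr == "id:12345_* NOT id:*_mini", so "_mini" documents do not inflate the count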

View File

@ -0,0 +1,40 @@
package org.sleuthkit.autopsy.keywordsearch;
import org.junit.Test;
import java.util.Arrays;
import java.util.HashMap;
import static org.junit.Assert.assertEquals;
public class LanguageSpecificContentQueryHelperTest {
@Test
public void makeQueryString() {
assertEquals("text:query OR content_ja:query", LanguageSpecificContentQueryHelper.expandQueryString("query"));
}
@Test
public void findNthIndexOf() {
assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "_", 0));
assertEquals(0, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 0));
assertEquals(2, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 1));
assertEquals(3, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 2));
assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 3));
assertEquals(0, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "", 0));
assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("", "A", 0));
assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", -1));
assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 999));
}
@Test
public void buildTermfreqQuery() {
QueryParser.Result result = new QueryParser.Result();
result.fieldTermsMap = new HashMap<>();
result.fieldTermsMap.put("field1", Arrays.asList("term1"));
result.fieldTermsMap.put("field2", Arrays.asList("term1", "term2"));
assertEquals(
"termfreq:sum(termfreq(\"field1\",\"term1\"),termfreq(\"field1\",\"term1\"),termfreq(\"field1\",\"term1\"))",
LanguageSpecificContentQueryHelper.buildTermfreqQuery("query", result));
}
}

View File

@ -0,0 +1,25 @@
package org.sleuthkit.autopsy.keywordsearch;
import org.junit.Assert;
import org.junit.Test;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
public class MiniChunksTest {
@Test
public void isMiniChunkID() {
assertTrue(MiniChunks.isMiniChunkID("1_1_mini"));
assertFalse(MiniChunks.isMiniChunkID("1_1"));
assertFalse(MiniChunks.isMiniChunkID("1"));
}
@Test
public void getBaseChunkID() {
Assert.assertEquals("1_1", MiniChunks.getBaseChunkID("1_1_mini"));
Assert.assertEquals("1_1", MiniChunks.getBaseChunkID("1_1"));
Assert.assertEquals("1", MiniChunks.getBaseChunkID("1"));
}
}

View File

@ -0,0 +1,20 @@
package org.sleuthkit.autopsy.keywordsearch;
import org.junit.Test;
import java.util.List;
import java.util.Map;
import static org.junit.Assert.assertEquals;
public class QueryParserTest {
@Test
public void getFieldTermsMap() {
Map<String, List<String>> map = QueryParser.getFieldTermsMap("content_ja:雨 content_ja:降る");
List<String> terms = map.get("content_ja");
assertEquals(2, terms.size());
assertEquals("", terms.get(0));
assertEquals("降る", terms.get(1));
}
}