From bbaf25453f105797c0392a2d5f2b606414af41fc Mon Sep 17 00:00:00 2001 From: Richard Cordovano Date: Fri, 4 Oct 2019 19:20:09 -0400 Subject: [PATCH] Restore keyword search files --- .../AutopsyConfig/conf/lang/stoptags_ja.txt | 420 ++++++++++++++++++ .../AutopsyConfig/conf/lang/stopwords_ja.txt | 127 ++++++ .../autopsy/keywordsearch/Language.java | 46 ++ .../keywordsearch/LanguageDetector.java | 60 +++ ...LanguageSpecificContentIndexingHelper.java | 85 ++++ .../LanguageSpecificContentQueryHelper.java | 248 +++++++++++ .../keywordsearch/MiniChunkHelper.java | 41 ++ .../keywordsearch/QueryTermHelper.java | 95 ++++ ...anguageSpecificContentQueryHelperTest.java | 59 +++ .../keywordsearch/MiniChunkHelperTest.java | 46 ++ 10 files changed, 1227 insertions(+) create mode 100755 KeywordSearch/solr/solr/configsets/AutopsyConfig/conf/lang/stoptags_ja.txt create mode 100755 KeywordSearch/solr/solr/configsets/AutopsyConfig/conf/lang/stopwords_ja.txt create mode 100755 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Language.java create mode 100755 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageDetector.java create mode 100755 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentIndexingHelper.java create mode 100755 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentQueryHelper.java create mode 100755 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/MiniChunkHelper.java create mode 100755 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/QueryTermHelper.java create mode 100755 KeywordSearch/test/unit/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentQueryHelperTest.java create mode 100755 KeywordSearch/test/unit/src/org/sleuthkit/autopsy/keywordsearch/MiniChunkHelperTest.java diff --git a/KeywordSearch/solr/solr/configsets/AutopsyConfig/conf/lang/stoptags_ja.txt b/KeywordSearch/solr/solr/configsets/AutopsyConfig/conf/lang/stoptags_ja.txt new file mode 100755 index 
0000000000..71b750845e --- /dev/null +++ b/KeywordSearch/solr/solr/configsets/AutopsyConfig/conf/lang/stoptags_ja.txt @@ -0,0 +1,420 @@ +# +# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter. +# +# Any token with a part-of-speech tag that exactly matches those defined in this +# file are removed from the token stream. +# +# Set your own stoptags by uncommenting the lines below. Note that comments are +# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists, +# etc. that can be useful for building you own stoptag set. +# +# The entire possible tagset is provided below for convenience. +# +##### +# noun: unclassified nouns +#名詞 +# +# noun-common: Common nouns or nouns where the sub-classification is undefined +#名詞-一般 +# +# noun-proper: Proper nouns where the sub-classification is undefined +#名詞-固有名詞 +# +# noun-proper-misc: miscellaneous proper nouns +#名詞-固有名詞-一般 +# +# noun-proper-person: Personal names where the sub-classification is undefined +#名詞-固有名詞-人名 +# +# noun-proper-person-misc: names that cannot be divided into surname and +# given name; foreign names; names where the surname or given name is unknown. +# e.g. お市の方 +#名詞-固有名詞-人名-一般 +# +# noun-proper-person-surname: Mainly Japanese surnames. +# e.g. 山田 +#名詞-固有名詞-人名-姓 +# +# noun-proper-person-given_name: Mainly Japanese given names. +# e.g. 太郎 +#名詞-固有名詞-人名-名 +# +# noun-proper-organization: Names representing organizations. +# e.g. 通産省, NHK +#名詞-固有名詞-組織 +# +# noun-proper-place: Place names where the sub-classification is undefined +#名詞-固有名詞-地域 +# +# noun-proper-place-misc: Place names excluding countries. +# e.g. アジア, バルセロナ, 京都 +#名詞-固有名詞-地域-一般 +# +# noun-proper-place-country: Country names. +# e.g. 日本, オーストラリア +#名詞-固有名詞-地域-国 +# +# noun-pronoun: Pronouns where the sub-classification is undefined +#名詞-代名詞 +# +# noun-pronoun-misc: miscellaneous pronouns: +# e.g. 
それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ +#名詞-代名詞-一般 +# +# noun-pronoun-contraction: Spoken language contraction made by combining a +# pronoun and the particle 'wa'. +# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ +#名詞-代名詞-縮約 +# +# noun-adverbial: Temporal nouns such as names of days or months that behave +# like adverbs. Nouns that represent amount or ratios and can be used adverbially, +# e.g. 金曜, 一月, 午後, 少量 +#名詞-副詞可能 +# +# noun-verbal: Nouns that take arguments with case and can appear followed by +# 'suru' and related verbs (する, できる, なさる, くださる) +# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り +#名詞-サ変接続 +# +# noun-adjective-base: The base form of adjectives, words that appear before な ("na") +# e.g. 健康, 安易, 駄目, だめ +#名詞-形容動詞語幹 +# +# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数. +# e.g. 0, 1, 2, 何, 数, 幾 +#名詞-数 +# +# noun-affix: noun affixes where the sub-classification is undefined +#名詞-非自立 +# +# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that +# attach to the base form of inflectional words, words that cannot be classified +# into any of the other categories below. This category includes indefinite nouns. +# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第, +# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み, +# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳, +# わり, 割り, 割, ん-口語/, もん-口語/ +#名詞-非自立-一般 +# +# noun-affix-adverbial: noun affixes that that can behave as adverbs. +# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ, +# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか, +# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所, +# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま, +# 儘, 侭, みぎり, 矢先 +#名詞-非自立-副詞可能 +# +# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars +# with the stem よう(だ) ("you(da)"). +# e.g. 
よう, やう, 様 (よう) +#名詞-非自立-助動詞語幹 +# +# noun-affix-adjective-base: noun affixes that can connect to the indeclinable +# connection form な (aux "da"). +# e.g. みたい, ふう +#名詞-非自立-形容動詞語幹 +# +# noun-special: special nouns where the sub-classification is undefined. +#名詞-特殊 +# +# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is +# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base +# form of inflectional words. +# e.g. そう +#名詞-特殊-助動詞語幹 +# +# noun-suffix: noun suffixes where the sub-classification is undefined. +#名詞-接尾 +# +# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect +# to ガル or タイ and can combine into compound nouns, words that cannot be classified into +# any of the other categories below. In general, this category is more inclusive than +# 接尾語 ("suffix") and is usually the last element in a compound noun. +# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み, +# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用 +#名詞-接尾-一般 +# +# noun-suffix-person: Suffixes that form nouns and attach to person names more often +# than other nouns. +# e.g. 君, 様, 著 +#名詞-接尾-人名 +# +# noun-suffix-place: Suffixes that form nouns and attach to place names more often +# than other nouns. +# e.g. 町, 市, 県 +#名詞-接尾-地域 +# +# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that +# can appear before スル ("suru"). +# e.g. 化, 視, 分け, 入り, 落ち, 買い +#名詞-接尾-サ変接続 +# +# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions, +# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the +# conjunctive form of inflectional words. +# e.g. そう +#名詞-接尾-助動詞語幹 +# +# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive +# form of inflectional words and appear before the copula だ ("da"). +# e.g. 的, げ, がち +#名詞-接尾-形容動詞語幹 +# +# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs. +# e.g. 
後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ) +#名詞-接尾-副詞可能 +# +# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category +# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach +# to numbers. +# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半 +#名詞-接尾-助数詞 +# +# noun-suffix-special: Special suffixes that mainly attach to inflecting words. +# e.g. (楽し) さ, (考え) 方 +#名詞-接尾-特殊 +# +# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words +# together. +# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦) +#名詞-接続詞的 +# +# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are +# semantically verb-like. +# e.g. ごらん, ご覧, 御覧, 頂戴 +#名詞-動詞非自立的 +# +# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, +# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation") +# is いわく ("iwaku"). +#名詞-引用文字列 +# +# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and +# behave like an adjective. +# e.g. 申し訳, 仕方, とんでも, 違い +#名詞-ナイ形容詞語幹 +# +##### +# prefix: unclassified prefixes +#接頭詞 +# +# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) +# excluding numerical expressions. +# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派) +#接頭詞-名詞接続 +# +# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb +# in conjunctive form followed by なる/なさる/くださる. +# e.g. お (読みなさい), お (座り) +#接頭詞-動詞接続 +# +# prefix-adjectival: Prefixes that attach to adjectives. +# e.g. お (寒いですねえ), バカ (でかい) +#接頭詞-形容詞接続 +# +# prefix-numerical: Prefixes that attach to numerical expressions. +# e.g. 
約, およそ, 毎時 +#接頭詞-数接続 +# +##### +# verb: unclassified verbs +#動詞 +# +# verb-main: +#動詞-自立 +# +# verb-auxiliary: +#動詞-非自立 +# +# verb-suffix: +#動詞-接尾 +# +##### +# adjective: unclassified adjectives +#形容詞 +# +# adjective-main: +#形容詞-自立 +# +# adjective-auxiliary: +#形容詞-非自立 +# +# adjective-suffix: +#形容詞-接尾 +# +##### +# adverb: unclassified adverbs +#副詞 +# +# adverb-misc: Words that can be segmented into one unit and where adnominal +# modification is not possible. +# e.g. あいかわらず, 多分 +#副詞-一般 +# +# adverb-particle_conjunction: Adverbs that can be followed by の, は, に, +# な, する, だ, etc. +# e.g. こんなに, そんなに, あんなに, なにか, なんでも +#副詞-助詞類接続 +# +##### +# adnominal: Words that only have noun-modifying forms. +# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう, +# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした, +# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き +#連体詞 +# +##### +# conjunction: Conjunctions that can occur independently. +# e.g. が, けれども, そして, じゃあ, それどころか +接続詞 +# +##### +# particle: unclassified particles. +助詞 +# +# particle-case: case particles where the subclassification is undefined. +助詞-格助詞 +# +# particle-case-misc: Case particles. +# e.g. から, が, で, と, に, へ, より, を, の, にて +助詞-格助詞-一般 +# +# particle-case-quote: the "to" that appears after nouns, a person’s speech, +# quotation marks, expressions of decisions from a meeting, reasons, judgements, +# conjectures, etc. +# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...) +助詞-格助詞-引用 +# +# particle-case-compound: Compounds of particles and verbs that mainly behave +# like case particles. +# e.g. 
という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って, +# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける, +# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し, +# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして, +# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって, +# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る, +# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる, +# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ +助詞-格助詞-連語 +# +# particle-conjunctive: +# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども, +# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/, +# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/ +助詞-接続助詞 +# +# particle-dependency: +# e.g. こそ, さえ, しか, すら, は, も, ぞ +助詞-係助詞 +# +# particle-adverbial: +# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/, +# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/, +# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに, +# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/, +# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」) +助詞-副助詞 +# +# particle-interjective: particles with interjective grammatical roles. +# e.g. (松島) や +助詞-間投助詞 +# +# particle-coordinate: +# e.g. と, たり, だの, だり, とか, なり, や, やら +助詞-並立助詞 +# +# particle-final: +# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ, +# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/ +助詞-終助詞 +# +# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is +# adverbial, conjunctive, or sentence final. For example: +# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」 +# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」 +# 「(祈りが届いたせい) か (, 試験に合格した.)」 +# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」 +# e.g. か +助詞-副助詞/並立助詞/終助詞 +# +# particle-adnominalizer: The "no" that attaches to nouns and modifies +# non-inflectional words. 
+助詞-連体化 +# +# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs +# that are giongo, giseigo, or gitaigo. +# e.g. に, と +助詞-副詞化 +# +# particle-special: A particle that does not fit into one of the above classifications. +# This includes particles that are used in Tanka, Haiku, and other poetry. +# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家) +助詞-特殊 +# +##### +# auxiliary-verb: +助動詞 +# +##### +# interjection: Greetings and other exclamations. +# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます, +# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい +#感動詞 +# +##### +# symbol: unclassified Symbols. +記号 +# +# symbol-misc: A general symbol not in one of the categories below. +# e.g. [○◎@$〒→+] +記号-一般 +# +# symbol-comma: Commas +# e.g. [,、] +記号-読点 +# +# symbol-period: Periods and full stops. +# e.g. [..。] +記号-句点 +# +# symbol-space: Full-width whitespace. +記号-空白 +# +# symbol-open_bracket: +# e.g. [({‘“『【] +記号-括弧開 +# +# symbol-close_bracket: +# e.g. [)}’”』」】] +記号-括弧閉 +# +# symbol-alphabetic: +#記号-アルファベット +# +##### +# other: unclassified other +#その他 +# +# other-interjection: Words that are hard to classify as noun-suffixes or +# sentence-final particles. +# e.g. (だ)ァ +その他-間投 +# +##### +# filler: Aizuchi that occurs during a conversation or sounds inserted as filler. +# e.g. あの, うんと, えと +フィラー +# +##### +# non-verbal: non-verbal sound. +非言語音 +# +##### +# fragment: +#語断片 +# +##### +# unknown: unknown part of speech. +#未知語 +# +##### End of file diff --git a/KeywordSearch/solr/solr/configsets/AutopsyConfig/conf/lang/stopwords_ja.txt b/KeywordSearch/solr/solr/configsets/AutopsyConfig/conf/lang/stopwords_ja.txt new file mode 100755 index 0000000000..d4321be6b1 --- /dev/null +++ b/KeywordSearch/solr/solr/configsets/AutopsyConfig/conf/lang/stopwords_ja.txt @@ -0,0 +1,127 @@ +# +# This file defines a stopword set for Japanese. +# +# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. 
+# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 +# for frequency lists, etc. that can be useful for making your own set (if desired) +# +# Note that there is an overlap between these stopwords and the terms stopped when used +# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note +# that comments are not allowed on the same line as stopwords. +# +# Also note that stopping is done in a case-insensitive manner. Change your StopFilter +# configuration if you need case-sensitive stopping. Lastly, note that stopping is done +# using the same character width as the entries in this file. Since this StopFilter is +# normally done after a CJKWidthFilter in your chain, you would usually want your romaji +# entries to be in half-width and your kana entries to be in full-width. +# +の +に +は +を +た +が +で +て +と +し +れ +さ +ある +いる +も +する +から +な +こと +として +い +や +れる +など +なっ +ない +この +ため +その +あっ +よう +また +もの +という +あり +まで +られ +なる +へ +か +だ +これ +によって +により +おり +より +による +ず +なり +られる +において +ば +なかっ +なく +しかし +について +せ +だっ +その後 +できる +それ +う +ので +なお +のみ +でき +き +つ +における +および +いう +さらに +でも +ら +たり +その他 +に関する +たち +ます +ん +なら +に対して +特に +せる +及び +これら +とき +では +にて +ほか +ながら +うち +そして +とともに +ただし +かつて +それぞれ +または +お +ほど +ものの +に対する +ほとんど +と共に +といった +です +とも +ところ +ここ +##### End of file diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Language.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Language.java new file mode 100755 index 0000000000..5fb1f859d3 --- /dev/null +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Language.java @@ -0,0 +1,46 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2011-2019 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.sleuthkit.autopsy.keywordsearch; + +import java.util.Arrays; +import java.util.Optional; + +/** + * Language. + * + * Contents which are detected to have these languages should be indexed to a corresponding language-specific field + * such as content_ja. + */ +public enum Language { + JAPANESE("ja"); + + private String value; + + String getValue() { + return value; + } + + static Optional fromValue(String value) { + return Arrays.stream(Language.values()).filter(x -> x.value.equals(value)).findFirst(); + } + + Language(String value) { + this.value = value; + } +} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageDetector.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageDetector.java new file mode 100755 index 0000000000..f527a2fc0e --- /dev/null +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageDetector.java @@ -0,0 +1,60 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2011-2019 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.sleuthkit.autopsy.keywordsearch; + +import com.optimaize.langdetect.LanguageDetectorBuilder; +import com.optimaize.langdetect.i18n.LdLocale; +import com.optimaize.langdetect.ngram.NgramExtractors; +import com.optimaize.langdetect.profiles.LanguageProfileReader; +import com.optimaize.langdetect.text.CommonTextObjectFactories; +import com.optimaize.langdetect.text.TextObject; +import com.optimaize.langdetect.text.TextObjectFactory; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Optional; + +/** + * Detects the language of the given contents. Only languages which should be indexed to a corresponding + * language-specific field are detected. + */ +class LanguageDetector { + + private com.optimaize.langdetect.LanguageDetector impl; + private TextObjectFactory textObjectFactory; + + LanguageDetector() { + try { + impl = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .withProfiles(new LanguageProfileReader().readAllBuiltIn()) + .build(); + textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + } catch (IOException e) { + // The IOException here could occur when failing to read the language profiles from the classpath. + // That can be considered to be a severe IO problem. Nothing can be done here. 
+ throw new UncheckedIOException(e); + } + } + + Optional detect(String text) { + TextObject textObject = textObjectFactory.forText(text); + Optional localeOpt = impl.detect(textObject).transform(Optional::of).or(Optional.empty()); + return localeOpt.map(LdLocale::getLanguage).flatMap(Language::fromValue); + } +} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentIndexingHelper.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentIndexingHelper.java new file mode 100755 index 0000000000..d0988c83f3 --- /dev/null +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentIndexingHelper.java @@ -0,0 +1,85 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2011-2019 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.sleuthkit.autopsy.keywordsearch; + +import org.apache.commons.lang3.math.NumberUtils; +import org.apache.solr.common.SolrInputDocument; +import org.openide.util.NbBundle; +import org.sleuthkit.autopsy.healthmonitor.HealthMonitor; +import org.sleuthkit.autopsy.healthmonitor.TimingMetric; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +/** + * A helper class to support indexing language-specific fields. 
+ */ +class LanguageSpecificContentIndexingHelper { + + private final LanguageDetector languageDetector = new LanguageDetector(); + + Optional detectLanguageIfNeeded(Chunker.Chunk chunk) throws NoOpenCoreException { + double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion()); + if (2.2 <= indexSchemaVersion) { + return languageDetector.detect(chunk.toString()); + } else { + return Optional.empty(); + } + } + + void updateLanguageSpecificFields(Map fields, Chunker.Chunk chunk, Language language) { + List values = new ArrayList<>(); + values.add(chunk.toString()); + if (fields.containsKey(Server.Schema.FILE_NAME.toString())) { + values.add(fields.get(Server.Schema.FILE_NAME.toString()).toString()); + } + + // index the chunk to a language specific field + fields.put(Server.Schema.CONTENT_JA.toString(), values); + fields.put(Server.Schema.LANGUAGE.toString(), language.getValue()); + } + + void indexMiniChunk(Chunker.Chunk chunk, String sourceName, Map fields, String baseChunkID, Language language) + throws Ingester.IngesterException { + //Make a SolrInputDocument out of the field map + SolrInputDocument updateDoc = new SolrInputDocument(); + for (String key : fields.keySet()) { + updateDoc.addField(key, fields.get(key)); + } + + try { + updateDoc.setField(Server.Schema.ID.toString(), MiniChunkHelper.getChunkIdString(baseChunkID)); + + // index the chunk to a language specific field + updateDoc.addField(Server.Schema.CONTENT_JA.toString(), chunk.toString().substring(chunk.getBaseChunkLength())); + updateDoc.addField(Server.Schema.LANGUAGE.toString(), language.getValue()); + + TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk"); + + KeywordSearch.getServer().addDocument(updateDoc); + HealthMonitor.submitTimingMetric(metric); + + } catch (KeywordSearchModuleException | NoOpenCoreException ex) { + throw new Ingester.IngesterException( + NbBundle.getMessage(Ingester.class, 
"Ingester.ingest.exception.err.msg", sourceName), ex); + } + } +} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentQueryHelper.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentQueryHelper.java new file mode 100755 index 0000000000..a3ed8a7876 --- /dev/null +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentQueryHelper.java @@ -0,0 +1,248 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2011-2019 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.sleuthkit.autopsy.keywordsearch; + +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrRequest; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.sleuthkit.autopsy.coreutils.EscapeUtil; +import org.sleuthkit.autopsy.coreutils.Version; +import org.sleuthkit.datamodel.TskException; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * A helper class to support querying documents which have language-specific fields. 
+ */ +final class LanguageSpecificContentQueryHelper { + + private LanguageSpecificContentQueryHelper() {} + + private static final List QUERY_FIELDS = new ArrayList<>(); + private static final List LANGUAGE_SPECIFIC_CONTENT_FIELDS + = Collections.singletonList(Server.Schema.CONTENT_JA); + private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT); + + static { + QUERY_FIELDS.add(Server.Schema.TEXT); + QUERY_FIELDS.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS); + } + + /** + * Holds query response for later processes related to language-specific fields + */ + static class QueryResults { + List chunks = new ArrayList<>(); + Map miniChunks = new HashMap<>(); + // objectId_chunk -> "text" -> List of previews + Map>> highlighting = new HashMap<>(); + } + + /** + * Make a query string from the given one by applying it to the multiple query fields + * + * @param queryStr escaped query string + * @return query string + */ + static String expandQueryString(final String queryStr) { + List fieldQueries = new ArrayList<>(); + fieldQueries.add(Server.Schema.TEXT.toString() + ":" + queryStr); + fieldQueries.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS.stream().map(field -> field.toString() + ":" + queryStr).collect(Collectors.toList())); + return String.join(" OR ", fieldQueries); + } + + static List getQueryFields() { + return QUERY_FIELDS; + } + + static void updateQueryResults(QueryResults results, SolrDocument document) { + String id = (String) document.getFieldValue(Server.Schema.ID.toString()); + if (MiniChunkHelper.isMiniChunkID(id)) { + results.miniChunks.put(MiniChunkHelper.getBaseChunkID(id), document); + } else { + results.chunks.add(document); + } + } + + /** + * Get snippets + * + * @param highlight field ID -> snippets + * @return snippets of appropriate fields. + * Note that this method returns {@code Optional.empty} if the result is empty for convenience to interact with the existing code. 
+ */ + static Optional> getHighlights(Map> highlight) { + for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) { + if (highlight.containsKey(field.toString())) { + return Optional.of(highlight.get(field.toString())); + } + } + return Optional.empty(); + } + + /** + * Merge KeywordHits from TEXT field and a language specific field + * + * Replace KeywordHits in the given {@code matches} if its chunk ID is same. + */ + static List mergeKeywordHits(List matches, Keyword originalKeyword, QueryResults queryResults) throws KeywordSearchModuleException { + Map map = findMatches(originalKeyword, queryResults).stream().collect(Collectors.toMap(KeywordHit::getSolrDocumentId, x -> x)); + List merged = new ArrayList<>(); + + // first, replace KeywordHit in matches + for (KeywordHit match : matches) { + String key = match.getSolrDocumentId(); + if (map.containsKey(key)) { + merged.add(map.get(key)); + map.remove(key); + } else { + merged.add(match); + } + } + // second, add rest of KeywordHits from queryResults + merged.addAll(map.values()); + + return merged; + } + + static void configureTermfreqQuery(SolrQuery query, String keyword) throws KeywordSearchModuleException, NoOpenCoreException { + // make a request to Solr to parse query. + QueryTermHelper.Result queryParserResult = QueryTermHelper.parse(keyword, LANGUAGE_SPECIFIC_CONTENT_FIELDS); + query.addField(buildTermfreqQuery(keyword, queryParserResult)); + } + + static String buildTermfreqQuery(String keyword, QueryTermHelper.Result result) { + List termfreqs = new ArrayList<>(); + for (Map.Entry> e : result.fieldTermsMap.entrySet()) { + String field = e.getKey(); + for (String term : e.getValue()) { + termfreqs.add(String.format("termfreq(\"%s\",\"%s\")", field, KeywordSearchUtil.escapeLuceneQuery(term))); + } + } + + // sum of all language specific query fields. + // only one of these fields could be non-zero. 
+ return String.format("termfreq:sum(%s)", String.join(",", termfreqs)); + } + + static int queryChunkTermfreq(Set keywords, String contentID) throws KeywordSearchModuleException, NoOpenCoreException { + SolrQuery q = new SolrQuery(); + q.setShowDebugInfo(DEBUG); + + final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentID); + final String highlightQuery = keywords.stream() + .map(s -> LanguageSpecificContentQueryHelper.expandQueryString( + KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s)))) + .collect(Collectors.joining(" ")); + + q.addFilterQuery(filterQuery); + q.setQuery(highlightQuery); + LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next()); + + QueryResponse response = KeywordSearch.getServer().query(q, SolrRequest.METHOD.POST); + SolrDocumentList results = response.getResults(); + if (results.isEmpty()) { + return 0; + } + + SolrDocument document = results.get(0); + return ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue(); + } + + static int findNthIndexOf(String s, String pattern, int n) { + int found = 0; + int idx = -1; + int len = s.length(); + while (idx < len && found <= n) { + idx = s.indexOf(pattern, idx + 1); + if (idx == -1) { + break; + } + found++; + } + + return idx; + } + + private static List findMatches(Keyword originalKeyword, QueryResults queryResults) throws KeywordSearchModuleException { + List matches = new ArrayList<>(); + for (SolrDocument document : queryResults.chunks) { + String docId = (String) document.getFieldValue(Server.Schema.ID.toString()); + + try { + int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue(); + SolrDocument miniChunk = queryResults.miniChunks.get(docId); + if (miniChunk == null) { + // last chunk does not have mini chunk because there's no overlapped region with next one + matches.add(createKeywordHit(originalKeyword, 
queryResults.highlighting, docId)); + } else { + int hitCountInMiniChunk = ((Float) miniChunk.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue(); + if (hitCountInMiniChunk < hitCountInChunk) { + // there are at least one hit in base chunk + matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId)); + } + } + } catch (TskException ex) { + throw new KeywordSearchModuleException(ex); + } + } + return matches; + } + + /** + * copied from LuceneQuery and modified to use getHighlightFieldValue + */ + private static KeywordHit createKeywordHit(Keyword originalKeyword, Map>> highlightResponse, String docId) throws TskException { + /** + * Get the first snippet from the document if keyword search is + * configured to use snippets. + */ + String snippet = ""; + if (KeywordSearchSettings.getShowSnippets()) { + List snippetList = getHighlightFieldValue(highlightResponse.get(docId)).orElse(null); + // list is null if there wasn't a snippet + if (snippetList != null) { + snippet = EscapeUtil.unEscapeHtml(snippetList.get(0)).trim(); + } + } + + return new KeywordHit(docId, snippet, originalKeyword.getSearchTerm()); + } + + /** + * @return Optional.empty if empty + */ + private static Optional> getHighlightFieldValue(Map> highlight) { + for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) { + if (highlight.containsKey(field.toString())) { + return Optional.of(highlight.get(field.toString())); + } + } + return Optional.empty(); + } +} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/MiniChunkHelper.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/MiniChunkHelper.java new file mode 100755 index 0000000000..9e958587cd --- /dev/null +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/MiniChunkHelper.java @@ -0,0 +1,41 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2011-2019 Basis Technology Corp. 
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

/**
 * Mini-chunk related methods.
 *
 * A mini chunk is identified by the base chunk ID with a "_mini" suffix
 * appended. NOTE(review): presumably a mini chunk indexes the region of a
 * chunk that overlaps the next chunk — confirm against the indexing helper.
 */
final class MiniChunkHelper {

    private MiniChunkHelper() {}

    /** Suffix appended to a base chunk ID to form the mini-chunk ID. */
    static final String SUFFIX = "_mini";

    /**
     * @param baseChunkID Solr ID of a base chunk
     *
     * @return Solr ID of the corresponding mini chunk
     */
    static String getChunkIdString(String baseChunkID) {
        return baseChunkID + SUFFIX;
    }

    /**
     * @return true if the given ID names a mini chunk
     */
    static boolean isMiniChunkID(String chunkID) {
        return chunkID.endsWith(SUFFIX);
    }

    /**
     * Strips a trailing "_mini" suffix if present; IDs without the suffix are
     * returned unchanged.
     */
    static String getBaseChunkID(String miniChunkID) {
        return miniChunkID.replaceFirst(SUFFIX + "$", "");
    }
}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.sleuthkit.autopsy.keywordsearch; + +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.request.FieldAnalysisRequest; +import org.apache.solr.client.solrj.response.AnalysisResponseBase; +import org.apache.solr.client.solrj.response.FieldAnalysisResponse; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Get terms from query using Solr. + * + * This class is used to find matched terms from query results. + */ +final class QueryTermHelper { + + private QueryTermHelper() {} + + /** + * Result of {@link #parse} method + */ + static class Result { + /** + * field name -> [term] + */ + final Map> fieldTermsMap = new HashMap<>(); + } + + /** + * Parse the given query string on Solr and return the result + * + * @param query query to parse + * @param fields field names to use for parsing + */ + static Result parse(String query, List fields) throws KeywordSearchModuleException, NoOpenCoreException { + Server server = KeywordSearch.getServer(); + + FieldAnalysisRequest request = new FieldAnalysisRequest(); + for (Server.Schema field : fields) { + request.addFieldName(field.toString()); + } + // FieldAnalysisRequest requires to set its field value property, + // while the corresponding analysis.fieldvalue parameter is not needed in the API. + // Setting an empty value does not effect on the result. 
+ request.setFieldValue(""); + request.setQuery(query); + + FieldAnalysisResponse response = new FieldAnalysisResponse(); + try { + response.setResponse(server.request(request)); + } catch (SolrServerException e) { + throw new KeywordSearchModuleException(e); + } + + Result result = new Result(); + for (Map.Entry entry : response.getAllFieldNameAnalysis()) { + Iterator it = entry.getValue().getQueryPhases().iterator(); + + // The last phase is the one which is used in the search process. + AnalysisResponseBase.AnalysisPhase lastPhase = null; + while (it.hasNext()) { + lastPhase = it.next(); + } + + if (lastPhase != null) { + List tokens = lastPhase.getTokens().stream().map(AnalysisResponseBase.TokenInfo::getText).collect(Collectors.toList()); + result.fieldTermsMap.put(entry.getKey(), tokens); + } + } + + return result; + } +} diff --git a/KeywordSearch/test/unit/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentQueryHelperTest.java b/KeywordSearch/test/unit/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentQueryHelperTest.java new file mode 100755 index 0000000000..d8c876592e --- /dev/null +++ b/KeywordSearch/test/unit/src/org/sleuthkit/autopsy/keywordsearch/LanguageSpecificContentQueryHelperTest.java @@ -0,0 +1,59 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2011-2019 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.sleuthkit.autopsy.keywordsearch; + +import org.junit.Test; + +import java.util.Arrays; + +import static org.junit.Assert.assertEquals; + +/** + * tests for LanguageSpecificContentQueryHelper + */ +public class LanguageSpecificContentQueryHelperTest { + + @Test + public void makeQueryString() { + assertEquals("text:query OR content_ja:query", LanguageSpecificContentQueryHelper.expandQueryString("query")); + } + + @Test + public void findNthIndexOf() { + assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "_", 0)); + assertEquals(0, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 0)); + assertEquals(2, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 1)); + assertEquals(3, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 2)); + assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 3)); + assertEquals(0, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "", 0)); + assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("", "A", 0)); + assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", -1)); + assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 999)); + } + + @Test + public void buildTermfreqQuery() { + QueryTermHelper.Result result = new QueryTermHelper.Result(); + result.fieldTermsMap.put("field1", Arrays.asList("term1")); + result.fieldTermsMap.put("field2", Arrays.asList("term1", "term2")); + assertEquals( + "termfreq:sum(termfreq(\"field1\",\"term1\"),termfreq(\"field2\",\"term1\"),termfreq(\"field2\",\"term2\"))", + LanguageSpecificContentQueryHelper.buildTermfreqQuery("query", result)); + } +} diff --git a/KeywordSearch/test/unit/src/org/sleuthkit/autopsy/keywordsearch/MiniChunkHelperTest.java b/KeywordSearch/test/unit/src/org/sleuthkit/autopsy/keywordsearch/MiniChunkHelperTest.java new file mode 100755 index 0000000000..27336b8297 --- /dev/null +++ 
b/KeywordSearch/test/unit/src/org/sleuthkit/autopsy/keywordsearch/MiniChunkHelperTest.java @@ -0,0 +1,46 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2011-2019 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.sleuthkit.autopsy.keywordsearch; + +import org.junit.Assert; +import org.junit.Test; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/** + * tests for MiniChunkHelper + */ +public class MiniChunkHelperTest { + + @Test + public void isMiniChunkID() { + assertTrue(MiniChunkHelper.isMiniChunkID("1_1_mini")); + assertFalse(MiniChunkHelper.isMiniChunkID("1_1")); + assertFalse(MiniChunkHelper.isMiniChunkID("1")); + } + + @Test + public void getBaseChunkID() { + Assert.assertEquals("1_1", MiniChunkHelper.getBaseChunkID("1_1_mini")); + Assert.assertEquals("1_1", MiniChunkHelper.getBaseChunkID("1_1")); + Assert.assertEquals("1", MiniChunkHelper.getBaseChunkID("1")); + } + +} \ No newline at end of file