Mirror of https://github.com/overcuriousity/autopsy-flatpak.git (synced 2025-07-06 21:00:22 +00:00)

Revert "5537-file-encryption-handle-asec-files-better"

This commit is contained in:
parent 25aedbbb02
commit ee03b898cb
@@ -1,7 +1,6 @@
EncryptionDetectionDataSourceIngestModule.artifactComment.bitlocker=Bitlocker encryption detected.
EncryptionDetectionDataSourceIngestModule.artifactComment.suspected=Suspected encryption due to high entropy (%f).
EncryptionDetectionDataSourceIngestModule.processing.message=Checking image for encryption.
EncryptionDetectionFileIngestModule.artifactComment.location=High entropy and known location/extension.
EncryptionDetectionFileIngestModule.artifactComment.password=Password protection detected.
EncryptionDetectionFileIngestModule.artifactComment.suspected=Suspected encryption due to high entropy (%f).
EncryptionDetectionFileIngestModule.getDesc.text=Looks for files with the specified minimum entropy.

@@ -29,8 +29,6 @@ import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.BufferUnderflowException;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;

@@ -78,11 +76,6 @@ final class EncryptionDetectionFileIngestModule extends FileIngestModuleAdapter

    private static final String[] FILE_IGNORE_LIST = {"hiberfile.sys", "pagefile.sys"};

    /**
     * This maps file locations to file extensions that are known to be encrypted
     */
    private static final Map<String, String> knownEncryptedLocationExtensions = createLocationExtensionMap();

    private final IngestServices services = IngestServices.getInstance();
    private final Logger logger = services.getLogger(EncryptionDetectionModuleFactory.getModuleName());
    private FileTypeDetector fileTypeDetector;

@@ -126,7 +119,6 @@ final class EncryptionDetectionFileIngestModule extends FileIngestModuleAdapter

    @Messages({
        "EncryptionDetectionFileIngestModule.artifactComment.password=Password protection detected.",
        "EncryptionDetectionFileIngestModule.artifactComment.location=High entropy and known location/extension.",
        "EncryptionDetectionFileIngestModule.artifactComment.suspected=Suspected encryption due to high entropy (%f)."
    })
    @Override

@@ -163,9 +155,6 @@ final class EncryptionDetectionFileIngestModule extends FileIngestModuleAdapter
             */
            String mimeType = fileTypeDetector.getMIMEType(file);
            if (mimeType.equals("application/octet-stream") && isFileEncryptionSuspected(file)) {
                if (checkFileLocationExtension(file)) {
                    return flagFile(file, BlackboardArtifact.ARTIFACT_TYPE.TSK_ENCRYPTION_DETECTED, Bundle.EncryptionDetectionFileIngestModule_artifactComment_location());
                }
                return flagFile(file, BlackboardArtifact.ARTIFACT_TYPE.TSK_ENCRYPTION_SUSPECTED,
                        String.format(Bundle.EncryptionDetectionFileIngestModule_artifactComment_suspected(), calculatedEntropy));
            } else if (isFilePasswordProtected(file)) {

@@ -417,36 +406,4 @@ final class EncryptionDetectionFileIngestModule extends FileIngestModuleAdapter

        return possiblyEncrypted;
    }

    /**
     * This method checks if the AbstractFile input is in a location that is
     * known to hold encrypted files. It must meet the requirements and location
     * of known encrypted file(s)
     *
     * @param file AbstractFile to be checked.
     *
     * @return True if file extension and location match known values.
     *
     */
    private boolean checkFileLocationExtension(AbstractFile file) {
        String filePath = file.getParentPath().replace("/", "");
        if ((knownEncryptedLocationExtensions.containsKey(filePath))
                && (knownEncryptedLocationExtensions.get(filePath).equals(file.getNameExtension()))) {
            return true;
        }
        return false;
    }

    /*
     * This method creates the map of paths and extensions that are known to
     * have encrypted files
     *
     * @return Map of path and extension of files
     */
    private static Map<String, String> createLocationExtensionMap() {
        Map<String, String> locationExtensionMap = new HashMap<String, String>();
        locationExtensionMap.put(".android_secure", "asec");
        return locationExtensionMap;
    }
}
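Note on the hunks above: the file-ingest logic removed by this revert boils down to a lookup from a parent-directory name to the file extension expected there (".android_secure" maps to "asec"), applied to files whose MIME type is application/octet-stream and whose entropy is already suspicious. A minimal standalone sketch of that rule follows; the class and method names are hypothetical, and only the map contents come from the hunks above.

    import java.util.HashMap;
    import java.util.Map;

    /** Illustrative sketch only; not the module's actual API. */
    class LocationExtensionCheckSketch {

        // Known parent-folder -> extension pairs that indicate encrypted containers,
        // e.g. Android's .android_secure folder holding .asec files.
        private static final Map<String, String> KNOWN_ENCRYPTED_LOCATION_EXTENSIONS = new HashMap<>();

        static {
            KNOWN_ENCRYPTED_LOCATION_EXTENSIONS.put(".android_secure", "asec");
        }

        /**
         * @param parentPath parent path with path separators stripped, e.g. ".android_secure"
         * @param extension  file extension, e.g. "asec"
         * @return true when both the location and the extension match a known pair
         */
        static boolean matchesKnownEncryptedLocation(String parentPath, String extension) {
            String expected = KNOWN_ENCRYPTED_LOCATION_EXTENSIONS.get(parentPath);
            return expected != null && expected.equals(extension);
        }
    }

With those pairs in place, a high-entropy octet-stream file such as .android_secure/app.asec was flagged as TSK_ENCRYPTION_DETECTED rather than merely TSK_ENCRYPTION_SUSPECTED; that is the behavior this revert takes back out.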
@@ -21,7 +21,6 @@
    <dependency conf="autopsy->*" org="org.apache.solr" name="solr-solrj" rev="4.9.1"/>
    <dependency conf="autopsy->*" org="commons-lang" name="commons-lang" rev="2.4"/>
    <dependency conf="autopsy->*" org="commons-validator" name="commons-validator" rev="1.5.1"/>
    <dependency conf="autopsy->*" org="com.optimaize.languagedetector" name="language-detector" rev="0.6"/>
    <!-- Exclude the version of cxf-rt-rs-client from Tika 1.20, one of its depedencies breaks Ivy -->
    <dependency conf="autopsy->*" org="org.apache.tika" name="tika-parsers" rev="1.20">
        <exclude module="cxf-rt-rs-client"/>

@@ -29,7 +29,6 @@ file.reference.jericho-html-3.3.jar=release/modules/ext/jericho-html-3.3.jar
file.reference.joda-time-2.2.jar=release/modules/ext/joda-time-2.2.jar
file.reference.json-simple-1.1.1.jar=release/modules/ext/json-simple-1.1.1.jar
file.reference.juniversalchardet-1.0.3.jar=release/modules/ext/juniversalchardet-1.0.3.jar
file.reference.language-detector-0.6.jar=release/modules/ext/language-detector-0.6.jar
file.reference.libsvm-3.1.jar=release/modules/ext/libsvm-3.1.jar
file.reference.log4j-1.2.17.jar=release/modules/ext/log4j-1.2.17.jar
file.reference.lucene-core-4.0.0.jar=release/modules/ext/lucene-core-4.0.0.jar

@@ -467,10 +467,6 @@
            <runtime-relative-path>ext/vorbis-java-tika-0.8.jar</runtime-relative-path>
            <binary-origin>release/modules/ext/vorbis-java-tika-0.8.jar</binary-origin>
        </class-path-extension>
        <class-path-extension>
            <runtime-relative-path>ext/language-detector-0.6.jar</runtime-relative-path>
            <binary-origin>release/modules/ext/language-detector-0.6.jar</binary-origin>
        </class-path-extension>
    </data>
</configuration>
</project>
@ -1,420 +0,0 @@
|
||||
#
|
||||
# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
|
||||
#
|
||||
# Any token with a part-of-speech tag that exactly matches those defined in this
|
||||
# file are removed from the token stream.
|
||||
#
|
||||
# Set your own stoptags by uncommenting the lines below. Note that comments are
|
||||
# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists,
|
||||
# etc. that can be useful for building you own stoptag set.
|
||||
#
|
||||
# The entire possible tagset is provided below for convenience.
|
||||
#
|
||||
#####
|
||||
# noun: unclassified nouns
|
||||
#名詞
|
||||
#
|
||||
# noun-common: Common nouns or nouns where the sub-classification is undefined
|
||||
#名詞-一般
|
||||
#
|
||||
# noun-proper: Proper nouns where the sub-classification is undefined
|
||||
#名詞-固有名詞
|
||||
#
|
||||
# noun-proper-misc: miscellaneous proper nouns
|
||||
#名詞-固有名詞-一般
|
||||
#
|
||||
# noun-proper-person: Personal names where the sub-classification is undefined
|
||||
#名詞-固有名詞-人名
|
||||
#
|
||||
# noun-proper-person-misc: names that cannot be divided into surname and
|
||||
# given name; foreign names; names where the surname or given name is unknown.
|
||||
# e.g. お市の方
|
||||
#名詞-固有名詞-人名-一般
|
||||
#
|
||||
# noun-proper-person-surname: Mainly Japanese surnames.
|
||||
# e.g. 山田
|
||||
#名詞-固有名詞-人名-姓
|
||||
#
|
||||
# noun-proper-person-given_name: Mainly Japanese given names.
|
||||
# e.g. 太郎
|
||||
#名詞-固有名詞-人名-名
|
||||
#
|
||||
# noun-proper-organization: Names representing organizations.
|
||||
# e.g. 通産省, NHK
|
||||
#名詞-固有名詞-組織
|
||||
#
|
||||
# noun-proper-place: Place names where the sub-classification is undefined
|
||||
#名詞-固有名詞-地域
|
||||
#
|
||||
# noun-proper-place-misc: Place names excluding countries.
|
||||
# e.g. アジア, バルセロナ, 京都
|
||||
#名詞-固有名詞-地域-一般
|
||||
#
|
||||
# noun-proper-place-country: Country names.
|
||||
# e.g. 日本, オーストラリア
|
||||
#名詞-固有名詞-地域-国
|
||||
#
|
||||
# noun-pronoun: Pronouns where the sub-classification is undefined
|
||||
#名詞-代名詞
|
||||
#
|
||||
# noun-pronoun-misc: miscellaneous pronouns:
|
||||
# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ
|
||||
#名詞-代名詞-一般
|
||||
#
|
||||
# noun-pronoun-contraction: Spoken language contraction made by combining a
|
||||
# pronoun and the particle 'wa'.
|
||||
# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ
|
||||
#名詞-代名詞-縮約
|
||||
#
|
||||
# noun-adverbial: Temporal nouns such as names of days or months that behave
|
||||
# like adverbs. Nouns that represent amount or ratios and can be used adverbially,
|
||||
# e.g. 金曜, 一月, 午後, 少量
|
||||
#名詞-副詞可能
|
||||
#
|
||||
# noun-verbal: Nouns that take arguments with case and can appear followed by
|
||||
# 'suru' and related verbs (する, できる, なさる, くださる)
|
||||
# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り
|
||||
#名詞-サ変接続
|
||||
#
|
||||
# noun-adjective-base: The base form of adjectives, words that appear before な ("na")
|
||||
# e.g. 健康, 安易, 駄目, だめ
|
||||
#名詞-形容動詞語幹
|
||||
#
|
||||
# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数.
|
||||
# e.g. 0, 1, 2, 何, 数, 幾
|
||||
#名詞-数
|
||||
#
|
||||
# noun-affix: noun affixes where the sub-classification is undefined
|
||||
#名詞-非自立
|
||||
#
|
||||
# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that
|
||||
# attach to the base form of inflectional words, words that cannot be classified
|
||||
# into any of the other categories below. This category includes indefinite nouns.
|
||||
# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第,
|
||||
# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み,
|
||||
# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳,
|
||||
# わり, 割り, 割, ん-口語/, もん-口語/
|
||||
#名詞-非自立-一般
|
||||
#
|
||||
# noun-affix-adverbial: noun affixes that that can behave as adverbs.
|
||||
# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ,
|
||||
# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか,
|
||||
# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所,
|
||||
# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま,
|
||||
# 儘, 侭, みぎり, 矢先
|
||||
#名詞-非自立-副詞可能
|
||||
#
|
||||
# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars
|
||||
# with the stem よう(だ) ("you(da)").
|
||||
# e.g. よう, やう, 様 (よう)
|
||||
#名詞-非自立-助動詞語幹
|
||||
#
|
||||
# noun-affix-adjective-base: noun affixes that can connect to the indeclinable
|
||||
# connection form な (aux "da").
|
||||
# e.g. みたい, ふう
|
||||
#名詞-非自立-形容動詞語幹
|
||||
#
|
||||
# noun-special: special nouns where the sub-classification is undefined.
|
||||
#名詞-特殊
|
||||
#
|
||||
# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is
|
||||
# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base
|
||||
# form of inflectional words.
|
||||
# e.g. そう
|
||||
#名詞-特殊-助動詞語幹
|
||||
#
|
||||
# noun-suffix: noun suffixes where the sub-classification is undefined.
|
||||
#名詞-接尾
|
||||
#
|
||||
# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect
|
||||
# to ガル or タイ and can combine into compound nouns, words that cannot be classified into
|
||||
# any of the other categories below. In general, this category is more inclusive than
|
||||
# 接尾語 ("suffix") and is usually the last element in a compound noun.
|
||||
# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み,
|
||||
# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用
|
||||
#名詞-接尾-一般
|
||||
#
|
||||
# noun-suffix-person: Suffixes that form nouns and attach to person names more often
|
||||
# than other nouns.
|
||||
# e.g. 君, 様, 著
|
||||
#名詞-接尾-人名
|
||||
#
|
||||
# noun-suffix-place: Suffixes that form nouns and attach to place names more often
|
||||
# than other nouns.
|
||||
# e.g. 町, 市, 県
|
||||
#名詞-接尾-地域
|
||||
#
|
||||
# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that
|
||||
# can appear before スル ("suru").
|
||||
# e.g. 化, 視, 分け, 入り, 落ち, 買い
|
||||
#名詞-接尾-サ変接続
|
||||
#
|
||||
# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions,
|
||||
# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the
|
||||
# conjunctive form of inflectional words.
|
||||
# e.g. そう
|
||||
#名詞-接尾-助動詞語幹
|
||||
#
|
||||
# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive
|
||||
# form of inflectional words and appear before the copula だ ("da").
|
||||
# e.g. 的, げ, がち
|
||||
#名詞-接尾-形容動詞語幹
|
||||
#
|
||||
# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs.
|
||||
# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ)
|
||||
#名詞-接尾-副詞可能
|
||||
#
|
||||
# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category
|
||||
# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach
|
||||
# to numbers.
|
||||
# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半
|
||||
#名詞-接尾-助数詞
|
||||
#
|
||||
# noun-suffix-special: Special suffixes that mainly attach to inflecting words.
|
||||
# e.g. (楽し) さ, (考え) 方
|
||||
#名詞-接尾-特殊
|
||||
#
|
||||
# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words
|
||||
# together.
|
||||
# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦)
|
||||
#名詞-接続詞的
|
||||
#
|
||||
# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are
|
||||
# semantically verb-like.
|
||||
# e.g. ごらん, ご覧, 御覧, 頂戴
|
||||
#名詞-動詞非自立的
|
||||
#
|
||||
# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry,
|
||||
# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation")
|
||||
# is いわく ("iwaku").
|
||||
#名詞-引用文字列
|
||||
#
|
||||
# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and
|
||||
# behave like an adjective.
|
||||
# e.g. 申し訳, 仕方, とんでも, 違い
|
||||
#名詞-ナイ形容詞語幹
|
||||
#
|
||||
#####
|
||||
# prefix: unclassified prefixes
|
||||
#接頭詞
|
||||
#
|
||||
# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms)
|
||||
# excluding numerical expressions.
|
||||
# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派)
|
||||
#接頭詞-名詞接続
|
||||
#
|
||||
# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
|
||||
# in conjunctive form followed by なる/なさる/くださる.
|
||||
# e.g. お (読みなさい), お (座り)
|
||||
#接頭詞-動詞接続
|
||||
#
|
||||
# prefix-adjectival: Prefixes that attach to adjectives.
|
||||
# e.g. お (寒いですねえ), バカ (でかい)
|
||||
#接頭詞-形容詞接続
|
||||
#
|
||||
# prefix-numerical: Prefixes that attach to numerical expressions.
|
||||
# e.g. 約, およそ, 毎時
|
||||
#接頭詞-数接続
|
||||
#
|
||||
#####
|
||||
# verb: unclassified verbs
|
||||
#動詞
|
||||
#
|
||||
# verb-main:
|
||||
#動詞-自立
|
||||
#
|
||||
# verb-auxiliary:
|
||||
#動詞-非自立
|
||||
#
|
||||
# verb-suffix:
|
||||
#動詞-接尾
|
||||
#
|
||||
#####
|
||||
# adjective: unclassified adjectives
|
||||
#形容詞
|
||||
#
|
||||
# adjective-main:
|
||||
#形容詞-自立
|
||||
#
|
||||
# adjective-auxiliary:
|
||||
#形容詞-非自立
|
||||
#
|
||||
# adjective-suffix:
|
||||
#形容詞-接尾
|
||||
#
|
||||
#####
|
||||
# adverb: unclassified adverbs
|
||||
#副詞
|
||||
#
|
||||
# adverb-misc: Words that can be segmented into one unit and where adnominal
|
||||
# modification is not possible.
|
||||
# e.g. あいかわらず, 多分
|
||||
#副詞-一般
|
||||
#
|
||||
# adverb-particle_conjunction: Adverbs that can be followed by の, は, に,
|
||||
# な, する, だ, etc.
|
||||
# e.g. こんなに, そんなに, あんなに, なにか, なんでも
|
||||
#副詞-助詞類接続
|
||||
#
|
||||
#####
|
||||
# adnominal: Words that only have noun-modifying forms.
|
||||
# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう,
|
||||
# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした,
|
||||
# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き
|
||||
#連体詞
|
||||
#
|
||||
#####
|
||||
# conjunction: Conjunctions that can occur independently.
|
||||
# e.g. が, けれども, そして, じゃあ, それどころか
|
||||
接続詞
|
||||
#
|
||||
#####
|
||||
# particle: unclassified particles.
|
||||
助詞
|
||||
#
|
||||
# particle-case: case particles where the subclassification is undefined.
|
||||
助詞-格助詞
|
||||
#
|
||||
# particle-case-misc: Case particles.
|
||||
# e.g. から, が, で, と, に, へ, より, を, の, にて
|
||||
助詞-格助詞-一般
|
||||
#
|
||||
# particle-case-quote: the "to" that appears after nouns, a person’s speech,
|
||||
# quotation marks, expressions of decisions from a meeting, reasons, judgements,
|
||||
# conjectures, etc.
|
||||
# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...)
|
||||
助詞-格助詞-引用
|
||||
#
|
||||
# particle-case-compound: Compounds of particles and verbs that mainly behave
|
||||
# like case particles.
|
||||
# e.g. という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って,
|
||||
# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける,
|
||||
# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し,
|
||||
# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして,
|
||||
# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって,
|
||||
# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る,
|
||||
# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる,
|
||||
# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ
|
||||
助詞-格助詞-連語
|
||||
#
|
||||
# particle-conjunctive:
|
||||
# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども,
|
||||
# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/,
|
||||
# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/
|
||||
助詞-接続助詞
|
||||
#
|
||||
# particle-dependency:
|
||||
# e.g. こそ, さえ, しか, すら, は, も, ぞ
|
||||
助詞-係助詞
|
||||
#
|
||||
# particle-adverbial:
|
||||
# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/,
|
||||
# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/,
|
||||
# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに,
|
||||
# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/,
|
||||
# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」)
|
||||
助詞-副助詞
|
||||
#
|
||||
# particle-interjective: particles with interjective grammatical roles.
|
||||
# e.g. (松島) や
|
||||
助詞-間投助詞
|
||||
#
|
||||
# particle-coordinate:
|
||||
# e.g. と, たり, だの, だり, とか, なり, や, やら
|
||||
助詞-並立助詞
|
||||
#
|
||||
# particle-final:
|
||||
# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ,
|
||||
# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/
|
||||
助詞-終助詞
|
||||
#
|
||||
# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is
|
||||
# adverbial, conjunctive, or sentence final. For example:
|
||||
# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」
|
||||
# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」
|
||||
# 「(祈りが届いたせい) か (, 試験に合格した.)」
|
||||
# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」
|
||||
# e.g. か
|
||||
助詞-副助詞/並立助詞/終助詞
|
||||
#
|
||||
# particle-adnominalizer: The "no" that attaches to nouns and modifies
|
||||
# non-inflectional words.
|
||||
助詞-連体化
|
||||
#
|
||||
# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs
|
||||
# that are giongo, giseigo, or gitaigo.
|
||||
# e.g. に, と
|
||||
助詞-副詞化
|
||||
#
|
||||
# particle-special: A particle that does not fit into one of the above classifications.
|
||||
# This includes particles that are used in Tanka, Haiku, and other poetry.
|
||||
# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家)
|
||||
助詞-特殊
|
||||
#
|
||||
#####
|
||||
# auxiliary-verb:
|
||||
助動詞
|
||||
#
|
||||
#####
|
||||
# interjection: Greetings and other exclamations.
|
||||
# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます,
|
||||
# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい
|
||||
#感動詞
|
||||
#
|
||||
#####
|
||||
# symbol: unclassified Symbols.
|
||||
記号
|
||||
#
|
||||
# symbol-misc: A general symbol not in one of the categories below.
|
||||
# e.g. [○◎@$〒→+]
|
||||
記号-一般
|
||||
#
|
||||
# symbol-comma: Commas
|
||||
# e.g. [,、]
|
||||
記号-読点
|
||||
#
|
||||
# symbol-period: Periods and full stops.
|
||||
# e.g. [..。]
|
||||
記号-句点
|
||||
#
|
||||
# symbol-space: Full-width whitespace.
|
||||
記号-空白
|
||||
#
|
||||
# symbol-open_bracket:
|
||||
# e.g. [({‘“『【]
|
||||
記号-括弧開
|
||||
#
|
||||
# symbol-close_bracket:
|
||||
# e.g. [)}’”』」】]
|
||||
記号-括弧閉
|
||||
#
|
||||
# symbol-alphabetic:
|
||||
#記号-アルファベット
|
||||
#
|
||||
#####
|
||||
# other: unclassified other
|
||||
#その他
|
||||
#
|
||||
# other-interjection: Words that are hard to classify as noun-suffixes or
|
||||
# sentence-final particles.
|
||||
# e.g. (だ)ァ
|
||||
その他-間投
|
||||
#
|
||||
#####
|
||||
# filler: Aizuchi that occurs during a conversation or sounds inserted as filler.
|
||||
# e.g. あの, うんと, えと
|
||||
フィラー
|
||||
#
|
||||
#####
|
||||
# non-verbal: non-verbal sound.
|
||||
非言語音
|
||||
#
|
||||
#####
|
||||
# fragment:
|
||||
#語断片
|
||||
#
|
||||
#####
|
||||
# unknown: unknown part of speech.
|
||||
#未知語
|
||||
#
|
||||
##### End of file
|
@@ -1,127 +0,0 @@
#
# This file defines a stopword set for Japanese.
#
# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia.
# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745
# for frequency lists, etc. that can be useful for making your own set (if desired)
#
# Note that there is an overlap between these stopwords and the terms stopped when used
# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
# that comments are not allowed on the same line as stopwords.
#
# Also note that stopping is done in a case-insensitive manner. Change your StopFilter
# configuration if you need case-sensitive stopping. Lastly, note that stopping is done
# using the same character width as the entries in this file. Since this StopFilter is
# normally done after a CJKWidthFilter in your chain, you would usually want your romaji
# entries to be in half-width and your kana entries to be in full-width.
#
の
に
は
を
た
が
で
て
と
し
れ
さ
ある
いる
も
する
から
な
こと
として
い
や
れる
など
なっ
ない
この
ため
その
あっ
よう
また
もの
という
あり
まで
られ
なる
へ
か
だ
これ
によって
により
おり
より
による
ず
なり
られる
において
ば
なかっ
なく
しかし
について
せ
だっ
その後
できる
それ
う
ので
なお
のみ
でき
き
つ
における
および
いう
さらに
でも
ら
たり
その他
に関する
たち
ます
ん
なら
に対して
特に
せる
及び
これら
とき
では
にて
ほか
ながら
うち
そして
とともに
ただし
かつて
それぞれ
または
お
ほど
ものの
に対する
ほとんど
と共に
といった
です
とも
ところ
ここ
##### End of file
@@ -45,7 +45,7 @@
     that avoids logging every request
-->

<schema name="Autopsy Keyword Search" version="2.2">
<schema name="Autopsy Keyword Search" version="2.1">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
     Applications should change this to reflect the nature of the search collection.
     version="1.4" is Solr's version number for the schema syntax and semantics. It should

@@ -62,7 +62,6 @@
     2.0 added chunk_size field
     2.1 to facilitate case insensitive regex search,no longer copying content into content_str.
         content_str will be populated with lowercase content by Autopsy.
     2.2 added text_ja type, content_ja and language fields to support Japanese text search
-->

<types>

@@ -244,18 +243,6 @@
      </analyzer>
    </fieldType>

    <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
      <analyzer>
        <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
        <filter class="solr.JapaneseBaseFormFilterFactory"/>
        <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" />
        <filter class="solr.CJKWidthFilterFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" />
        <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- A text field with defaults appropriate for English: it
         tokenizes with StandardTokenizer, removes English stop words
         (stopwords_en.txt), down cases, protects words from protwords.txt, and

@@ -570,11 +557,6 @@
         via copyField further on in this schema -->
    <field name="text" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>

    <!-- Store language detection result. Only parents of text documents have this -->
    <field name="language" type="string" indexed="false" stored="true" required="false"/>

    <field name="content_ja" type="text_ja" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>

    <!-- catchall text field that indexes tokens both normally and in reverse for efficient
         leading wildcard queries. -->
    <!--<field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/>-->
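For context on the schema hunks above: under schema version 2.2, a text chunk detected as Japanese carried a language value and had its text copied into content_ja, so the text_ja analyzer chain was applied at index time. A rough SolrJ sketch of such a chunk document follows, using the literal field names from the schema; the id format and the example text are illustrative, and the real code builds these documents from its own field constants.

    import org.apache.solr.common.SolrInputDocument;

    class JapaneseChunkDocumentSketch {
        // Sketch of a chunk document as indexed under the (now removed) 2.2 schema.
        static SolrInputDocument buildExample() {
            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("id", "12345_1");               // <objectId>_<chunkNumber>, illustrative
            doc.addField("language", "ja");              // language-detection result
            doc.addField("content_ja", "検索対象の日本語テキスト"); // chunk text, example only
            return doc;
        }
    }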
@ -38,7 +38,6 @@ import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.SolrRequest.METHOD;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.openide.util.NbBundle;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
@ -347,8 +346,6 @@ class HighlightedText implements IndexedText {
|
||||
String chunkID = "";
|
||||
String highlightField = "";
|
||||
try {
|
||||
double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
|
||||
|
||||
loadPageInfo(); //inits once
|
||||
SolrQuery q = new SolrQuery();
|
||||
q.setShowDebugInfo(DEBUG); //debug
|
||||
@ -362,22 +359,6 @@ class HighlightedText implements IndexedText {
|
||||
|
||||
highlightField = LuceneQuery.HIGHLIGHT_FIELD;
|
||||
if (isLiteral) {
|
||||
if (2.2 <= indexSchemaVersion) {
|
||||
//if the query is literal try to get solr to do the highlighting
|
||||
final String highlightQuery = keywords.stream().map(s ->
|
||||
LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
|
||||
.collect(Collectors.joining(" OR "));
|
||||
q.setQuery(highlightQuery);
|
||||
for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
|
||||
q.addField(field.toString());
|
||||
q.addHighlightField(field.toString());
|
||||
}
|
||||
q.addField(Server.Schema.LANGUAGE.toString());
|
||||
// in case of single term literal query there is only 1 term
|
||||
LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
|
||||
q.addFilterQuery(filterQuery);
|
||||
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
|
||||
} else {
|
||||
//if the query is literal try to get solr to do the highlighting
|
||||
final String highlightQuery = keywords.stream()
|
||||
.map(HighlightedText::constructEscapedSolrQuery)
|
||||
@ -388,20 +369,12 @@ class HighlightedText implements IndexedText {
|
||||
q.addFilterQuery(filterQuery);
|
||||
q.addHighlightField(highlightField);
|
||||
q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
|
||||
}
|
||||
|
||||
//tune the highlighter
|
||||
if (shouldUseOriginalHighlighter(contentIdStr)) {
|
||||
// use original highlighter
|
||||
q.setParam("hl.useFastVectorHighlighter", "off");
|
||||
q.setParam("hl.simple.pre", HIGHLIGHT_PRE);
|
||||
q.setParam("hl.simple.post", HIGHLIGHT_POST);
|
||||
} else {
|
||||
q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
|
||||
q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
|
||||
q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
|
||||
q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
|
||||
}
|
||||
|
||||
//docs says makes sense for the original Highlighter only, but not really
|
||||
q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
|
||||
@ -432,33 +405,6 @@ class HighlightedText implements IndexedText {
|
||||
|
||||
if (responseHighlightID == null) {
|
||||
highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
|
||||
} else {
|
||||
SolrDocument document = response.getResults().get(0);
|
||||
Object language = document.getFieldValue(Server.Schema.LANGUAGE.toString());
|
||||
if (2.2 <= indexSchemaVersion && language != null) {
|
||||
List<String> contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
|
||||
if (contentHighlights == null) {
|
||||
highlightedContent = "";
|
||||
} else {
|
||||
int hitCountInMiniChunk = LanguageSpecificContentQueryHelper.queryChunkTermfreq(keywords, MiniChunkHelper.getChunkIdString(contentIdStr));
|
||||
String s = contentHighlights.get(0).trim();
|
||||
// If there is a mini-chunk, trim the content not to show highlighted text in it.
|
||||
if (0 < hitCountInMiniChunk) {
|
||||
int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
|
||||
int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
|
||||
s,
|
||||
HIGHLIGHT_PRE,
|
||||
// trim after the last hit in chunk
|
||||
hitCountInChunk - hitCountInMiniChunk);
|
||||
if (idx != -1) {
|
||||
highlightedContent = s.substring(0, idx);
|
||||
} else {
|
||||
highlightedContent = s;
|
||||
}
|
||||
} else {
|
||||
highlightedContent = s;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
|
||||
if (contentHighlights == null) {
|
||||
@ -469,7 +415,6 @@ class HighlightedText implements IndexedText {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
highlightedContent = insertAnchors(highlightedContent);
|
||||
|
||||
return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
|
||||
@ -606,37 +551,4 @@ class HighlightedText implements IndexedText {
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if we should use original highlighter instead of FastVectorHighlighter.
|
||||
*
|
||||
* In the case Japanese text and phrase query, FastVectorHighlighter does not work well.
|
||||
*
|
||||
* Note about highlighters:
|
||||
* If the query is "雨が降る" (phrase query), Solr divides it into 雨 and 降る. が is a stop word here.
|
||||
* It seems that FastVector highlighter does not produce any snippet when there is a stop word between terms.
|
||||
* On the other hand, original highlighter produces multiple matches, for example:
|
||||
* > <em>雨</em>が<em>降っ</em>ています
|
||||
* Unified highlighter (from Solr 6.4) handles the case as expected:
|
||||
* > <em>雨が降っ</em>ています。
|
||||
*/
|
||||
private boolean shouldUseOriginalHighlighter(String contentID) throws NoOpenCoreException, KeywordSearchModuleException {
|
||||
final SolrQuery q = new SolrQuery();
|
||||
q.setQuery("*:*");
|
||||
q.addFilterQuery(Server.Schema.ID.toString() + ":" + contentID);
|
||||
q.setFields(Server.Schema.LANGUAGE.toString());
|
||||
|
||||
QueryResponse response = solrServer.query(q, METHOD.POST);
|
||||
SolrDocumentList solrDocuments = response.getResults();
|
||||
|
||||
if (!solrDocuments.isEmpty()) {
|
||||
SolrDocument solrDocument = solrDocuments.get(0);
|
||||
if (solrDocument != null) {
|
||||
Object languageField = solrDocument.getFieldValue(Server.Schema.LANGUAGE.toString());
|
||||
if (languageField != null) {
|
||||
return languageField.equals("ja");
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@@ -39,7 +39,7 @@ class IndexFinder {
    private static final String KWS_DATA_FOLDER_NAME = "data";
    private static final String INDEX_FOLDER_NAME = "index";
    private static final String CURRENT_SOLR_VERSION = "4";
    private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.2";
    private static final String CURRENT_SOLR_SCHEMA_VERSION = "2.1";

    static String getCurrentSolrVersion() {
        return CURRENT_SOLR_VERSION;
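The one-line constant change above (2.2 back to 2.1) is what switches the new behavior off at runtime: the keyword-search code repeatedly compares the open index's schema version against 2.2 before doing any language-specific work, as the Ingester and highlighting hunks in this diff show. A simplified sketch of that gate follows; the real code reads the version from the server's index info rather than taking a string parameter.

    import org.apache.commons.lang3.math.NumberUtils;

    class SchemaVersionGateSketch {
        // Language-specific indexing/highlighting only runs when the index was
        // created with schema 2.2 or newer; otherwise the old code path is used.
        static boolean supportsLanguageSpecificFields(String schemaVersion) {
            double indexSchemaVersion = NumberUtils.toDouble(schemaVersion);
            return 2.2 <= indexSchemaVersion;
        }
    }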
@@ -20,10 +20,8 @@ package org.sleuthkit.autopsy.keywordsearch;

import java.io.BufferedReader;
import java.io.Reader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.solr.client.solrj.SolrServerException;

@@ -61,8 +59,6 @@ class Ingester {
    private final Server solrServer = KeywordSearch.getServer();
    private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
    private static Ingester instance;
    private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
            = new LanguageSpecificContentIndexingHelper();

    private Ingester() {
    }

@@ -97,7 +93,7 @@
     * file, but the Solr server is probably fine.
     */
    void indexMetaDataOnly(AbstractFile file) throws IngesterException {
        indexChunk("", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
        indexChunk("", file.getName().toLowerCase(), getContentFields(file));
    }

    /**

@@ -111,7 +107,7 @@
     * artifact, but the Solr server is probably fine.
     */
    void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
        indexChunk("", sourceName, new HashMap<>(getContentFields(artifact)));
        indexChunk("", sourceName, getContentFields(artifact));
    }

    /**

@@ -147,30 +143,21 @@
    < T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
        int numChunks = 0; //unknown until chunking is done

        Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
        Map<String, String> fields = getContentFields(source);
        //Get a reader for the content of the given source
        try (BufferedReader reader = new BufferedReader(sourceReader)) {
            Chunker chunker = new Chunker(reader);
            while (chunker.hasNext()) {
            for (Chunk chunk : chunker) {
                if (context != null && context.fileIngestIsCancelled()) {
                    logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
                    return false;
                }

                Chunk chunk = chunker.next();
                Map<String, Object> fields = new HashMap<>(contentFields);
                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                fields.put(Server.Schema.ID.toString(), chunkId);
                fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
                Optional<Language> language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk);
                language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
                try {
                    //add the chunk text to Solr index
                    indexChunk(chunk.toString(), sourceName, fields);
                    // add mini chunk when there's a language specific field
                    if (chunker.hasNext() && language.isPresent()) {
                        languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
                    }
                    numChunks++;
                } catch (Ingester.IngesterException ingEx) {
                    logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS

@@ -190,7 +177,6 @@
            if (context != null && context.fileIngestIsCancelled()) {
                return false;
            } else {
                Map<String, Object> fields = new HashMap<>(contentFields);
                //after all chunks, index just the meta data, including the numChunks, of the parent file
                fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
                //reset id field to base document id

@@ -216,7 +202,7 @@
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    private void indexChunk(String chunk, String sourceName, Map<String, Object> fields) throws IngesterException {
    private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
        if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
            //JMTODO: actually if the we couldn't get the image id it is set to -1,
            // but does this really mean we don't want to index it?
@ -1,46 +0,0 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2011-2019 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.keywordsearch;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Optional;
|
||||
|
||||
/**
|
||||
* Language.
|
||||
*
|
||||
* Contents which are detected to have these languages should be indexed to a corresponding language-specific field
|
||||
* such as content_ja.
|
||||
*/
|
||||
public enum Language {
|
||||
JAPANESE("ja");
|
||||
|
||||
private String value;
|
||||
|
||||
String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
static Optional<Language> fromValue(String value) {
|
||||
return Arrays.stream(Language.values()).filter(x -> x.value.equals(value)).findFirst();
|
||||
}
|
||||
|
||||
Language(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
}
|
@ -1,60 +0,0 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2011-2019 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.keywordsearch;
|
||||
|
||||
import com.optimaize.langdetect.LanguageDetectorBuilder;
|
||||
import com.optimaize.langdetect.i18n.LdLocale;
|
||||
import com.optimaize.langdetect.ngram.NgramExtractors;
|
||||
import com.optimaize.langdetect.profiles.LanguageProfileReader;
|
||||
import com.optimaize.langdetect.text.CommonTextObjectFactories;
|
||||
import com.optimaize.langdetect.text.TextObject;
|
||||
import com.optimaize.langdetect.text.TextObjectFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.Optional;
|
||||
|
||||
/**
|
||||
* Detects the language of the given contents. Only languages which should be indexed to a corresponding
|
||||
* language-specific field are detected.
|
||||
*/
|
||||
class LanguageDetector {
|
||||
|
||||
private com.optimaize.langdetect.LanguageDetector impl;
|
||||
private TextObjectFactory textObjectFactory;
|
||||
|
||||
LanguageDetector() {
|
||||
try {
|
||||
impl = LanguageDetectorBuilder.create(NgramExtractors.standard())
|
||||
.withProfiles(new LanguageProfileReader().readAllBuiltIn())
|
||||
.build();
|
||||
textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
|
||||
} catch (IOException e) {
|
||||
// The IOException here could occur when failing to read the language profiles from the classpath.
|
||||
// That can be considered to be a severe IO problem. Nothing can be done here.
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
Optional<Language> detect(String text) {
|
||||
TextObject textObject = textObjectFactory.forText(text);
|
||||
Optional<LdLocale> localeOpt = impl.detect(textObject).transform(Optional::of).or(Optional.empty());
|
||||
return localeOpt.map(LdLocale::getLanguage).flatMap(Language::fromValue);
|
||||
}
|
||||
}
|
@ -1,85 +0,0 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2011-2019 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.keywordsearch;
|
||||
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.openide.util.NbBundle;
|
||||
import org.sleuthkit.autopsy.healthmonitor.HealthMonitor;
|
||||
import org.sleuthkit.autopsy.healthmonitor.TimingMetric;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
/**
|
||||
* A helper class to support indexing language-specific fields.
|
||||
*/
|
||||
class LanguageSpecificContentIndexingHelper {
|
||||
|
||||
private final LanguageDetector languageDetector = new LanguageDetector();
|
||||
|
||||
Optional<Language> detectLanguageIfNeeded(Chunker.Chunk chunk) throws NoOpenCoreException {
|
||||
double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
|
||||
if (2.2 <= indexSchemaVersion) {
|
||||
return languageDetector.detect(chunk.toString());
|
||||
} else {
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
void updateLanguageSpecificFields(Map<String, Object> fields, Chunker.Chunk chunk, Language language) {
|
||||
List<String> values = new ArrayList<>();
|
||||
values.add(chunk.toString());
|
||||
if (fields.containsKey(Server.Schema.FILE_NAME.toString())) {
|
||||
values.add(fields.get(Server.Schema.FILE_NAME.toString()).toString());
|
||||
}
|
||||
|
||||
// index the chunk to a language specific field
|
||||
fields.put(Server.Schema.CONTENT_JA.toString(), values);
|
||||
fields.put(Server.Schema.LANGUAGE.toString(), language.getValue());
|
||||
}
|
||||
|
||||
void indexMiniChunk(Chunker.Chunk chunk, String sourceName, Map<String, Object> fields, String baseChunkID, Language language)
|
||||
throws Ingester.IngesterException {
|
||||
//Make a SolrInputDocument out of the field map
|
||||
SolrInputDocument updateDoc = new SolrInputDocument();
|
||||
for (String key : fields.keySet()) {
|
||||
updateDoc.addField(key, fields.get(key));
|
||||
}
|
||||
|
||||
try {
|
||||
updateDoc.setField(Server.Schema.ID.toString(), MiniChunkHelper.getChunkIdString(baseChunkID));
|
||||
|
||||
// index the chunk to a language specific field
|
||||
updateDoc.addField(Server.Schema.CONTENT_JA.toString(), chunk.toString().substring(chunk.getBaseChunkLength()));
|
||||
updateDoc.addField(Server.Schema.LANGUAGE.toString(), language.getValue());
|
||||
|
||||
TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");
|
||||
|
||||
KeywordSearch.getServer().addDocument(updateDoc);
|
||||
HealthMonitor.submitTimingMetric(metric);
|
||||
|
||||
} catch (KeywordSearchModuleException | NoOpenCoreException ex) {
|
||||
throw new Ingester.IngesterException(
|
||||
NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
|
||||
}
|
||||
}
|
||||
}
|
@ -1,248 +0,0 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2011-2019 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.keywordsearch;
|
||||
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.SolrRequest;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
|
||||
import org.sleuthkit.autopsy.coreutils.Version;
|
||||
import org.sleuthkit.datamodel.TskException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* A helper class to support querying documents which have language-specific fields.
|
||||
*/
|
||||
final class LanguageSpecificContentQueryHelper {
|
||||
|
||||
private LanguageSpecificContentQueryHelper() {}
|
||||
|
||||
private static final List<Server.Schema> QUERY_FIELDS = new ArrayList<>();
|
||||
private static final List<Server.Schema> LANGUAGE_SPECIFIC_CONTENT_FIELDS
|
||||
= Collections.singletonList(Server.Schema.CONTENT_JA);
|
||||
private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
|
||||
|
||||
static {
|
||||
QUERY_FIELDS.add(Server.Schema.TEXT);
|
||||
QUERY_FIELDS.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds query response for later processes related to language-specific fields
|
||||
*/
|
||||
static class QueryResults {
|
||||
List<SolrDocument> chunks = new ArrayList<>();
|
||||
Map</* ID */ String, SolrDocument> miniChunks = new HashMap<>();
|
||||
// objectId_chunk -> "text" -> List of previews
|
||||
Map<String, Map<String, List<String>>> highlighting = new HashMap<>();
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a query string from the given one by applying it to the multiple query fields
|
||||
*
|
||||
* @param queryStr escaped query string
|
||||
* @return query string
|
||||
*/
|
||||
static String expandQueryString(final String queryStr) {
|
||||
List<String> fieldQueries = new ArrayList<>();
|
||||
fieldQueries.add(Server.Schema.TEXT.toString() + ":" + queryStr);
|
||||
fieldQueries.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS.stream().map(field -> field.toString() + ":" + queryStr).collect(Collectors.toList()));
|
||||
return String.join(" OR ", fieldQueries);
|
||||
}
|
||||
|
||||
static List<Server.Schema> getQueryFields() {
|
||||
return QUERY_FIELDS;
|
||||
}
|
||||
|
||||
static void updateQueryResults(QueryResults results, SolrDocument document) {
|
||||
String id = (String) document.getFieldValue(Server.Schema.ID.toString());
|
||||
if (MiniChunkHelper.isMiniChunkID(id)) {
|
||||
results.miniChunks.put(MiniChunkHelper.getBaseChunkID(id), document);
|
||||
} else {
|
||||
results.chunks.add(document);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get snippets
|
||||
*
|
||||
* @param highlight field ID -> snippets
|
||||
* @return snippets of appropriate fields.
|
||||
* Note that this method returns {@code Optional.empty} if the result is empty for convenience to interact with the existing code.
|
||||
*/
|
||||
static Optional<List<String>> getHighlights(Map<String, List<String>> highlight) {
|
||||
for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
|
||||
if (highlight.containsKey(field.toString())) {
|
||||
return Optional.of(highlight.get(field.toString()));
|
||||
}
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge KeywordHits from TEXT field and a language specific field
|
||||
*
|
||||
* Replace KeywordHits in the given {@code matches} if its chunk ID is same.
|
||||
*/
|
||||
static List<KeywordHit> mergeKeywordHits(List<KeywordHit> matches, Keyword originalKeyword, QueryResults queryResults) throws KeywordSearchModuleException {
|
||||
Map<String, KeywordHit> map = findMatches(originalKeyword, queryResults).stream().collect(Collectors.toMap(KeywordHit::getSolrDocumentId, x -> x));
|
||||
List<KeywordHit> merged = new ArrayList<>();
|
||||
|
||||
// first, replace KeywordHit in matches
|
||||
for (KeywordHit match : matches) {
|
||||
String key = match.getSolrDocumentId();
|
||||
if (map.containsKey(key)) {
|
||||
merged.add(map.get(key));
|
||||
map.remove(key);
|
||||
} else {
|
||||
merged.add(match);
|
||||
}
|
||||
}
|
||||
// second, add rest of KeywordHits from queryResults
|
||||
merged.addAll(map.values());
|
||||
|
||||
return merged;
|
||||
}
|
||||
|
||||
static void configureTermfreqQuery(SolrQuery query, String keyword) throws KeywordSearchModuleException, NoOpenCoreException {
|
||||
// make a request to Solr to parse query.
|
||||
QueryTermHelper.Result queryParserResult = QueryTermHelper.parse(keyword, LANGUAGE_SPECIFIC_CONTENT_FIELDS);
|
||||
query.addField(buildTermfreqQuery(keyword, queryParserResult));
|
||||
}
|
||||
|
||||
static String buildTermfreqQuery(String keyword, QueryTermHelper.Result result) {
|
||||
List<String> termfreqs = new ArrayList<>();
|
||||
for (Map.Entry<String, List<String>> e : result.fieldTermsMap.entrySet()) {
|
||||
String field = e.getKey();
|
||||
for (String term : e.getValue()) {
|
||||
termfreqs.add(String.format("termfreq(\"%s\",\"%s\")", field, KeywordSearchUtil.escapeLuceneQuery(term)));
|
||||
}
|
||||
}
|
||||
|
||||
// sum of all language specific query fields.
|
||||
// only one of these fields could be non-zero.
|
||||
return String.format("termfreq:sum(%s)", String.join(",", termfreqs));
|
||||
}
|
||||
|
||||
static int queryChunkTermfreq(Set<String> keywords, String contentID) throws KeywordSearchModuleException, NoOpenCoreException {
|
||||
SolrQuery q = new SolrQuery();
|
||||
q.setShowDebugInfo(DEBUG);
|
||||
|
||||
final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentID);
|
||||
final String highlightQuery = keywords.stream()
|
||||
.map(s -> LanguageSpecificContentQueryHelper.expandQueryString(
|
||||
KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
|
||||
.collect(Collectors.joining(" "));
|
||||
|
||||
q.addFilterQuery(filterQuery);
|
||||
q.setQuery(highlightQuery);
|
||||
LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
|
||||
|
||||
QueryResponse response = KeywordSearch.getServer().query(q, SolrRequest.METHOD.POST);
|
||||
SolrDocumentList results = response.getResults();
|
||||
if (results.isEmpty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
SolrDocument document = results.get(0);
|
||||
return ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
|
||||
}
|
||||
|
||||
static int findNthIndexOf(String s, String pattern, int n) {
|
||||
int found = 0;
|
||||
int idx = -1;
|
||||
int len = s.length();
|
||||
while (idx < len && found <= n) {
|
||||
idx = s.indexOf(pattern, idx + 1);
|
||||
if (idx == -1) {
|
||||
break;
|
||||
}
|
||||
found++;
|
||||
}
|
||||
|
||||
return idx;
|
||||
}
|
||||
|
||||
private static List<KeywordHit> findMatches(Keyword originalKeyword, QueryResults queryResults) throws KeywordSearchModuleException {
|
||||
List<KeywordHit> matches = new ArrayList<>();
|
||||
for (SolrDocument document : queryResults.chunks) {
|
||||
String docId = (String) document.getFieldValue(Server.Schema.ID.toString());
|
||||
|
||||
try {
|
||||
int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
|
||||
SolrDocument miniChunk = queryResults.miniChunks.get(docId);
|
||||
if (miniChunk == null) {
|
||||
// last chunk does not have mini chunk because there's no overlapped region with next one
|
||||
matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
|
||||
} else {
|
||||
int hitCountInMiniChunk = ((Float) miniChunk.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
|
||||
if (hitCountInMiniChunk < hitCountInChunk) {
|
||||
// there are at least one hit in base chunk
|
||||
matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
|
||||
}
|
||||
}
|
||||
} catch (TskException ex) {
|
||||
throw new KeywordSearchModuleException(ex);
|
||||
}
|
||||
}
|
||||
return matches;
|
||||
}
|
||||
|
||||
/**
|
||||
* copied from LuceneQuery and modified to use getHighlightFieldValue
|
||||
*/
|
||||
private static KeywordHit createKeywordHit(Keyword originalKeyword, Map<String, Map<String, List<String>>> highlightResponse, String docId) throws TskException {
|
||||
/**
|
||||
* Get the first snippet from the document if keyword search is
|
||||
* configured to use snippets.
|
||||
*/
|
||||
String snippet = "";
|
||||
if (KeywordSearchSettings.getShowSnippets()) {
|
||||
List<String> snippetList = getHighlightFieldValue(highlightResponse.get(docId)).orElse(null);
|
||||
// list is null if there wasn't a snippet
|
||||
if (snippetList != null) {
|
||||
snippet = EscapeUtil.unEscapeHtml(snippetList.get(0)).trim();
|
||||
}
|
||||
}
|
||||
|
||||
return new KeywordHit(docId, snippet, originalKeyword.getSearchTerm());
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Optional.empty if empty
|
||||
*/
|
||||
private static Optional<List<String>> getHighlightFieldValue(Map<String, List<String>> highlight) {
|
||||
for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
|
||||
if (highlight.containsKey(field.toString())) {
|
||||
return Optional.of(highlight.get(field.toString()));
|
||||
}
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
@ -134,7 +134,6 @@ class LuceneQuery implements KeywordSearchQuery {
        String cursorMark = CursorMarkParams.CURSOR_MARK_START;
        boolean allResultsProcessed = false;
        List<KeywordHit> matches = new ArrayList<>();
        LanguageSpecificContentQueryHelper.QueryResults languageSpecificQueryResults = new LanguageSpecificContentQueryHelper.QueryResults();
        while (!allResultsProcessed) {
            solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
            QueryResponse response = solrServer.query(solrQuery, SolrRequest.METHOD.POST);
@ -142,18 +141,7 @@ class LuceneQuery implements KeywordSearchQuery {
            // objectId_chunk -> "text" -> List of previews
            Map<String, Map<String, List<String>>> highlightResponse = response.getHighlighting();

            if (2.2 <= indexSchemaVersion) {
                languageSpecificQueryResults.highlighting.putAll(response.getHighlighting());
            }

            for (SolrDocument resultDoc : resultList) {
                if (2.2 <= indexSchemaVersion) {
                    Object language = resultDoc.getFieldValue(Server.Schema.LANGUAGE.toString());
                    if (language != null) {
                        LanguageSpecificContentQueryHelper.updateQueryResults(languageSpecificQueryResults, resultDoc);
                    }
                }

                try {
                    /*
                     * for each result doc, check that the first occurence of
@ -165,11 +153,6 @@ class LuceneQuery implements KeywordSearchQuery {
                    final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
                    final Collection<Object> content = resultDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());

                    // if the document has language, it should be hit in language specific content fields. So skip here.
                    if (resultDoc.containsKey(Server.Schema.LANGUAGE.toString())) {
                        continue;
                    }

                    if (indexSchemaVersion < 2.0) {
                        //old schema versions don't support chunk_size or the content_str fields, so just accept hits
                        matches.add(createKeywordtHit(highlightResponse, docId));
@ -196,16 +179,9 @@ class LuceneQuery implements KeywordSearchQuery {
            cursorMark = nextCursorMark;
        }

        List<KeywordHit> mergedMatches;
        if (2.2 <= indexSchemaVersion) {
            mergedMatches = LanguageSpecificContentQueryHelper.mergeKeywordHits(matches, originalKeyword, languageSpecificQueryResults);
        } else {
            mergedMatches = matches;
        }

        QueryResults results = new QueryResults(this);
        //in case of single term literal query there is only 1 term
        results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), mergedMatches);
        results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), matches);

        return results;
    }
@ -286,9 +262,7 @@ class LuceneQuery implements KeywordSearchQuery {
     *
     * @return
     */
    private SolrQuery createAndConfigureSolrQuery(boolean snippets) throws NoOpenCoreException, KeywordSearchModuleException {
        double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());

    private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
        SolrQuery q = new SolrQuery();
        q.setShowDebugInfo(DEBUG); //debug
        // Wrap the query string in quotes if this is a literal search term.
@ -299,12 +273,8 @@ class LuceneQuery implements KeywordSearchQuery {
        if (field != null) {
            //use the optional field
            queryStr = field + ":" + queryStr;
            q.setQuery(queryStr);
        } else if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
            q.setQuery(LanguageSpecificContentQueryHelper.expandQueryString(queryStr));
        } else {
            q.setQuery(queryStr);
        }
        q.setQuery(queryStr);
        q.setRows(MAX_RESULTS_PER_CURSOR_MARK);
        // Setting the sort order is necessary for cursor based paging to work.
        q.setSort(SolrQuery.SortClause.asc(Server.Schema.ID.toString()));
@ -313,11 +283,6 @@ class LuceneQuery implements KeywordSearchQuery {
                Server.Schema.CHUNK_SIZE.toString(),
                Server.Schema.CONTENT_STR.toString());

        if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
            q.addField(Server.Schema.LANGUAGE.toString());
            LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywordStringEscaped);
        }

        for (KeywordQueryFilter filter : filters) {
            q.addFilterQuery(filter.toString());
        }
@ -335,16 +300,8 @@ class LuceneQuery implements KeywordSearchQuery {
     *
     * @param q The SolrQuery to configure.
     */
    private static void configurwQueryForHighlighting(SolrQuery q) throws NoOpenCoreException {
        double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
        if (2.2 <= indexSchemaVersion) {
            for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
                q.addHighlightField(field.toString());
            }
        } else {
    private static void configurwQueryForHighlighting(SolrQuery q) {
        q.addHighlightField(HIGHLIGHT_FIELD);
    }

        q.setHighlightSnippets(1);
        q.setHighlightFragsize(SNIPPET_LENGTH);

@ -447,13 +404,7 @@ class LuceneQuery implements KeywordSearchQuery {
        if (responseHighlightID == null) {
            return "";
        }
        double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
        List<String> contentHighlights;
        if (2.2 <= indexSchemaVersion) {
            contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
        } else {
            contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
        }
        List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
        if (contentHighlights == null) {
            return "";
        } else {
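The expandQueryString call removed above is what routed a literal term to the language-specific fields; the removed unit test further down in this diff pins its shape to "text:query OR content_ja:query". A standalone sketch of that expansion, assuming the field list from the test rather than reading it from the Solr schema:

// Illustrative only: OR a literal query across the generic text field and each
// language-specific content field. The field names are assumed from the removed test.
import java.util.List;
import java.util.stream.Collectors;

public class ExpandQuerySketch {

    static String expand(String queryStr, List<String> languageSpecificFields) {
        // e.g. "text:query OR content_ja:query" for queryStr = "query"
        return "text:" + queryStr + languageSpecificFields.stream()
                .map(field -> " OR " + field + ":" + queryStr)
                .collect(Collectors.joining());
    }

    public static void main(String[] args) {
        System.out.println(expand("query", List.of("content_ja")));
    }
}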
@ -1,41 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

/**
 * Mini-chunk related methods.
 */
final class MiniChunkHelper {

    private MiniChunkHelper() {}

    static String SUFFIX = "_mini";

    static String getChunkIdString(String baseChunkID) {
        return baseChunkID + SUFFIX;
    }

    static boolean isMiniChunkID(String chunkID) {
        return chunkID.endsWith(SUFFIX);
    }

    static String getBaseChunkID(String miniChunkID) {
        return miniChunkID.replaceFirst(SUFFIX + "$", "");
    }
}
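The deleted helper encodes a single naming convention: a mini chunk is its base chunk's ID plus a "_mini" suffix. A quick round trip with a hypothetical chunk ID "42_7", using plain string operations in place of the helper methods:

// Round trip of the "_mini" chunk-ID convention; "42_7" is an arbitrary example value.
public class MiniChunkIdSketch {
    public static void main(String[] args) {
        String base = "42_7";
        String mini = base + "_mini";                         // getChunkIdString(base)
        System.out.println(mini.endsWith("_mini"));           // isMiniChunkID(mini)   -> true
        System.out.println(mini.replaceFirst("_mini$", ""));  // getBaseChunkID(mini)  -> "42_7"
    }
}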
@ -1,95 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.FieldAnalysisRequest;
import org.apache.solr.client.solrj.response.AnalysisResponseBase;
import org.apache.solr.client.solrj.response.FieldAnalysisResponse;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * Get terms from query using Solr.
 *
 * This class is used to find matched terms from query results.
 */
final class QueryTermHelper {

    private QueryTermHelper() {}

    /**
     * Result of {@link #parse} method
     */
    static class Result {
        /**
         * field name -> [term]
         */
        final Map<String, List<String>> fieldTermsMap = new HashMap<>();
    }

    /**
     * Parse the given query string on Solr and return the result
     *
     * @param query query to parse
     * @param fields field names to use for parsing
     */
    static Result parse(String query, List<Server.Schema> fields) throws KeywordSearchModuleException, NoOpenCoreException {
        Server server = KeywordSearch.getServer();

        FieldAnalysisRequest request = new FieldAnalysisRequest();
        for (Server.Schema field : fields) {
            request.addFieldName(field.toString());
        }
        // FieldAnalysisRequest requires to set its field value property,
        // while the corresponding analysis.fieldvalue parameter is not needed in the API.
        // Setting an empty value does not effect on the result.
        request.setFieldValue("");
        request.setQuery(query);

        FieldAnalysisResponse response = new FieldAnalysisResponse();
        try {
            response.setResponse(server.request(request));
        } catch (SolrServerException e) {
            throw new KeywordSearchModuleException(e);
        }

        Result result = new Result();
        for (Map.Entry<String, FieldAnalysisResponse.Analysis> entry : response.getAllFieldNameAnalysis()) {
            Iterator<AnalysisResponseBase.AnalysisPhase> it = entry.getValue().getQueryPhases().iterator();

            // The last phase is the one which is used in the search process.
            AnalysisResponseBase.AnalysisPhase lastPhase = null;
            while (it.hasNext()) {
                lastPhase = it.next();
            }

            if (lastPhase != null) {
                List<String> tokens = lastPhase.getTokens().stream().map(AnalysisResponseBase.TokenInfo::getText).collect(Collectors.toList());
                result.fieldTermsMap.put(entry.getKey(), tokens);
            }
        }

        return result;
    }
}
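QueryTermHelper only collects the analyzed terms per field; the removed buildTermfreqQuery test later in this diff shows how those terms were folded into a single pseudo-field. A sketch that reproduces the string the test expects, from an illustrative field-to-terms map (the method name and map contents here are made up):

// Each (field, term) pair becomes a termfreq(...) call; the calls are summed under "termfreq:".
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class TermfreqQuerySketch {

    static String buildTermfreqExpression(Map<String, List<String>> fieldTermsMap) {
        String args = fieldTermsMap.entrySet().stream()
                .flatMap(e -> e.getValue().stream()
                        .map(term -> "termfreq(\"" + e.getKey() + "\",\"" + term + "\")"))
                .collect(Collectors.joining(","));
        return "termfreq:sum(" + args + ")";
    }

    public static void main(String[] args) {
        Map<String, List<String>> fieldTerms = new LinkedHashMap<>();
        fieldTerms.put("field1", List.of("term1"));
        fieldTerms.put("field2", List.of("term1", "term2"));
        // termfreq:sum(termfreq("field1","term1"),termfreq("field2","term1"),termfreq("field2","term2"))
        System.out.println(buildTermfreqExpression(fieldTerms));
    }
}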
@ -130,18 +130,6 @@ public class Server {
                return "content_ws"; //NON-NLS
            }
        },
        CONTENT_JA {
            @Override
            public String toString() {
                return "content_ja"; //NON-NLS
            }
        },
        LANGUAGE {
            @Override
            public String toString() {
                return "language"; //NON-NLS
            }
        },
        FILE_NAME {
            @Override
            public String toString() {
@ -187,17 +175,6 @@ public class Server {
            public String toString() {
                return "chunk_size"; //NON-NLS
            }
        },
        /**
         * termfreq is a function which returns the number of times the term appears.
         * This is not an actual field defined in schema.xml, but can be gotten from returned documents
         * in the same way as fields.
         */
        TERMFREQ {
            @Override
            public String toString() {
                return "termfreq"; //NON-NLS
            }
        }
    };

@ -1658,8 +1635,7 @@ public class Server {
    private int queryNumFileChunks(long contentID) throws SolrServerException, IOException {
        String id = KeywordSearchUtil.escapeLuceneQuery(Long.toString(contentID));
        final SolrQuery q
                = new SolrQuery(Server.Schema.ID + ":" + id + Server.CHUNK_ID_SEPARATOR + "*"
                        + " NOT " + Server.Schema.ID + ":*" + MiniChunkHelper.SUFFIX);
                = new SolrQuery(Server.Schema.ID + ":" + id + Server.CHUNK_ID_SEPARATOR + "*");
        q.setRows(0);
        return (int) query(q).getResults().getNumFound();
    }
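The hunk above narrows queryNumFileChunks back to a single wildcard query once mini chunks are gone. For a hypothetical object ID of 42 the two query strings compare as follows; the "id" field name and "_" separator are written out literally here as assumptions taken from the chunk IDs in the tests below, where the code uses Server.Schema.ID and Server.CHUNK_ID_SEPARATOR:

// Sketch of the two chunk-count queries; field name and separator are assumed values.
public class ChunkCountQuerySketch {
    public static void main(String[] args) {
        String id = "42";
        // query after the revert: counts every chunk document, overlap documents included
        String allChunks = "id:" + id + "_" + "*";
        // query being removed: excludes the "_mini" overlap documents from the count
        String baseChunksOnly = allChunks + " NOT id:*" + "_mini";
        System.out.println(allChunks);        // id:42_*
        System.out.println(baseChunksOnly);   // id:42_* NOT id:*_mini
    }
}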
@ -1,59 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import org.junit.Test;

import java.util.Arrays;

import static org.junit.Assert.assertEquals;

/**
 * tests for LanguageSpecificContentQueryHelper
 */
public class LanguageSpecificContentQueryHelperTest {

    @Test
    public void makeQueryString() {
        assertEquals("text:query OR content_ja:query", LanguageSpecificContentQueryHelper.expandQueryString("query"));
    }

    @Test
    public void findNthIndexOf() {
        assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "_", 0));
        assertEquals(0, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 0));
        assertEquals(2, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 1));
        assertEquals(3, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 2));
        assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 3));
        assertEquals(0, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "", 0));
        assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("", "A", 0));
        assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", -1));
        assertEquals(-1, LanguageSpecificContentQueryHelper.findNthIndexOf("A1AA45", "A", 999));
    }

    @Test
    public void buildTermfreqQuery() {
        QueryTermHelper.Result result = new QueryTermHelper.Result();
        result.fieldTermsMap.put("field1", Arrays.asList("term1"));
        result.fieldTermsMap.put("field2", Arrays.asList("term1", "term2"));
        assertEquals(
                "termfreq:sum(termfreq(\"field1\",\"term1\"),termfreq(\"field2\",\"term1\"),termfreq(\"field2\",\"term2\"))",
                LanguageSpecificContentQueryHelper.buildTermfreqQuery("query", result));
    }
}
@ -1,46 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import org.junit.Assert;
import org.junit.Test;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

/**
 * tests for MiniChunkHelper
 */
public class MiniChunkHelperTest {

    @Test
    public void isMiniChunkID() {
        assertTrue(MiniChunkHelper.isMiniChunkID("1_1_mini"));
        assertFalse(MiniChunkHelper.isMiniChunkID("1_1"));
        assertFalse(MiniChunkHelper.isMiniChunkID("1"));
    }

    @Test
    public void getBaseChunkID() {
        Assert.assertEquals("1_1", MiniChunkHelper.getBaseChunkID("1_1_mini"));
        Assert.assertEquals("1_1", MiniChunkHelper.getBaseChunkID("1_1"));
        Assert.assertEquals("1", MiniChunkHelper.getBaseChunkID("1"));
    }

}