Revert "Removed workaround for bug in Solr 4.0."

This reverts commit 46389f13f509ab47d21d9d4eb32037df8eb6008a.
This commit is contained in:
Eamonn Saunders 2015-03-30 15:10:38 -04:00
parent edf2b8018b
commit c8b8530386

View File

@ -209,8 +209,9 @@
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<!-- Workaround for a bug in Solr 4.0 when setting LimitTokenCountFilterFactory maxTokenCount; this might change to a single attribute in the future -->
<!-- The 200000-token limit ensures we index an entire 1MB chunk of meaningful tokens; increase the limit for larger chunks -->
<filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="200000"/>
<filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="val" val="200000"/>
</analyzer>
</fieldType>
@ -222,8 +223,9 @@
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="200000"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<!-- Workaround for a bug in Solr 4.0 when setting LimitTokenCountFilterFactory maxTokenCount; this might change to a single attribute in the future -->
<filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="val" val="200000"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_en.txt" enablePositionIncrements="true" />
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
@ -231,12 +233,12 @@
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="200000"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<!-- Workaround for a bug in Solr 4.0 when setting LimitTokenCountFilterFactory maxTokenCount; this might change to a single attribute in the future -->
<filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="val" val="200000"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_en.txt" enablePositionIncrements="true" />
<!--<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>-->
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<!-- A text field with defaults appropriate for English: it
@ -507,17 +509,19 @@
<!-- use image_id to easily search a specific image only -->
<field name="image_id" type="string" indexed="true" stored="false" required="true" />
<!-- Autopsy pushes text to the content field and gets the text to display from it. It is copied to other places -->
<field name="content" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
<!-- Autopsy pushes text to this field and gets the text to display from it. It is copied to other places -->
<field name="content" type="string" indexed="false" stored="false" />
<!-- The strings field holds strings extracted from files that SolrCell doesn't support -->
<!--<field name="strings" type="text_general" indexed="true" stored="true"/>-->
<!-- NOTE: file_name gets copied later to other fields for searching -->
<field name="file_name" type="text_general" indexed="false" stored="true"/>
<field name="ctime" type="tdate" indexed="false" stored="false"/>
<field name="atime" type="tdate" indexed="false" stored="false"/>
<field name="mtime" type="tdate" indexed="false" stored="false"/>
<field name="crtime" type="tdate" indexed="false" stored="false"/>
<!-- file chunk-specific fields (optional for others) -->
<!-- for a parent file with no content, the number of chunks is specified -->
<field name="num_chunks" type="int" indexed="true" stored="true" required="false" />
@ -527,6 +531,7 @@
Some fields are multiValued only because Tika currently may return
multiple values for them.
-->
<!--
<field name="title" type="text_general" indexed="false" stored="false" multiValued="true"/>
<field name="subject" type="text_general" indexed="false" stored="false"/>
<field name="description" type="text_general" indexed="false" stored="false"/>
@ -537,13 +542,14 @@
<field name="content_type" type="string" indexed="false" stored="false" multiValued="true"/>
<field name="last_modified" type="date" indexed="false" stored="false"/>
<field name="links" type="string" indexed="false" stored="false" multiValued="true"/>
-->
<!-- Tika places all metadata into a multivalued field named "meta" -->
<field name="meta" type="text_general" indexed="true" stored="true" multiValued="true"/>
<!--<field name="meta" type="text_general" indexed="true" stored="true" multiValued="true"/> -->
<!-- catchall field, containing all other searchable text fields (implemented
via copyField further on in this schema) -->
<field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
<field name="text" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>
<!-- catchall text field that indexes tokens both normally and in reverse for efficient
leading wildcard queries. -->
@ -552,7 +558,7 @@
<!-- field with white-space tokenized words for TermsComponent regex search (useful for fast search of IP addresses, URLs, certain phone numbers);
also useful for Lucene-based queries containing special characters -->
<!-- populated via copyField -->
<field name="content_ws" type="text_ws" indexed="true" stored="false" />
<field name="content_ws" type="text_ws" indexed="true" stored="false" multiValued="true" />
<!-- Uncommenting the following will create a "timestamp" field using
a default value of "NOW" to indicate when each document was indexed.
@ -569,6 +575,7 @@
EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
Longer patterns will be matched first. If equal-size patterns
both match, the first appearing in the schema will be used. -->
<!--
<dynamicField name="*_i" type="int" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
@ -577,15 +584,19 @@
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_f" type="float" indexed="true" stored="true"/>
<dynamicField name="*_d" type="double" indexed="true" stored="true"/>
-->
<!-- Type used to index the lat and lon components for the "location" FieldType -->
<dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false"/>
<!--
<dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false"/>
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
<dynamicField name="*_p" type="location" indexed="true" stored="true"/>
-->
<!-- some trie-coded dynamic fields for faster range queries -->
<dynamicField name="*_ti" type="tint" indexed="true" stored="true"/>
<!--
<dynamicField name="*_ti" type="tint" indexed="true" stored="true"/>
<dynamicField name="*_tl" type="tlong" indexed="true" stored="true"/>
<dynamicField name="*_tf" type="tfloat" indexed="true" stored="true"/>
<dynamicField name="*_td" type="tdouble" indexed="true" stored="true"/>
@ -597,6 +608,7 @@
<dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="random_*" type="random" />
-->
<!-- uncomment the following to ignore any fields that don't already match an existing
field name or dynamic field, rather than reporting them as an error.
@ -623,9 +635,12 @@
<copyField source="content" dest="text"/>
<copyField source="file_name" dest="text"/>
<copyField source="meta" dest="text"/>
<!--<copyField source="meta" dest="text"/>-->
<!--<copyField source="strings" dest="text"/>-->
<copyField source="content" dest="content_ws"/>
<copyField source="file_name" dest="content_ws"/>
<!--<copyField source="meta" dest="content_ws"/>-->
<!-- Above, multiple source fields are copied to the [text] field.
Another way to map multiple source fields to the same