Revert "Removed workaround for bug in Solr 4.0."

This reverts commit 46389f13f509ab47d21d9d4eb32037df8eb6008a.
2025-07-12 16:06:15 +00:00 · 2015-03-30 15:10:38 -04:00 · 2015-03-30 15:10:38 -04:00 · c8b8530386
commit c8b8530386
parent edf2b8018b
1 changed files with 34 additions and 19 deletions
--- a/KeywordSearch/release/solr/solr/conf/schema.xml
+++ b/KeywordSearch/release/solr/solr/conf/schema.xml
@ -209,8 +209,9 @@
    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <!-- workaround for a bug in Solr 4.0 when setting LimitTokenCountFilterFactory maxTokenCount: the limit is passed indirectly through the "val" attribute; may change back to a single attribute in the future -->
        <!-- 200000 token limit ensures we are indexing entire 1MB chunk of meaningful tokens, increase the limit for larger chunks -->
-        <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="200000"/>
+        <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="val" val="200000"/>
      </analyzer>
    </fieldType>
 	
@ -222,8 +223,9 @@
    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
-        <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="200000"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <!-- workaround for a bug in Solr 4.0 when setting LimitTokenCountFilterFactory maxTokenCount: the limit is passed indirectly through the "val" attribute; may change back to a single attribute in the future -->
+        <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="val" val="200000"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_en.txt" enablePositionIncrements="true" />
        <!-- in this example, we will only use synonyms at query time
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
@ -231,12 +233,12 @@
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
-        <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="200000"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
-        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <!-- workaround for a bug in Solr 4.0 when setting LimitTokenCountFilterFactory maxTokenCount: the limit is passed indirectly through the "val" attribute; may change back to a single attribute in the future -->
+        <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="val" val="200000"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_en.txt" enablePositionIncrements="true" />
+        <!--<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>-->
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
-	
    </fieldType>

    <!-- A text field with defaults appropriate for English: it
@ -507,17 +509,19 @@
   <!-- use image_id to easily search a specific image only -->
   <field name="image_id" type="string" indexed="true" stored="false" required="true" /> 
   
-    <!-- Autopsy pushes text to the content field and gets the text to display from it.  It is copied to other places -->
-   <field name="content" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
+    <!-- Autopsy pushes text to this field.  The field itself is neither indexed nor stored; its value is copied to other fields (see the copyField directives) for indexing and retrieval -->
+   <field name="content" type="string" indexed="false" stored="false" />
   
   <!-- The strings field holds strings extracted from files that SolrCell doesn't support -->
   <!--<field name="strings" type="text_general" indexed="true" stored="true"/>-->
   
+   <!-- NOTE: file_name gets copied later to other fields for searching -->
   <field name="file_name" type="text_general" indexed="false" stored="true"/>
   <field name="ctime" type="tdate" indexed="false" stored="false"/>
   <field name="atime" type="tdate" indexed="false" stored="false"/>
   <field name="mtime" type="tdate" indexed="false" stored="false"/>
   <field name="crtime" type="tdate" indexed="false" stored="false"/>
+
   <!-- file chunk-specific fields (optional for others) -->
   <!-- for a parent file with no content, number of chunks are specified -->
   <field name="num_chunks" type="int" indexed="true" stored="true" required="false" />
@ -527,6 +531,7 @@
     Some fields are multiValued only because Tika currently may return
     multiple values for them.
   -->
+   <!--
   <field name="title" type="text_general" indexed="false" stored="false" multiValued="true"/>
   <field name="subject" type="text_general" indexed="false" stored="false"/>
   <field name="description" type="text_general" indexed="false" stored="false"/>
@ -537,13 +542,14 @@
   <field name="content_type" type="string" indexed="false" stored="false" multiValued="true"/>
   <field name="last_modified" type="date" indexed="false" stored="false"/>
   <field name="links" type="string" indexed="false" stored="false" multiValued="true"/>
+    -->
    
   <!-- Tika places all metadata into a multivalued field named "meta" -->
-   <field name="meta" type="text_general" indexed="true" stored="true" multiValued="true"/>
+   <!--<field name="meta" type="text_general" indexed="true" stored="true" multiValued="true"/> -->

   <!-- catchall field, containing all other searchable text fields (implemented
        via copyField further on in this schema) -->
-   <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
+   <field name="text" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" multiValued="true"/>

   <!-- catchall text field that indexes tokens both normally and in reverse for efficient
        leading wildcard queries. -->
@ -552,7 +558,7 @@
   <!-- field with white-space tokenized words for TermsComponent regex search (useful for fast search of IP addresses, URLs, certain phone numbers);
 		also useful for Lucene-based queries containing special characters -->
   <!-- populated via copyField -->
-   <field name="content_ws" type="text_ws" indexed="true" stored="false" /> 
+   <field name="content_ws" type="text_ws" indexed="true" stored="false" multiValued="true" /> 
 	
   <!-- Uncommenting the following will create a "timestamp" field using
        a default value of "NOW" to indicate when each document was indexed.
@ -569,6 +575,7 @@
        EXAMPLE:  name="*_i" will match any field ending in _i (like myid_i, z_i)
        Longer patterns will be matched first.  if equal size patterns
        both match, the first appearing in the schema will be used.  -->
+        <!--
   <dynamicField name="*_i"  type="int"    indexed="true"  stored="true"/>
   <dynamicField name="*_s"  type="string"  indexed="true"  stored="true"/>
   <dynamicField name="*_l"  type="long"   indexed="true"  stored="true"/>
@ -577,15 +584,19 @@
   <dynamicField name="*_b"  type="boolean" indexed="true"  stored="true"/>
   <dynamicField name="*_f"  type="float"  indexed="true"  stored="true"/>
   <dynamicField name="*_d"  type="double" indexed="true"  stored="true"/>
+    -->
    
   <!-- Type used to index the lat and lon components for the "location" FieldType -->
-   <dynamicField name="*_coordinate"  type="tdouble" indexed="true"  stored="false"/>
+   <!--
+    <dynamicField name="*_coordinate"  type="tdouble" indexed="true"  stored="false"/>

   <dynamicField name="*_dt" type="date"    indexed="true"  stored="true"/>
   <dynamicField name="*_p"  type="location" indexed="true" stored="true"/>
+    -->
    
   <!-- some trie-coded dynamic fields for faster range queries -->
-   <dynamicField name="*_ti" type="tint"    indexed="true"  stored="true"/>
+   <!--
+    <dynamicField name="*_ti" type="tint"    indexed="true"  stored="true"/>
   <dynamicField name="*_tl" type="tlong"   indexed="true"  stored="true"/>
   <dynamicField name="*_tf" type="tfloat"  indexed="true"  stored="true"/>
   <dynamicField name="*_td" type="tdouble" indexed="true"  stored="true"/>
@ -597,6 +608,7 @@
   <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>

   <dynamicField name="random_*" type="random" />
+    -->
    
   <!-- uncomment the following to ignore any fields that don't already match an existing 
        field name or dynamic field, rather than reporting them as an error. 
@ -623,9 +635,12 @@

   <copyField source="content" dest="text"/>
   <copyField source="file_name" dest="text"/>
-   <copyField source="meta" dest="text"/>
+   <!--<copyField source="meta" dest="text"/>-->
   <!--<copyField source="strings" dest="text"/>-->
+   
   <copyField source="content" dest="content_ws"/>
+   <copyField source="file_name" dest="content_ws"/>
+   <!--<copyField source="meta" dest="content_ws"/>-->
 	
   <!-- Above, multiple source fields are copied to the [text] field. 
 	  Another way to map multiple source fields to the same