adding new comments for solr characters

This commit is contained in:
momo 2015-10-05 09:45:30 -04:00
parent 1a05a07cc6
commit b3ae70f2b9

View File

@ -66,8 +66,10 @@ class AbstractFileChunk {
return true; return true;
} }
// Given a byte array, filter out all occurrences of invalid (illegal) UTF-8 // Given a byte array, filter out all occurrences of non-characters
// characters and replace them with the question mark character (?) // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
// and non-printable control characters except tab, newline and carriage return
// and replace them with the question mark character (?)
private static byte[] sanitize(byte[] input) { private static byte[] sanitize(byte[] input) {
Charset charset = Charset.forName("UTF-8"); // NON-NLS Charset charset = Charset.forName("UTF-8"); // NON-NLS
String inputString = new String(input, charset); String inputString = new String(input, charset);