adding new comments for solr characters

This commit is contained in:
momo 2015-10-05 09:45:30 -04:00
parent 1a05a07cc6
commit b3ae70f2b9

View File

@@ -66,8 +66,10 @@ class AbstractFileChunk {
return true;
}
// Given a byte array, filter out all occurrences of invalid (illegal) UTF-8
// characters and replace them with the question mark character (?)
// Given a byte array, filter out all occurrences of non-characters
// (see http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:])
// and non-printable control characters, except tab, new line and carriage return,
// and replace them with the question mark character (?)
private static byte[] sanitize(byte[] input) {
Charset charset = Charset.forName("UTF-8"); // NON-NLS
String inputString = new String(input, charset);