use same given charset

This commit is contained in:
momo 2015-10-15 17:08:00 -04:00
parent e0547aaebf
commit bb0dc09925

View File

@ -50,12 +50,14 @@ class AbstractFileChunk {
* *
* @return * @return
*/ */
public String getIdString() { String getIdString() {
return Server.getChunkIdString(this.parent.getSourceFile().getId(), this.chunkID); return Server.getChunkIdString(this.parent.getSourceFile().getId(), this.chunkID);
} }
public boolean index(Ingester ingester, byte[] content, long contentSize, Charset indexCharset) throws IngesterException { void index(Ingester ingester, byte[] content, long contentSize, Charset indexCharset) throws IngesterException {
byte[] saitizedContent = sanitize(content); // We are currently only passing utf-8 as indexCharset. If other charsets were to be used in the future,
// this might need to be changed to accommodate.
byte[] saitizedContent = sanitize(content, indexCharset);
ByteContentStream bcs = new ByteContentStream(saitizedContent, contentSize, parent.getSourceFile(), indexCharset); ByteContentStream bcs = new ByteContentStream(saitizedContent, contentSize, parent.getSourceFile(), indexCharset);
try { try {
ingester.ingest(this, bcs, content.length); ingester.ingest(this, bcs, content.length);
@ -63,16 +65,14 @@ class AbstractFileChunk {
throw new IngesterException(NbBundle.getMessage(this.getClass(), "AbstractFileChunk.index.exception.msg", throw new IngesterException(NbBundle.getMessage(this.getClass(), "AbstractFileChunk.index.exception.msg",
parent.getSourceFile().getId(), chunkID), ingEx); parent.getSourceFile().getId(), chunkID), ingEx);
} }
return true;
} }
// Given a byte array, filter out all occurrences of non-characters // Given a byte array, filter out all occurrences of non-characters
// http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:] // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
// and non-printable control characters except tabulator, new line and carriage return // and non-printable control characters except tabulator, new line and carriage return
// and replace them with the character (^) // and replace them with the character (^)
private static byte[] sanitize(byte[] input) { private static byte[] sanitize(byte[] input, Charset indexCharset) {
Charset charset = Charset.forName("UTF-8"); // NON-NLS String inputString = new String(input, indexCharset);
String inputString = new String(input, charset);
StringBuilder sanitized = new StringBuilder(inputString.length()); StringBuilder sanitized = new StringBuilder(inputString.length());
char ch; char ch;
for (int i = 0; i < inputString.length(); i++) { for (int i = 0; i < inputString.length(); i++) {
@ -84,7 +84,7 @@ class AbstractFileChunk {
} }
} }
byte[] output = sanitized.toString().getBytes(charset); byte[] output = sanitized.toString().getBytes(indexCharset);
return output; return output;
} }