Merge pull request #1463 from sidheshenator/sanitize_utf-8-solr

invalid utf-8 data sanitized before feeding to the Solr Ingester
This commit is contained in:
Brian Carrier 2015-07-30 10:53:39 -04:00
commit 1eb39955fc
2 changed files with 23 additions and 3 deletions

View File

@ -19,9 +19,15 @@
package org.sleuthkit.autopsy.keywordsearch;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.logging.Level;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
/**
@ -30,10 +36,16 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
class AbstractFileChunk {
// Identifier of this chunk; set once by the constructor.
private int chunkID;
// Text extractor supplied at construction time.
private TextExtractor parent;
// UTF-8 decoder; the constructor configures its handling of malformed and unmappable input.
private final CharsetDecoder charsetDecoder = Charset.forName("utf-8").newDecoder(); // NON-NLS
// Replacement text handed to the decoder via replaceWith() in the constructor.
private final String replacement_string = "?";
// Logger shared by all instances of this class.
private static final Logger logger = Logger.getLogger(AbstractFileChunk.class.getName());
/**
 * Creates a chunk and prepares the decoder used on this chunk's content.
 *
 * @param parent  text extractor recorded as this chunk's parent
 * @param chunkID identifier recorded for this chunk
 */
AbstractFileChunk(TextExtractor parent, int chunkID) {
    this.chunkID = chunkID;
    this.parent = parent;
    // Malformed or unmappable input is replaced with replacement_string
    // instead of being reported as a decoding error. Each configuration
    // call returns the decoder itself, so the calls can be chained.
    charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE)
            .replaceWith(replacement_string);
}
public TextExtractor getParent() {
@ -55,9 +67,16 @@ class AbstractFileChunk {
public boolean index(Ingester ingester, byte[] content, long contentSize, Charset indexCharset) throws IngesterException {
boolean success = true;
ByteContentStream bcs = new ByteContentStream(content, contentSize, parent.getSourceFile(), indexCharset);
// content needs to be sanitized for invalid utf-8 data
byte[] sanitizedContent = {};
try {
ingester.ingest(this, bcs, content.length);
sanitizedContent = this.charsetDecoder.decode(ByteBuffer.wrap(content)).toString().getBytes();
} catch (CharacterCodingException ex) {
throw new IngesterException(NbBundle.getMessage(this.getClass(), "AbstractFileChunk.index.charCodingException.msg", parent.getSourceFile().getName()), ex);
}
ByteContentStream bcs = new ByteContentStream(sanitizedContent, sanitizedContent.length, parent.getSourceFile(), indexCharset);
try {
ingester.ingest(this, bcs, sanitizedContent.length);
//logger.log(Level.INFO, "Ingesting string chunk: " + this.getName() + ": " + chunkID);
} catch (Exception ingEx) {
success = false;

View File

@ -157,6 +157,7 @@ DropdownSearchPanel.selectAllMenuItem.text=Select All
DropdownSearchPanel.pasteMenuItem.text=Paste
DropdownSearchPanel.copyMenuItem.text=Copy
AbstractFileChunk.index.exception.msg=Problem ingesting file string chunk\: {0}, chunk\: {1}
AbstractFileChunk.index.charCodingException.msg=Could not sanitize the content of the file: {0}
AbstractFileStringContentStream.getSize.exception.msg=Cannot tell how many chars in converted string, until entire string is converted
AbstractFileStringContentStream.getSrcInfo.text=File\:{0}
ByteContentStream.getSrcInfo.text=File\:{0}