diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java
index fd6724e7ad..81e200108e 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java
@@ -19,9 +19,15 @@
 
 package org.sleuthkit.autopsy.keywordsearch;
 
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
 import java.nio.charset.Charset;
-
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.util.logging.Level;
 import org.openide.util.NbBundle;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 
 /**
@@ -30,10 +36,16 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 class AbstractFileChunk {
     private int chunkID;
     private TextExtractor parent;
+    private final CharsetDecoder charsetDecoder = Charset.forName("utf-8").newDecoder(); // NON-NLS
+    private static final String REPLACEMENT_STRING = "?"; // substituted for byte sequences that are not valid UTF-8
+    private static final Logger logger = Logger.getLogger(AbstractFileChunk.class.getName());
 
     AbstractFileChunk(TextExtractor parent, int chunkID) {
         this.parent = parent;
         this.chunkID = chunkID;
+        this.charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
+        this.charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+        this.charsetDecoder.replaceWith(REPLACEMENT_STRING); // plain ASCII '?' instead of the decoder's default replacement (normally U+FFFD)
     }
 
     public TextExtractor getParent() {
@@ -55,9 +67,17 @@ class AbstractFileChunk {
 
     public boolean index(Ingester ingester, byte[] content, long contentSize, Charset indexCharset) throws IngesterException {
         boolean success = true;
-        ByteContentStream bcs = new ByteContentStream(content, contentSize, parent.getSourceFile(), indexCharset);
+        // content needs to be sanitized for invalid utf-8 data; only the first contentSize bytes are decoded
+        byte[] sanitizedContent = {};
         try {
-            ingester.ingest(this, bcs, content.length);
+            final int validLength = (int) Math.min(contentSize, content.length);
+            sanitizedContent = this.charsetDecoder.decode(ByteBuffer.wrap(content, 0, validLength)).toString().getBytes(this.charsetDecoder.charset());
+        } catch (CharacterCodingException ex) {
+            throw new IngesterException(NbBundle.getMessage(this.getClass(), "AbstractFileChunk.index.charCodingException.msg", parent.getSourceFile().getName()), ex);
+        }
+        ByteContentStream bcs = new ByteContentStream(sanitizedContent, sanitizedContent.length, parent.getSourceFile(), indexCharset);
+        try {
+            ingester.ingest(this, bcs, sanitizedContent.length);
             //logger.log(Level.INFO, "Ingesting string chunk: " + this.getName() + ": " + chunkID);
         } catch (Exception ingEx) {
             success = false;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties
index 7c1dc087cb..a838f203bc 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties
@@ -157,6 +157,7 @@ DropdownSearchPanel.selectAllMenuItem.text=Select All
 DropdownSearchPanel.pasteMenuItem.text=Paste
 DropdownSearchPanel.copyMenuItem.text=Copy
 AbstractFileChunk.index.exception.msg=Problem ingesting file string chunk\: {0}, chunk\: {1}
+AbstractFileChunk.index.charCodingException.msg=Could not sanitize the content of the file\: {0}
 AbstractFileStringContentStream.getSize.exception.msg=Cannot tell how many chars in converted string, until entire string is converted
 AbstractFileStringContentStream.getSrcInfo.text=File\:{0}
 ByteContentStream.getSrcInfo.text=File\:{0}