From cfc965917ed27dadcfb4af0dff3a29eab17b866d Mon Sep 17 00:00:00 2001 From: sidheshenator Date: Fri, 24 Jul 2015 09:26:18 -0400 Subject: [PATCH 1/3] invalid utf-8 data sanitized before feeding to the Solr Ingester --- .../keywordsearch/AbstractFileChunk.java | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java index fd6724e7ad..7403a59ddf 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java @@ -19,9 +19,14 @@ package org.sleuthkit.autopsy.keywordsearch; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; - -import org.openide.util.NbBundle; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.util.logging.Level; +import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; /** @@ -30,10 +35,16 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; class AbstractFileChunk { private int chunkID; private TextExtractor parent; + private final CharsetDecoder de = Charset.forName("utf-8").newDecoder(); + private final String replacement_string = "?"; + private static final Logger logger = Logger.getLogger(AbstractFileChunk.class.getName()); AbstractFileChunk(TextExtractor parent, int chunkID) { this.parent = parent; this.chunkID = chunkID; + this.de.onMalformedInput(CodingErrorAction.REPLACE); + this.de.onUnmappableCharacter(CodingErrorAction.REPLACE); + this.de.replaceWith(replacement_string); // white questionmark in black diamond - Replacement Character U+FFFD } public TextExtractor getParent() { @@ -55,14 +66,22 @@ class AbstractFileChunk { public boolean index(Ingester ingester, byte[] content, long contentSize, Charset indexCharset) throws IngesterException { boolean success = true; - ByteContentStream bcs = new ByteContentStream(content, contentSize, parent.getSourceFile(), indexCharset); + // content need to to sanitized for invalid utf-8 data + CharBuffer decodedCB; + byte[] decodedContent = {}; try { - ingester.ingest(this, bcs, content.length); + decodedCB = this.de.decode(ByteBuffer.wrap(content)); + decodedContent = decodedCB.toString().getBytes(); + } catch (CharacterCodingException ex) { + logger.log(Level.WARNING, "Error encoding the content: " + ByteBuffer.wrap(content).toString(), ex); + } + ByteContentStream bcs = new ByteContentStream(decodedContent, contentSize, parent.getSourceFile(), indexCharset); + try { + ingester.ingest(this, bcs, decodedContent.length); //logger.log(Level.INFO, "Ingesting string chunk: " + this.getName() + ": " + chunkID); } catch (Exception ingEx) { success = false; - throw new IngesterException(NbBundle.getMessage(this.getClass(), "AbstractFileChunk.index.exception.msg", - parent.getSourceFile().getId(), chunkID), ingEx); + throw ingEx; } return success; } From 9a9255b2a870f4cfe5e5dcde6c2555ba7ed13259 Mon Sep 17 00:00:00 2001 From: sidheshenator Date: Mon, 27 Jul 2015 14:32:08 -0400 Subject: [PATCH 2/3] AbstractFileChunk sanitization code refactored --- .../keywordsearch/AbstractFileChunk.java | 25 ++++++++++--------- .../autopsy/keywordsearch/Bundle.properties | 1 + 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java index 7403a59ddf..f86cfb519e 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java @@ -26,6 +26,7 @@ import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; import java.util.logging.Level; +import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; @@ -35,16 +36,16 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; class AbstractFileChunk { private int chunkID; private TextExtractor parent; - private final CharsetDecoder de = Charset.forName("utf-8").newDecoder(); + private final CharsetDecoder charsetDecoder = Charset.forName("utf-8").newDecoder(); // NON-NLS private final String replacement_string = "?"; private static final Logger logger = Logger.getLogger(AbstractFileChunk.class.getName()); AbstractFileChunk(TextExtractor parent, int chunkID) { this.parent = parent; this.chunkID = chunkID; - this.de.onMalformedInput(CodingErrorAction.REPLACE); - this.de.onUnmappableCharacter(CodingErrorAction.REPLACE); - this.de.replaceWith(replacement_string); // white questionmark in black diamond - Replacement Character U+FFFD + this.charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE); + this.charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE); + this.charsetDecoder.replaceWith(replacement_string); // white questionmark in black diamond - Replacement Character U+FFFD } public TextExtractor getParent() { @@ -67,21 +68,21 @@ class AbstractFileChunk { public boolean index(Ingester ingester, byte[] content, long contentSize, Charset indexCharset) throws IngesterException { boolean success = true; // content need to to sanitized for invalid utf-8 data - CharBuffer decodedCB; - byte[] decodedContent = {}; + byte[] sanitizedContent = {}; try { - decodedCB = this.de.decode(ByteBuffer.wrap(content)); - decodedContent = decodedCB.toString().getBytes(); + sanitizedContent = this.charsetDecoder.decode(ByteBuffer.wrap(content)).toString().getBytes(); } catch (CharacterCodingException ex) { - logger.log(Level.WARNING, "Error encoding the content: " + ByteBuffer.wrap(content).toString(), ex); + logger.log(Level.WARNING, "Error sanitizing the " + ByteBuffer.wrap(content).toString(), ex); + throw new IngesterException(NbBundle.getMessage(this.getClass(), "AbstractFileChunk.index.charCodingException.msg", parent.getSourceFile().getName()), ex); } - ByteContentStream bcs = new ByteContentStream(decodedContent, contentSize, parent.getSourceFile(), indexCharset); + ByteContentStream bcs = new ByteContentStream(sanitizedContent, sanitizedContent.length, parent.getSourceFile(), indexCharset); try { - ingester.ingest(this, bcs, decodedContent.length); + ingester.ingest(this, bcs, sanitizedContent.length); //logger.log(Level.INFO, "Ingesting string chunk: " + this.getName() + ": " + chunkID); } catch (Exception ingEx) { success = false; - throw ingEx; + throw new IngesterException(NbBundle.getMessage(this.getClass(), "AbstractFileChunk.index.exception.msg", + parent.getSourceFile().getId(), chunkID), ingEx); } return success; } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties index 70a2e74a7c..f1d1db81e4 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties @@ -156,6 +156,7 @@ DropdownSearchPanel.selectAllMenuItem.text=Select All DropdownSearchPanel.pasteMenuItem.text=Paste DropdownSearchPanel.copyMenuItem.text=Copy AbstractFileChunk.index.exception.msg=Problem ingesting file string chunk\: {0}, chunk\: {1} +AbstractFileChunk.index.charCodingException.msg=Could not sanitize the content of the file: {0} AbstractFileStringContentStream.getSize.exception.msg=Cannot tell how many chars in converted string, until entire string is converted AbstractFileStringContentStream.getSrcInfo.text=File\:{0} ByteContentStream.getSrcInfo.text=File\:{0} From d75ad1b21f92297345c80d80c8f60c36d5f8287f Mon Sep 17 00:00:00 2001 From: Sidhesh Mhatre Date: Thu, 30 Jul 2015 10:48:55 -0400 Subject: [PATCH 3/3] Avoid duplicate logging of propagated exception. --- .../org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java | 1 - 1 file changed, 1 deletion(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java index f86cfb519e..81e200108e 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java @@ -72,7 +72,6 @@ class AbstractFileChunk { try { sanitizedContent = this.charsetDecoder.decode(ByteBuffer.wrap(content)).toString().getBytes(); } catch (CharacterCodingException ex) { - logger.log(Level.WARNING, "Error sanitizing the " + ByteBuffer.wrap(content).toString(), ex); throw new IngesterException(NbBundle.getMessage(this.getClass(), "AbstractFileChunk.index.charCodingException.msg", parent.getSourceFile().getName()), ex); } ByteContentStream bcs = new ByteContentStream(sanitizedContent, sanitizedContent.length, parent.getSourceFile(), indexCharset);