mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-14 17:06:16 +00:00
Merge pull request #1463 from sidheshenator/sanitize_utf-8-solr
invalid utf-8 data sanitized before feeding to the Solr Ingester
This commit is contained in:
commit
1eb39955fc
@ -19,9 +19,15 @@
|
||||
|
||||
package org.sleuthkit.autopsy.keywordsearch;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.CharacterCodingException;
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.util.logging.Level;
|
||||
import org.openide.util.NbBundle;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
||||
|
||||
/**
|
||||
@ -30,10 +36,16 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
||||
class AbstractFileChunk {
|
||||
private int chunkID;
|
||||
private TextExtractor parent;
|
||||
private final CharsetDecoder charsetDecoder = Charset.forName("utf-8").newDecoder(); // NON-NLS
|
||||
private final String replacement_string = "?";
|
||||
private static final Logger logger = Logger.getLogger(AbstractFileChunk.class.getName());
|
||||
|
||||
AbstractFileChunk(TextExtractor parent, int chunkID) {
|
||||
this.parent = parent;
|
||||
this.chunkID = chunkID;
|
||||
this.charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
|
||||
this.charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
|
||||
this.charsetDecoder.replaceWith(replacement_string); // malformed or unmappable input is replaced with a plain "?" (not the U+FFFD replacement character)
|
||||
}
|
||||
|
||||
public TextExtractor getParent() {
|
||||
@ -55,9 +67,16 @@ class AbstractFileChunk {
|
||||
|
||||
public boolean index(Ingester ingester, byte[] content, long contentSize, Charset indexCharset) throws IngesterException {
|
||||
boolean success = true;
|
||||
ByteContentStream bcs = new ByteContentStream(content, contentSize, parent.getSourceFile(), indexCharset);
|
||||
// content needs to be sanitized for invalid UTF-8 data
|
||||
byte[] sanitizedContent = {};
|
||||
try {
|
||||
ingester.ingest(this, bcs, content.length);
|
||||
sanitizedContent = this.charsetDecoder.decode(ByteBuffer.wrap(content)).toString().getBytes();
|
||||
} catch (CharacterCodingException ex) {
|
||||
throw new IngesterException(NbBundle.getMessage(this.getClass(), "AbstractFileChunk.index.charCodingException.msg", parent.getSourceFile().getName()), ex);
|
||||
}
|
||||
ByteContentStream bcs = new ByteContentStream(sanitizedContent, sanitizedContent.length, parent.getSourceFile(), indexCharset);
|
||||
try {
|
||||
ingester.ingest(this, bcs, sanitizedContent.length);
|
||||
//logger.log(Level.INFO, "Ingesting string chunk: " + this.getName() + ": " + chunkID);
|
||||
} catch (Exception ingEx) {
|
||||
success = false;
|
||||
|
@ -157,6 +157,7 @@ DropdownSearchPanel.selectAllMenuItem.text=Select All
|
||||
DropdownSearchPanel.pasteMenuItem.text=Paste
|
||||
DropdownSearchPanel.copyMenuItem.text=Copy
|
||||
AbstractFileChunk.index.exception.msg=Problem ingesting file string chunk\: {0}, chunk\: {1}
|
||||
AbstractFileChunk.index.charCodingException.msg=Could not sanitize the content of the file: {0}
|
||||
AbstractFileStringContentStream.getSize.exception.msg=Cannot tell how many chars in converted string, until entire string is converted
|
||||
AbstractFileStringContentStream.getSrcInfo.text=File\:{0}
|
||||
ByteContentStream.getSrcInfo.text=File\:{0}
|
||||
|
Loading…
x
Reference in New Issue
Block a user