Mirror of https://github.com/overcuriousity/autopsy-flatpak.git, synced 2025-07-17 18:17:43 +00:00
Merge pull request #2502 from millmanorama/2259-sanitize-utf16-with-Charset-encoder
2259 sanitize utf16 with charset encoder
This commit is contained in:
commit f9c2bcadc6
Chunker.java

@@ -1,7 +1,7 @@
 /*
  * Autopsy Forensic Browser
  *
- * Copyright 2011-2016 Basis Technology Corp.
+ * Copyright 2011-2017 Basis Technology Corp.
  * Contact: carrier <at> sleuthkit <dot> org
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,6 +21,7 @@ package org.sleuthkit.autopsy.keywordsearch;
 import java.io.IOException;
 import java.io.PushbackReader;
 import java.io.Reader;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
 import java.util.NoSuchElementException;
@@ -38,6 +39,10 @@ import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
 @NotThreadSafe
 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
 
+    //local references to standard encodings
+    private static final Charset UTF_16 = StandardCharsets.UTF_16;
+    private static final Charset UTF_8 = StandardCharsets.UTF_8;
+
     //Chunking algorithm paramaters-------------------------------------//
     /** the maximum size of a chunk, including the window. */
     private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
@@ -84,7 +89,10 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
     }
 
     /**
-     * Has this Chunker encountered an exception reading from the Reader.
+     * Has this Chunker encountered an exception reading from the Reader?
+     *
+     *
+     * @return True if this Chunker encountered an exception.
      */
     boolean hasException() {
         return ex != null;
@@ -124,6 +132,25 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
         return sb;
     }
 
+    /**
+     * Cleanup invalid codepoint sequences by replacing them with the default
+     * replacement character: U+FFFD / �.
+     *
+     * @param s The string to cleanup.
+     *
+     * @return A StringBuilder with the same content as s but where all invalid
+     *         code points have been replaced.
+     */
+    private static StringBuilder replaceInvalidUTF16(String s) {
+        /* encode the string to UTF-16 which does the replcement, see
+         * Charset.encode(), then decode back to a StringBuilder. */
+        return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
+    }
+
+    private static StringBuilder sanitize(String s) {
+        return sanitizeToUTF8(replaceInvalidUTF16(s));
+    }
+
     @Override
     public Chunk next() {
         if (hasNext() == false) {
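
The new replaceInvalidUTF16() method relies on documented behavior of the Charset convenience methods: Charset.encode() and Charset.decode() replace malformed input and unmappable characters instead of throwing, so a UTF-16 round trip substitutes the replacement character U+FFFD for invalid sequences such as unpaired surrogates. A minimal standalone sketch of that round trip (the class and method names here are illustrative, not part of the patch):

    import java.nio.charset.Charset;
    import java.nio.charset.StandardCharsets;

    public class Utf16RoundTripDemo {

        private static final Charset UTF_16 = StandardCharsets.UTF_16;

        /** Round-trip through UTF-16 so malformed sequences become U+FFFD. */
        static String replaceInvalid(String s) {
            // Charset.encode()/decode() default to CodingErrorAction.REPLACE,
            // so an unpaired surrogate is replaced instead of raising an error.
            return UTF_16.decode(UTF_16.encode(s)).toString();
        }

        public static void main(String[] args) {
            String bad = "abc" + '\uD800' + "def";     // lone high surrogate
            String clean = replaceInvalid(bad);
            System.out.println((int) clean.charAt(3)); // 65533, i.e. U+FFFD
        }
    }
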
@@ -157,7 +184,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
         //add the window text to the current chunk.
         currentChunk.append(currentWindow);
         //sanitize the text and return a Chunk object, that includes the base chunk length.
-        return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
+        return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes);
     }
 
     /**
@@ -216,10 +243,11 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                 reader.unread(lastChar);
             }
 
-            String chunkSegment = new String(tempChunkBuf, 0, charsRead);
+            //cleanup any invalid utf-16 sequences
+            StringBuilder chunkSegment = sanitize(new String(tempChunkBuf, 0, charsRead));
 
-            //get the length in bytes of the read chars
-            int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length;
+            //get the length in utf8 bytes of the read chars
+            int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
 
             //if it will not put us past maxBytes
             if (chunkSizeBytes + segmentSize < maxBytes) {
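
A note on the segmentSize bookkeeping above: MAX_TOTAL_CHUNK_SIZE is a limit in UTF-8 bytes on the indexed chunk, while Java strings are measured in UTF-16 chars, so the byte length is now taken from the sanitized text that will actually be indexed. A small standalone illustration (not part of the patch) of how char count and UTF-8 byte count diverge:

    import java.nio.charset.StandardCharsets;

    public class Utf8LengthDemo {
        public static void main(String[] args) {
            // char count (what StringBuilder/String report) vs UTF-8 byte count
            // (what the chunker's maxBytes limit is measured in).
            String[] samples = {
                "a",              // 1 char, 1 UTF-8 byte
                "\u00E9",         // é: 1 char, 2 bytes
                "\uFFFD",         // replacement char: 1 char, 3 bytes
                "\uD83D\uDE00"    // emoji (surrogate pair): 2 chars, 4 bytes
            };
            for (String s : samples) {
                System.out.printf("chars=%d, utf8Bytes=%d%n",
                        s.length(), s.getBytes(StandardCharsets.UTF_8).length);
            }
        }
    }
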
@@ -239,11 +267,11 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
      * Read until the maxBytes reached, whitespace, or end of reader.
      *
      * @param maxBytes
-     * @param currentSegment
+     * @param currentChunk
      *
      * @throws IOException
      */
-    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
+    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException {
         int charsRead = 0;
         boolean whitespaceFound = false;
         //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
@@ -260,26 +288,28 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                 final char ch = tempChunkBuf[0];
                 String chunkSegment;
                 if (Character.isHighSurrogate(ch)) {
+                    //read another char into the buffer.
                     charsRead = reader.read(tempChunkBuf, 1, 1);
                     if (charsRead == -1) {
-                        //this is the last chunk, so include the unpaired surrogate
-                        currentSegment.append(ch);
-                        chunkSizeBytes += new Character(ch).toString().getBytes(StandardCharsets.UTF_8).length;
+                        //this is the last chunk, so just drop the unpaired surrogate
                         endOfReaderReached = true;
                         return;
                     } else {
-                        //use the surrogate pair in place of the unpaired surrogate.
+                        //try to use the pair together.
                         chunkSegment = new String(tempChunkBuf, 0, 2);
                     }
                 } else {
                     //one char
                     chunkSegment = new String(tempChunkBuf, 0, 1);
                 }
+
+                //cleanup any invalid utf-16 sequences
+                StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
                 //check for whitespace.
-                whitespaceFound = Character.isWhitespace(chunkSegment.codePointAt(0));
+                whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
                 //add read chars to the chunk and update the length.
-                currentSegment.append(chunkSegment);
-                chunkSizeBytes += chunkSegment.getBytes(StandardCharsets.UTF_8).length;
+                currentChunk.append(sanitizedChunkSegment);
+                chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
             }
         }
     }
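
For readers unfamiliar with the surrogate handling above: a code point outside the Basic Multilingual Plane is stored in UTF-16 as a high/low surrogate pair of chars, and a single-char read can split that pair. The loop therefore reads a second char whenever it sees a high surrogate, and now simply drops a high surrogate left dangling at end of stream (sanitize() would have turned it into U+FFFD anyway). A simplified sketch of the pairing step, not the actual Chunker code:

    import java.io.IOException;
    import java.io.PushbackReader;
    import java.io.StringReader;

    public class SurrogatePairReadDemo {
        public static void main(String[] args) throws IOException {
            // One emoji (a high/low surrogate pair) followed by more text.
            PushbackReader reader = new PushbackReader(new StringReader("\uD83D\uDE00 x"), 2);
            char[] buf = new char[2];

            int charsRead = reader.read(buf, 0, 1);
            String segment;
            if (charsRead != -1 && Character.isHighSurrogate(buf[0])) {
                // Read one more char so the pair lands in the same segment.
                charsRead = reader.read(buf, 1, 1);
                segment = (charsRead == -1) ? "" : new String(buf, 0, 2); // drop a dangling surrogate
            } else {
                segment = new String(buf, 0, 1);
            }

            // The segment now holds a complete code point, not a lone 0xD83D.
            System.out.println(Integer.toHexString(segment.codePointAt(0))); // 1f600
        }
    }
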
@@ -291,20 +321,41 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
     static class Chunk {
 
         private final StringBuilder sb;
-        private final int chunksize;
+        private final int baseChunkSizeChars;
+        private final int chunkSizeBytes;
 
-        Chunk(StringBuilder sb, int baseChunkLength) {
+        Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) {
             this.sb = sb;
-            this.chunksize = baseChunkLength;
+            this.baseChunkSizeChars = baseChunkSizeChars;
+            this.chunkSizeBytes = chunkSizeBytes;
         }
 
+        /**
+         * Get the content of the chunk.
+         *
+         * @return The content of the chunk.
+         */
         @Override
         public String toString() {
             return sb.toString();
         }
 
+        /**
+         * Get the size in bytes of the utf-8 encoding of the entire chunk.
+         *
+         * @return the size in bytes of the utf-8 encoding of the entire chunk
+         */
+        public int getChunkSizeBytes() {
+            return chunkSizeBytes;
+        }
+
+        /**
+         * Get the length of the base chunk in java chars.
+         *
+         * @return the length of the base chunk in java chars.
+         */
         int getBaseChunkLength() {
-            return chunksize;
+            return baseChunkSizeChars;
         }
     }
 }
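
Taken together, a Chunk now carries its sanitized text, its base length in java chars, and its total size in UTF-8 bytes. A hypothetical caller in the same package might use it as sketched below; the real consumer is Ingester, whose per-chunk indexing code is not part of this diff, so the helper class, the Chunker(Reader) constructor call, and the CHUNK_SIZE remark are assumptions:

    package org.sleuthkit.autopsy.keywordsearch;

    import java.io.Reader;

    class ChunkerUsageSketch {

        // Hypothetical helper; illustrates the new Chunk accessors only.
        static void walkChunks(Reader reader) {
            Chunker chunker = new Chunker(reader);   // assumes the existing Chunker(Reader) constructor
            for (Chunker.Chunk chunk : chunker) {
                String text = chunk.toString();              // sanitized chunk text
                int baseChars = chunk.getBaseChunkLength();  // base chunk length in java chars
                int utf8Bytes = chunk.getChunkSizeBytes();   // whole chunk size in UTF-8 bytes
                // an indexer could store utf8Bytes in a per-chunk CHUNK_SIZE field here
            }
            if (chunker.hasException()) {
                // the underlying Reader failed part-way through the iteration
            }
        }
    }
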
Ingester.java

@@ -1,7 +1,7 @@
 /*
  * Autopsy Forensic Browser
  *
- * Copyright 2011-2016 Basis Technology Corp.
+ * Copyright 2011-2017 Basis Technology Corp.
  * Contact: carrier <at> sleuthkit <dot> org
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -177,7 +177,10 @@ class Ingester {
         } finally {
             //after all chunks, index just the meta data, including the numChunks, of the parent file
             fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
-            fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
+            //reset id field to base document id
+            fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
+            //"parent" docs don't have chunk_size
+            fields.remove(Server.Schema.CHUNK_SIZE.toString());
             indexChunk(null, sourceName, fields);
         }
 
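
The Ingester change addresses a side effect of reusing one mutable fields map for every chunk and then for the parent document: a CHUNK_SIZE value, presumably left over from the last chunk indexed, must be removed before the parent metadata document is sent off. A standalone sketch of that pattern (the string keys and values are stand-ins for the Server.Schema fields, not the real field names or id format):

    import java.util.HashMap;
    import java.util.Map;

    public class ParentDocFieldsDemo {
        public static void main(String[] args) {
            // The same mutable map is reused for each chunk and then for the
            // parent document, so chunk-only fields must be cleared explicitly.
            Map<String, String> fields = new HashMap<>();
            fields.put("id", "42_3");            // id of the last chunk indexed
            fields.put("chunk_size", "32760");   // size of the last chunk, in bytes

            // finally-block equivalent: turn the map back into parent-doc metadata
            fields.put("num_chunks", "3");
            fields.put("id", "42");              // reset id to the base document id
            fields.remove("chunk_size");         // "parent" docs don't have chunk_size

            System.out.println(fields.containsKey("chunk_size")); // false
        }
    }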