mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-16 17:57:43 +00:00
Merge branch 'develop' of https://github.com/sleuthkit/autopsy into 2197-ProfileOptionsPanel
This commit is contained in:
commit
c1888c88c3
@ -39,6 +39,7 @@ import org.sleuthkit.datamodel.TskCoreException;
|
|||||||
* artifact's attributes.
|
* artifact's attributes.
|
||||||
*/
|
*/
|
||||||
class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
|
class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
|
||||||
|
|
||||||
static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
|
static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -91,17 +92,22 @@ class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
|
|||||||
logger.log(Level.WARNING, msg, ex); //NON-NLS }
|
logger.log(Level.WARNING, msg, ex); //NON-NLS }
|
||||||
}
|
}
|
||||||
|
|
||||||
private InputStream getInputStream(BlackboardArtifact artifact) {
|
private InputStream getInputStream(BlackboardArtifact artifact) throws TextExtractorException {
|
||||||
// Concatenate the string values of all attributes into a single
|
// Concatenate the string values of all attributes into a single
|
||||||
// "content" string to be indexed.
|
// "content" string to be indexed.
|
||||||
StringBuilder artifactContents = new StringBuilder();
|
StringBuilder artifactContents = new StringBuilder();
|
||||||
|
|
||||||
|
Content dataSource = null;
|
||||||
try {
|
try {
|
||||||
Content dataSource = getDataSource(artifact);
|
dataSource = getDataSource(artifact);
|
||||||
|
} catch (TskCoreException tskCoreException) {
|
||||||
|
throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException);
|
||||||
|
}
|
||||||
if (dataSource == null) {
|
if (dataSource == null) {
|
||||||
return null;
|
throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
for (BlackboardAttribute attribute : artifact.getAttributes()) {
|
for (BlackboardAttribute attribute : artifact.getAttributes()) {
|
||||||
artifactContents.append(attribute.getAttributeType().getDisplayName());
|
artifactContents.append(attribute.getAttributeType().getDisplayName());
|
||||||
artifactContents.append(" : ");
|
artifactContents.append(" : ");
|
||||||
@ -119,18 +125,15 @@ class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
|
|||||||
}
|
}
|
||||||
artifactContents.append(System.lineSeparator());
|
artifactContents.append(System.lineSeparator());
|
||||||
}
|
}
|
||||||
} catch (TskCoreException ex) {
|
} catch (TskCoreException tskCoreException) {
|
||||||
logger.log(Level.SEVERE, "There was a problem getting the atributes for artifact " + artifact.getArtifactID(), ex);
|
throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException);
|
||||||
return null;
|
|
||||||
}
|
|
||||||
if (artifactContents.length() == 0) {
|
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
|
return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Reader getReader(BlackboardArtifact source) throws Ingester.IngesterException {
|
public Reader getReader(BlackboardArtifact source) throws TextExtractorException {
|
||||||
return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
|
return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -0,0 +1,310 @@
|
|||||||
|
/*
|
||||||
|
* Autopsy Forensic Browser
|
||||||
|
*
|
||||||
|
* Copyright 2011-2016 Basis Technology Corp.
|
||||||
|
* Contact: carrier <at> sleuthkit <dot> org
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.sleuthkit.autopsy.keywordsearch;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.PushbackReader;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.NoSuchElementException;
|
||||||
|
import javax.annotation.concurrent.NotThreadSafe;
|
||||||
|
import org.sleuthkit.autopsy.coreutils.TextUtil;
|
||||||
|
import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Encapsulates the content chunking algorithm in an implementation of the
|
||||||
|
* Iterator interface. Also implements Iterable so it can be used directly in a
|
||||||
|
* for loop. The base chunk is the part of the chunk before the overlapping
|
||||||
|
* window. The window will be included at the end of the current chunk as well
|
||||||
|
* as at the beginning of the next chunk.
|
||||||
|
*/
|
||||||
|
@NotThreadSafe
|
||||||
|
class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
||||||
|
|
||||||
|
//Chunking algorithm paramaters-------------------------------------//
|
||||||
|
/** the maximum size of a chunk, including the window. */
|
||||||
|
private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes
|
||||||
|
/** the minimum to read before we start the process of looking for
|
||||||
|
* whitespace to break at and creating an overlapping window. */
|
||||||
|
private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
|
||||||
|
/** The maximum size of the chunk, before the overlapping window, even if we
|
||||||
|
* couldn't find whitespace to break at. */
|
||||||
|
private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
|
||||||
|
/** The amount of text we will read through before we give up on finding
|
||||||
|
* whitespace to break the chunk/window at. */
|
||||||
|
private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
|
||||||
|
/** The number of characters to read in one go from the Reader. */
|
||||||
|
private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
|
||||||
|
|
||||||
|
////chunker state--------------------------------------------///
|
||||||
|
/** The Reader that this chunk reads from, and divides into chunks. It must
|
||||||
|
* be a buffered reader to ensure that mark/reset are supported. */
|
||||||
|
private final PushbackReader reader;
|
||||||
|
/** The local buffer of characters read from the Reader. */
|
||||||
|
private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
|
||||||
|
|
||||||
|
/** the size in bytes of the chunk (so far). */
|
||||||
|
private int chunkSizeBytes = 0;
|
||||||
|
/** Has the chunker reached the end of the Reader? If so, there are no more
|
||||||
|
* chunks, and the current chunk does not need a window. */
|
||||||
|
private boolean endOfReaderReached = false;
|
||||||
|
/** Store any exception encountered reading from the Reader. */
|
||||||
|
private Exception ex;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a Chunker that will chunk the content of the given Reader.
|
||||||
|
*
|
||||||
|
* @param reader The content to chunk.
|
||||||
|
*/
|
||||||
|
Chunker(Reader reader) {
|
||||||
|
//Using MAX_TOTAL_CHUNK_SIZE is safe but probably overkill.
|
||||||
|
this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Iterator<Chunk> iterator() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Has this Chunker encountered an exception reading from the Reader.
|
||||||
|
*/
|
||||||
|
boolean hasException() {
|
||||||
|
return ex != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the exception encountered reading from the Reader.
|
||||||
|
*
|
||||||
|
* @return The exception, or null if no exception was encountered.
|
||||||
|
*/
|
||||||
|
public Exception getException() {
|
||||||
|
return ex;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasNext() {
|
||||||
|
return (ex == null)
|
||||||
|
&& (endOfReaderReached == false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sanitize the given StringBuilder by replacing non-UTF-8 characters with
|
||||||
|
* caret '^'
|
||||||
|
*
|
||||||
|
* @param sb the StringBuilder to sanitize
|
||||||
|
*
|
||||||
|
* //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
|
||||||
|
* function?
|
||||||
|
*/
|
||||||
|
private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
|
||||||
|
final int length = sb.length();
|
||||||
|
for (int i = 0; i < length; i++) {
|
||||||
|
if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
|
||||||
|
sb.replace(i, i + 1, "^");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sb;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Chunk next() {
|
||||||
|
if (hasNext() == false) {
|
||||||
|
throw new NoSuchElementException("There are no more chunks.");
|
||||||
|
}
|
||||||
|
//reset state for the next chunk
|
||||||
|
|
||||||
|
chunkSizeBytes = 0;
|
||||||
|
int baseChunkSizeChars = 0;
|
||||||
|
StringBuilder currentChunk = new StringBuilder();
|
||||||
|
StringBuilder currentWindow = new StringBuilder();
|
||||||
|
|
||||||
|
try {
|
||||||
|
currentChunk.append(readBaseChunk());
|
||||||
|
baseChunkSizeChars = currentChunk.length(); //save the base chunk length
|
||||||
|
currentWindow.append(readWindow());
|
||||||
|
if (endOfReaderReached) {
|
||||||
|
/* if we have reached the end of the content,we won't make
|
||||||
|
* another overlapping chunk, so the length of the base chunk
|
||||||
|
* can be extended to the end. */
|
||||||
|
baseChunkSizeChars = currentChunk.length();
|
||||||
|
} else {
|
||||||
|
/* otherwise we will make another chunk, so unread the window */
|
||||||
|
reader.unread(currentWindow.toString().toCharArray());
|
||||||
|
}
|
||||||
|
} catch (Exception ioEx) {
|
||||||
|
/* Save the exception, which will cause hasNext() to return false,
|
||||||
|
* and break any chunking loop in client code. */
|
||||||
|
ex = ioEx;
|
||||||
|
}
|
||||||
|
//add the window text to the current chunk.
|
||||||
|
currentChunk.append(currentWindow);
|
||||||
|
//sanitize the text and return a Chunk object, that includes the base chunk length.
|
||||||
|
return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the base chunk from the reader, attempting to break at whitespace.
|
||||||
|
*
|
||||||
|
* @throws IOException if there is a problem reading from the reader.
|
||||||
|
*/
|
||||||
|
private StringBuilder readBaseChunk() throws IOException {
|
||||||
|
StringBuilder currentChunk = new StringBuilder();
|
||||||
|
//read the chunk until the minimum base chunk size
|
||||||
|
readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk);
|
||||||
|
|
||||||
|
//keep reading until the maximum base chunk size or white space is reached.
|
||||||
|
readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk);
|
||||||
|
return currentChunk;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the window from the reader, attempting to break at whitespace.
|
||||||
|
*
|
||||||
|
* @throws IOException if there is a problem reading from the reader.
|
||||||
|
*/
|
||||||
|
private StringBuilder readWindow() throws IOException {
|
||||||
|
StringBuilder currentWindow = new StringBuilder();
|
||||||
|
//read the window, leaving some room to look for white space to break at.
|
||||||
|
readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow);
|
||||||
|
|
||||||
|
//keep reading until the max chunk size, or until whitespace is reached.
|
||||||
|
readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow);
|
||||||
|
return currentWindow;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read until the maxBytes reached, or end of reader.
|
||||||
|
*
|
||||||
|
* @param maxBytes
|
||||||
|
* @param currentSegment
|
||||||
|
*
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
|
||||||
|
int charsRead = 0;
|
||||||
|
//read chars up to maxBytes, or the end of the reader.
|
||||||
|
while ((chunkSizeBytes < maxBytes)
|
||||||
|
&& (endOfReaderReached == false)) {
|
||||||
|
charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
|
||||||
|
if (-1 == charsRead) {
|
||||||
|
//this is the last chunk
|
||||||
|
endOfReaderReached = true;
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
//if the last char might be part of a surroate pair, unread it.
|
||||||
|
final char lastChar = tempChunkBuf[charsRead - 1];
|
||||||
|
if (Character.isHighSurrogate(lastChar)) {
|
||||||
|
charsRead--;
|
||||||
|
reader.unread(lastChar);
|
||||||
|
}
|
||||||
|
|
||||||
|
String chunkSegment = new String(tempChunkBuf, 0, charsRead);
|
||||||
|
|
||||||
|
//get the length in bytes of the read chars
|
||||||
|
int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length;
|
||||||
|
|
||||||
|
//if it will not put us past maxBytes
|
||||||
|
if (chunkSizeBytes + segmentSize < maxBytes) {
|
||||||
|
//add it to the chunk
|
||||||
|
currentSegment.append(chunkSegment);
|
||||||
|
chunkSizeBytes += segmentSize;
|
||||||
|
} else {
|
||||||
|
//unread it, and break out of read loop.
|
||||||
|
reader.unread(tempChunkBuf, 0, charsRead);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read until the maxBytes reached, whitespace, or end of reader.
|
||||||
|
*
|
||||||
|
* @param maxBytes
|
||||||
|
* @param currentSegment
|
||||||
|
*
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
|
||||||
|
int charsRead = 0;
|
||||||
|
boolean whitespaceFound = false;
|
||||||
|
//read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
|
||||||
|
while ((chunkSizeBytes < maxBytes)
|
||||||
|
&& (whitespaceFound == false)
|
||||||
|
&& (endOfReaderReached == false)) {
|
||||||
|
charsRead = reader.read(tempChunkBuf, 0, 1);
|
||||||
|
if (-1 == charsRead) {
|
||||||
|
//this is the last chunk
|
||||||
|
endOfReaderReached = true;
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
//if the last charcter might be part of a surroate pair, read another char
|
||||||
|
final char ch = tempChunkBuf[0];
|
||||||
|
String chunkSegment;
|
||||||
|
if (Character.isHighSurrogate(ch)) {
|
||||||
|
charsRead = reader.read(tempChunkBuf, 1, 1);
|
||||||
|
if (charsRead == -1) {
|
||||||
|
//this is the last chunk, so include the unpaired surrogate
|
||||||
|
currentSegment.append(ch);
|
||||||
|
chunkSizeBytes += new Character(ch).toString().getBytes(StandardCharsets.UTF_8).length;
|
||||||
|
endOfReaderReached = true;
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
//use the surrogate pair in place of the unpaired surrogate.
|
||||||
|
chunkSegment = new String(tempChunkBuf, 0, 2);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
//one char
|
||||||
|
chunkSegment = new String(tempChunkBuf, 0, 1);
|
||||||
|
}
|
||||||
|
//check for whitespace.
|
||||||
|
whitespaceFound = Character.isWhitespace(chunkSegment.codePointAt(0));
|
||||||
|
//add read chars to the chunk and update the length.
|
||||||
|
currentSegment.append(chunkSegment);
|
||||||
|
chunkSizeBytes += chunkSegment.getBytes(StandardCharsets.UTF_8).length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents one chunk as the text in it and the length of the base chunk,
|
||||||
|
* in chars.
|
||||||
|
*/
|
||||||
|
static class Chunk {
|
||||||
|
|
||||||
|
private final StringBuilder sb;
|
||||||
|
private final int chunksize;
|
||||||
|
|
||||||
|
Chunk(StringBuilder sb, int baseChunkLength) {
|
||||||
|
this.sb = sb;
|
||||||
|
this.chunksize = baseChunkLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
int getBaseChunkLength() {
|
||||||
|
return chunksize;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -95,7 +95,7 @@ abstract class FileTextExtractor implements TextExtractor< AbstractFile> {
|
|||||||
abstract boolean isSupported(AbstractFile file, String detectedFormat);
|
abstract boolean isSupported(AbstractFile file, String detectedFormat);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public abstract Reader getReader(AbstractFile source) throws Ingester.IngesterException;
|
public abstract Reader getReader(AbstractFile source) throws TextExtractorException;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long getID(AbstractFile source) {
|
public long getID(AbstractFile source) {
|
||||||
|
@ -65,7 +65,7 @@ class HtmlTextExtractor extends FileTextExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Reader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
|
public Reader getReader(AbstractFile sourceFile) throws TextExtractorException {
|
||||||
ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
|
ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
|
||||||
|
|
||||||
//Parse the stream with Jericho and put the results in a Reader
|
//Parse the stream with Jericho and put the results in a Reader
|
||||||
@ -159,7 +159,7 @@ class HtmlTextExtractor extends FileTextExtractor {
|
|||||||
// All done, now make it a reader
|
// All done, now make it a reader
|
||||||
return new StringReader(stringBuilder.toString());
|
return new StringReader(stringBuilder.toString());
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
throw new Ingester.IngesterException("Error extracting HTML from content.", ex);
|
throw new TextExtractorException("Error extracting HTML from content.", ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,22 +18,17 @@
|
|||||||
*/
|
*/
|
||||||
package org.sleuthkit.autopsy.keywordsearch;
|
package org.sleuthkit.autopsy.keywordsearch;
|
||||||
|
|
||||||
import com.google.common.base.Utf8;
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.NoSuchElementException;
|
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
import javax.annotation.concurrent.NotThreadSafe;
|
|
||||||
import org.apache.solr.client.solrj.SolrServerException;
|
import org.apache.solr.client.solrj.SolrServerException;
|
||||||
import org.apache.solr.common.SolrInputDocument;
|
import org.apache.solr.common.SolrInputDocument;
|
||||||
import org.openide.util.NbBundle;
|
import org.openide.util.NbBundle;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
import org.sleuthkit.autopsy.coreutils.TextUtil;
|
|
||||||
import org.sleuthkit.autopsy.datamodel.ContentUtils;
|
import org.sleuthkit.autopsy.datamodel.ContentUtils;
|
||||||
import org.sleuthkit.autopsy.ingest.IngestJobContext;
|
import org.sleuthkit.autopsy.ingest.IngestJobContext;
|
||||||
|
import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||||
import org.sleuthkit.datamodel.DerivedFile;
|
import org.sleuthkit.datamodel.DerivedFile;
|
||||||
@ -149,8 +144,8 @@ class Ingester {
|
|||||||
int numChunks = 0; //unknown until chunking is done
|
int numChunks = 0; //unknown until chunking is done
|
||||||
|
|
||||||
if (extractor.isDisabled()) {
|
if (extractor.isDisabled()) {
|
||||||
/* some Extrctors, notable the strings extractor, have options which
|
/* some Extractors, notably the strings extractor, have options
|
||||||
* can be configured such that no extraction should be done */
|
* which can be configured such that no extraction should be done */
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -171,13 +166,12 @@ class Ingester {
|
|||||||
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
|
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
|
||||||
|
|
||||||
throw ingEx; //need to rethrow to signal error and move on
|
throw ingEx; //need to rethrow to signal error and move on
|
||||||
} catch (Exception ex) {
|
|
||||||
throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (IOException ex) {
|
if (chunker.hasException()) {
|
||||||
extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
extractor.logWarning("Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
||||||
return false;
|
return false;
|
||||||
@ -192,7 +186,7 @@ class Ingester {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add one chunk as to the Solr index as a seperate sold document.
|
* Add one chunk to the Solr index as a separate Solr document.
|
||||||
*
|
*
|
||||||
* TODO see if can use a byte or string streaming way to add content to
|
* TODO see if can use a byte or string streaming way to add content to
|
||||||
* /update handler e.g. with XMLUpdateRequestHandler (deprecated in SOlr
|
* /update handler e.g. with XMLUpdateRequestHandler (deprecated in SOlr
|
||||||
@ -232,7 +226,7 @@ class Ingester {
|
|||||||
uncommitedIngests = true;
|
uncommitedIngests = true;
|
||||||
|
|
||||||
} catch (KeywordSearchModuleException ex) {
|
} catch (KeywordSearchModuleException ex) {
|
||||||
//JMTODO: does this need to ne internationalized?
|
//JMTODO: does this need to be internationalized?
|
||||||
throw new IngesterException(
|
throw new IngesterException(
|
||||||
NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
|
NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
|
||||||
}
|
}
|
||||||
@ -370,211 +364,3 @@ class Ingester {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Encapsulates the content chunking algorithm in an implementation of the
|
|
||||||
* Iterator interface. Also implements Iterable so it can be used directly in a
|
|
||||||
* for loop. The base chunk is the part of the chunk before the overlapping
|
|
||||||
* window. The window will be included at the end of the current chunk as well
|
|
||||||
* as at the beginning of the next chunk.
|
|
||||||
*/
|
|
||||||
@NotThreadSafe
|
|
||||||
class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
|
||||||
|
|
||||||
//Chunking algorithm paramaters-------------------------------------//
|
|
||||||
/** the maximum size of a chunk, including the window. */
|
|
||||||
private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes
|
|
||||||
/** the minimum to read before we start the process of looking for
|
|
||||||
* whitespace to break at and creating an overlapping window. */
|
|
||||||
private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
|
|
||||||
/** The maximum size of the chunk, before the overlapping window, even if we
|
|
||||||
* couldn't find whitespace to break at. */
|
|
||||||
private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
|
|
||||||
/** The amount of text we will read through before we give up on finding
|
|
||||||
* whitespace to break the chunk/window at. */
|
|
||||||
private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
|
|
||||||
/** The number of characters to read in one go from the Reader. */
|
|
||||||
private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
|
|
||||||
|
|
||||||
////chunker state--------------------------------------------///
|
|
||||||
/** The Reader that this chunk reads from, and divides into chunks. It must
|
|
||||||
* be a buffered reader to ensure that mark/reset are supported. */
|
|
||||||
private final BufferedReader reader;
|
|
||||||
/** The local buffer of characters read from the Reader. */
|
|
||||||
private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
|
|
||||||
/** number of chars read in the most recent read operation. */
|
|
||||||
private int charsRead = 0;
|
|
||||||
|
|
||||||
/** The text of the current chunk (so far). */
|
|
||||||
private StringBuilder currentChunk;
|
|
||||||
/** the size in bytes of the chunk (so far). */
|
|
||||||
private int chunkSizeBytes = 0;
|
|
||||||
/** the size in chars of the (base) chunk (so far). */
|
|
||||||
private int baseChunkSizeChars;
|
|
||||||
|
|
||||||
/** has the chunker found whitespace to break on? */
|
|
||||||
private boolean whitespaceFound = false;
|
|
||||||
/** has the chunker reached the end of the Reader? If so, there are no more
|
|
||||||
* chunks, and the current chunk does not need a window. */
|
|
||||||
private boolean endOfReaderReached = false;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Create a Chunker that will chunk the content of the given Reader.
|
|
||||||
*
|
|
||||||
* @param reader The content to chunk.
|
|
||||||
*/
|
|
||||||
Chunker(BufferedReader reader) {
|
|
||||||
this.reader = reader;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Iterator<Chunk> iterator() {
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean hasNext() {
|
|
||||||
return endOfReaderReached == false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sanitize the given StringBuilder by replacing non-UTF-8 characters with
|
|
||||||
* caret '^'
|
|
||||||
*
|
|
||||||
* @param sb the StringBuilder to sanitize
|
|
||||||
*
|
|
||||||
* //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
|
|
||||||
* function?
|
|
||||||
*/
|
|
||||||
private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
|
|
||||||
final int length = sb.length();
|
|
||||||
for (int i = 0; i < length; i++) {
|
|
||||||
if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
|
|
||||||
sb.replace(i, i + 1, "^");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return sb;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Chunk next() {
|
|
||||||
if (endOfReaderReached) {
|
|
||||||
throw new NoSuchElementException("There are no more chunks.");
|
|
||||||
}
|
|
||||||
//reset state for the next chunk
|
|
||||||
currentChunk = new StringBuilder();
|
|
||||||
chunkSizeBytes = 0;
|
|
||||||
baseChunkSizeChars = 0;
|
|
||||||
|
|
||||||
try {
|
|
||||||
readBaseChunk();
|
|
||||||
baseChunkSizeChars = currentChunk.length();
|
|
||||||
reader.mark(2048); //mark the reader so we can rewind the reader here to begin the next chunk
|
|
||||||
readWindow();
|
|
||||||
} catch (IOException ioEx) {
|
|
||||||
throw new RuntimeException("IOException while reading chunk.", ioEx);
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
reader.reset(); //reset the reader the so the next chunk can begin at the position marked above
|
|
||||||
} catch (IOException ex) {
|
|
||||||
throw new RuntimeException("IOException while resetting chunk reader.", ex);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (endOfReaderReached) {
|
|
||||||
/* if we have reached the end of the content,we won't make another
|
|
||||||
* overlapping chunk, so the base chunk can be extended to the end. */
|
|
||||||
baseChunkSizeChars = currentChunk.length();
|
|
||||||
}
|
|
||||||
//sanitize the text and return a Chunk object, that includes the base chunk length.
|
|
||||||
return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Read the base chunk from the reader, and attempt to break at whitespace.
|
|
||||||
*
|
|
||||||
* @throws IOException if there is a problem reading from the reader.
|
|
||||||
*/
|
|
||||||
private void readBaseChunk() throws IOException {
|
|
||||||
//read the chunk until the minimum base chunk size
|
|
||||||
readHelper(MINIMUM_BASE_CHUNK_SIZE, false);
|
|
||||||
//keep reading until the maximum base chunk size or white space is reached.
|
|
||||||
whitespaceFound = false;
|
|
||||||
readHelper(MAXIMUM_BASE_CHUNK_SIZE, true);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Read the window from the reader, and attempt to break at whitespace.
|
|
||||||
*
|
|
||||||
* @throws IOException if there is a problem reading from the reader.
|
|
||||||
*/
|
|
||||||
private void readWindow() throws IOException {
|
|
||||||
//read the window, leaving some room to look for white space to break at.
|
|
||||||
int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, chunkSizeBytes + 1024);
|
|
||||||
readHelper(windowEnd, false);
|
|
||||||
whitespaceFound = false;
|
|
||||||
//keep reading until the max chunk size, or until whitespace is reached.
|
|
||||||
windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE, chunkSizeBytes + 1024);
|
|
||||||
readHelper(windowEnd, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Helper method that implements reading in a loop.
|
|
||||||
*
|
|
||||||
* @param maxBytes The max cummulative length of the content,in
|
|
||||||
* bytes, to read from the Reader. That is, when
|
|
||||||
* chunkSizeBytes >= maxBytes stop reading.
|
|
||||||
* @param inWhiteSpaceBuffer Should the current read stop once whitespace is
|
|
||||||
* found?
|
|
||||||
*
|
|
||||||
* @throws IOException If there is a problem reading from the Reader.
|
|
||||||
*/
|
|
||||||
private void readHelper(int maxBytes, boolean inWhiteSpaceBuffer) throws IOException {
|
|
||||||
//only read one character at a time if we are looking for whitespace.
|
|
||||||
final int readSize = inWhiteSpaceBuffer ? 1 : READ_CHARS_BUFFER_SIZE;
|
|
||||||
|
|
||||||
//read chars up to maxBytes, whitespaceFound if also inWhiteSpaceBuffer, or we reach the end of the reader.
|
|
||||||
while ((chunkSizeBytes < maxBytes)
|
|
||||||
&& (false == (inWhiteSpaceBuffer && whitespaceFound))
|
|
||||||
&& (endOfReaderReached == false)) {
|
|
||||||
charsRead = reader.read(tempChunkBuf, 0, readSize);
|
|
||||||
if (-1 == charsRead) {
|
|
||||||
//this is the last chunk
|
|
||||||
endOfReaderReached = true;
|
|
||||||
} else {
|
|
||||||
if (inWhiteSpaceBuffer) {
|
|
||||||
//chec for whitespace.
|
|
||||||
whitespaceFound = Character.isWhitespace(tempChunkBuf[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
//add read chars to the chunk and update the length.
|
|
||||||
String chunkSegment = new String(tempChunkBuf, 0, charsRead);
|
|
||||||
chunkSizeBytes += Utf8.encodedLength(chunkSegment);
|
|
||||||
currentChunk.append(chunkSegment);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * A single chunk of text, paired with the length of its base chunk, in
 * chars.
 *
 * The StringBuilder handed to the constructor is kept by reference, not
 * copied, so toString() reflects its contents at the time of the call.
 */
class Chunk {

    /** Backing text of this chunk. */
    private final StringBuilder text;
    /** Length of the base chunk, as supplied at construction. */
    private final int baseChunkLength;

    /**
     * Creates a chunk over the given text.
     *
     * @param sb              The builder holding the text of the chunk.
     * @param baseChunkLength The length of the base chunk.
     */
    Chunk(StringBuilder sb, int baseChunkLength) {
        this.baseChunkLength = baseChunkLength;
        this.text = sb;
    }

    /**
     * Gets the length of the base chunk.
     *
     * @return The base chunk length given to the constructor.
     */
    int getBaseChunkLength() {
        return baseChunkLength;
    }

    /**
     * Gets the text of this chunk.
     *
     * @return The current contents of the backing builder as a String.
     */
    @Override
    public String toString() {
        return text.toString();
    }
}
|
|
||||||
|
@ -107,7 +107,7 @@ class StringsTextExtractor extends FileTextExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public InputStreamReader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
|
public InputStreamReader getReader(AbstractFile sourceFile) throws TextExtractorException {
|
||||||
InputStream stringStream = getInputStream(sourceFile);
|
InputStream stringStream = getInputStream(sourceFile);
|
||||||
return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
|
return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
|
||||||
}
|
}
|
||||||
|
@ -30,7 +30,6 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
|
|||||||
*/
|
*/
|
||||||
interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
|
interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Is this extractor configured such that no extraction will/should be done?
|
* Is this extractor configured such that no extraction will/should be done?
|
||||||
*
|
*
|
||||||
@ -46,7 +45,6 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
|
|||||||
*/
|
*/
|
||||||
abstract void logWarning(String msg, Exception ex);
|
abstract void logWarning(String msg, Exception ex);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get a reader over the text extracted from the given source.
|
* Get a reader over the text extracted from the given source.
|
||||||
*
|
*
|
||||||
@ -57,7 +55,7 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
|
|||||||
*
|
*
|
||||||
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
|
* @throws TextExtractorException If a reader over the text of the source could not be obtained.
|
||||||
*/
|
*/
|
||||||
abstract Reader getReader(TextSource source) throws Ingester.IngesterException;
|
abstract Reader getReader(TextSource source) throws TextExtractorException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the 'object' id of the given source.
|
* Get the 'object' id of the given source.
|
||||||
@ -76,4 +74,15 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
|
|||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
abstract String getName(TextSource source);
|
abstract String getName(TextSource source);
|
||||||
|
|
||||||
|
/**
 * Exception thrown by a text extractor when it cannot provide a reader
 * over the text of a source.
 */
class TextExtractorException extends Exception {

    // Exception is Serializable; pin the version id explicitly.
    private static final long serialVersionUID = 1L;

    /**
     * Constructs an exception with a message and no cause.
     *
     * @param message Description of what went wrong.
     */
    public TextExtractorException(String message) {
        super(message);
    }

    /**
     * Constructs an exception with a message and the underlying cause.
     *
     * @param message Description of what went wrong.
     * @param cause   The exception that triggered this one.
     */
    public TextExtractorException(String message, Throwable cause) {
        super(message, cause);
    }
}
|
||||||
}
|
}
|
||||||
|
@ -36,7 +36,6 @@ import org.apache.tika.metadata.Metadata;
|
|||||||
import org.apache.tika.parser.ParseContext;
|
import org.apache.tika.parser.ParseContext;
|
||||||
import org.openide.util.NbBundle;
|
import org.openide.util.NbBundle;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
import org.sleuthkit.datamodel.ReadContentInputStream;
|
import org.sleuthkit.datamodel.ReadContentInputStream;
|
||||||
|
|
||||||
@ -67,7 +66,7 @@ class TikaTextExtractor extends FileTextExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Reader getReader(AbstractFile sourceFile) throws IngesterException, MissingResourceException {
|
public Reader getReader(AbstractFile sourceFile) throws TextExtractorException, MissingResourceException {
|
||||||
ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
|
ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
|
||||||
|
|
||||||
Metadata metadata = new Metadata();
|
Metadata metadata = new Metadata();
|
||||||
@ -81,12 +80,12 @@ class TikaTextExtractor extends FileTextExtractor {
|
|||||||
} catch (TimeoutException te) {
|
} catch (TimeoutException te) {
|
||||||
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
|
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
|
||||||
logWarning(msg, te);
|
logWarning(msg, te);
|
||||||
throw new IngesterException(msg);
|
throw new TextExtractorException(msg, te);
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex.getCause()); //NON-NLS
|
KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex.getCause()); //NON-NLS
|
||||||
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName());
|
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName());
|
||||||
logWarning(msg, ex);
|
logWarning(msg, ex);
|
||||||
throw new IngesterException(msg, ex);
|
throw new TextExtractorException(msg, ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user