diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java index 07657f9646..962e5ba245 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java @@ -39,6 +39,7 @@ import org.sleuthkit.datamodel.TskCoreException; * artifact's attributes. */ class ArtifactTextExtractor implements TextExtractor { + static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName()); /** @@ -82,26 +83,31 @@ class ArtifactTextExtractor implements TextExtractor { } @Override - public boolean isDisabled() { + public boolean isDisabled() { return false; - } + } - @Override - public void logWarning(final String msg, Exception ex) { + @Override + public void logWarning(final String msg, Exception ex) { logger.log(Level.WARNING, msg, ex); //NON-NLS } } - private InputStream getInputStream(BlackboardArtifact artifact) { + private InputStream getInputStream(BlackboardArtifact artifact) throws TextExtractorException { // Concatenate the string values of all attributes into a single // "content" string to be indexed. 
StringBuilder artifactContents = new StringBuilder(); + Content dataSource = null; try { - Content dataSource = getDataSource(artifact); - if (dataSource == null) { - return null; - } + dataSource = getDataSource(artifact); + } catch (TskCoreException tskCoreException) { + throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException); + } + if (dataSource == null) { + throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString()); + } + try { for (BlackboardAttribute attribute : artifact.getAttributes()) { artifactContents.append(attribute.getAttributeType().getDisplayName()); artifactContents.append(" : "); @@ -119,18 +125,15 @@ class ArtifactTextExtractor implements TextExtractor { } artifactContents.append(System.lineSeparator()); } - } catch (TskCoreException ex) { - logger.log(Level.SEVERE, "There was a problem getting the atributes for artifact " + artifact.getArtifactID(), ex); - return null; - } - if (artifactContents.length() == 0) { - return null; + } catch (TskCoreException tskCoreException) { + throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException); } + return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8); } @Override - public Reader getReader(BlackboardArtifact source) throws Ingester.IngesterException { + public Reader getReader(BlackboardArtifact source) throws TextExtractorException { return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8); } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java new file mode 100644 index 0000000000..3386472d03 --- /dev/null +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java @@ -0,0 +1,310 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2011-2016 Basis Technology Corp. 
+ * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.sleuthkit.autopsy.keywordsearch; + +import java.io.IOException; +import java.io.PushbackReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; +import java.util.Iterator; +import java.util.NoSuchElementException; +import javax.annotation.concurrent.NotThreadSafe; +import org.sleuthkit.autopsy.coreutils.TextUtil; +import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk; + +/** + * Encapsulates the content chunking algorithm in an implementation of the + * Iterator interface. Also implements Iterable so it can be used directly in a + * for loop. The base chunk is the part of the chunk before the overlapping + * window. The window will be included at the end of the current chunk as well + * as at the beginning of the next chunk. + */ +@NotThreadSafe +class Chunker implements Iterator, Iterable { + + //Chunking algorithm parameters-------------------------------------// + /** the maximum size of a chunk, including the window. */ + private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes + /** the minimum to read before we start the process of looking for + * whitespace to break at and creating an overlapping window. */ + private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes + /** The maximum size of the chunk, before the overlapping window, even if we + * couldn't find whitespace to break at. 
+ */ + private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes + /** The amount of text we will read through before we give up on finding + * whitespace to break the chunk/window at. */ + private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes + /** The number of characters to read in one go from the Reader. */ + private static final int READ_CHARS_BUFFER_SIZE = 512; //chars + + ////chunker state--------------------------------------------/// + /** The Reader that this chunker reads from, and divides into chunks. It is + * wrapped in a PushbackReader so that already-read text can be unread. */ + private final PushbackReader reader; + /** The local buffer of characters read from the Reader. */ + private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE]; + + /** the size in bytes of the chunk (so far). */ + private int chunkSizeBytes = 0; + /** Has the chunker reached the end of the Reader? If so, there are no more + * chunks, and the current chunk does not need a window. */ + private boolean endOfReaderReached = false; + /** Store any exception encountered reading from the Reader. */ + private Exception ex; + + /** + * Create a Chunker that will chunk the content of the given Reader. + * + * @param reader The content to chunk. + */ + Chunker(Reader reader) { + //Using MAX_TOTAL_CHUNK_SIZE is safe but probably overkill. + this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE); + } + + @Override + public Iterator iterator() { + return this; + } + + /** + * Has this Chunker encountered an exception reading from the Reader? + */ + boolean hasException() { + return ex != null; + } + + /** + * Get the exception encountered reading from the Reader. + * + * @return The exception, or null if no exception was encountered. 
+ */ + public Exception getException() { + return ex; + } + + @Override + public boolean hasNext() { + return (ex == null) + && (endOfReaderReached == false); + } + + /** + * Sanitize the given StringBuilder by replacing non-UTF-8 characters with + * caret '^' + * + * @param sb the StringBuilder to sanitize + * + * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping + * function? + */ + private static StringBuilder sanitizeToUTF8(StringBuilder sb) { + final int length = sb.length(); + for (int i = 0; i < length; i++) { + if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) { + sb.replace(i, i + 1, "^"); + } + } + return sb; + } + + @Override + public Chunk next() { + if (hasNext() == false) { + throw new NoSuchElementException("There are no more chunks."); + } + //reset state for the next chunk + + chunkSizeBytes = 0; + int baseChunkSizeChars = 0; + StringBuilder currentChunk = new StringBuilder(); + StringBuilder currentWindow = new StringBuilder(); + + try { + currentChunk.append(readBaseChunk()); + baseChunkSizeChars = currentChunk.length(); //save the base chunk length + currentWindow.append(readWindow()); + if (endOfReaderReached) { + /* if we have reached the end of the content,we won't make + * another overlapping chunk, so the length of the base chunk + * can be extended to the end. */ + baseChunkSizeChars = currentChunk.length(); + } else { + /* otherwise we will make another chunk, so unread the window */ + reader.unread(currentWindow.toString().toCharArray()); + } + } catch (Exception ioEx) { + /* Save the exception, which will cause hasNext() to return false, + * and break any chunking loop in client code. */ + ex = ioEx; + } + //add the window text to the current chunk. + currentChunk.append(currentWindow); + //sanitize the text and return a Chunk object, that includes the base chunk length. 
+ return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars); + } + + /** + * Read the base chunk from the reader, attempting to break at whitespace. + * + * @throws IOException if there is a problem reading from the reader. + */ + private StringBuilder readBaseChunk() throws IOException { + StringBuilder currentChunk = new StringBuilder(); + //read the chunk until the minimum base chunk size + readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk); + + //keep reading until the maximum base chunk size or white space is reached. + readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk); + return currentChunk; + } + + /** + * Read the window from the reader, attempting to break at whitespace. + * + * @throws IOException if there is a problem reading from the reader. + */ + private StringBuilder readWindow() throws IOException { + StringBuilder currentWindow = new StringBuilder(); + //read the window, leaving some room to look for white space to break at. + readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow); + + //keep reading until the max chunk size, or until whitespace is reached. + readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow); + return currentWindow; + } + + /** + * Read until maxBytes is reached, or end of reader. + * + * @param maxBytes + * @param currentSegment + * + * @throws IOException + */ + private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException { + int charsRead = 0; + //read chars up to maxBytes, or the end of the reader. + while ((chunkSizeBytes < maxBytes) + && (endOfReaderReached == false)) { + charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE); + if (-1 == charsRead) { + //this is the last chunk + endOfReaderReached = true; + return; + } else { + //if the last char might be part of a surrogate pair, unread it. 
+ final char lastChar = tempChunkBuf[charsRead - 1]; + if (Character.isHighSurrogate(lastChar)) { + charsRead--; + reader.unread(lastChar); + } + + String chunkSegment = new String(tempChunkBuf, 0, charsRead); + + //get the length in bytes of the read chars + int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length; + + //if it will not put us past maxBytes + if (chunkSizeBytes + segmentSize < maxBytes) { + //add it to the chunk + currentSegment.append(chunkSegment); + chunkSizeBytes += segmentSize; + } else { + //unread it, and break out of read loop. + reader.unread(tempChunkBuf, 0, charsRead); + return; + } + } + } + } + + /** + * Read until maxBytes is reached, whitespace, or end of reader. + * + * @param maxBytes + * @param currentSegment + * + * @throws IOException + */ + private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentSegment) throws IOException { + int charsRead = 0; + boolean whitespaceFound = false; + //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader. + while ((chunkSizeBytes < maxBytes) + && (whitespaceFound == false) + && (endOfReaderReached == false)) { + charsRead = reader.read(tempChunkBuf, 0, 1); + if (-1 == charsRead) { + //this is the last chunk + endOfReaderReached = true; + return; + } else { + //if the last character might be part of a surrogate pair, read another char + final char ch = tempChunkBuf[0]; + String chunkSegment; + if (Character.isHighSurrogate(ch)) { + charsRead = reader.read(tempChunkBuf, 1, 1); + if (charsRead == -1) { + //this is the last chunk, so include the unpaired surrogate + currentSegment.append(ch); + chunkSizeBytes += new Character(ch).toString().getBytes(StandardCharsets.UTF_8).length; + endOfReaderReached = true; + return; + } else { + //use the surrogate pair in place of the unpaired surrogate. 
+ chunkSegment = new String(tempChunkBuf, 0, 2); + } + } else { + //one char + chunkSegment = new String(tempChunkBuf, 0, 1); + } + //check for whitespace. + whitespaceFound = Character.isWhitespace(chunkSegment.codePointAt(0)); + //add read chars to the chunk and update the length. + currentSegment.append(chunkSegment); + chunkSizeBytes += chunkSegment.getBytes(StandardCharsets.UTF_8).length; + } + } + } + + /** + * Represents one chunk as the text in it and the length of the base chunk, + * in chars. + */ + static class Chunk { + + private final StringBuilder sb; + private final int chunksize; + + Chunk(StringBuilder sb, int baseChunkLength) { + this.sb = sb; + this.chunksize = baseChunkLength; + } + + @Override + public String toString() { + return sb.toString(); + } + + int getBaseChunkLength() { + return chunksize; + } + } +} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java index 55838f4e7f..689f42591b 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java @@ -95,7 +95,7 @@ abstract class FileTextExtractor implements TextExtractor< AbstractFile> { abstract boolean isSupported(AbstractFile file, String detectedFormat); @Override - public abstract Reader getReader(AbstractFile source) throws Ingester.IngesterException; + public abstract Reader getReader(AbstractFile source) throws TextExtractorException; @Override public long getID(AbstractFile source) { diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java index e758ef86a0..f72b02d1eb 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java @@ -65,7 
+65,7 @@ class HtmlTextExtractor extends FileTextExtractor { } @Override - public Reader getReader(AbstractFile sourceFile) throws Ingester.IngesterException { + public Reader getReader(AbstractFile sourceFile) throws TextExtractorException { ReadContentInputStream stream = new ReadContentInputStream(sourceFile); //Parse the stream with Jericho and put the results in a Reader @@ -159,7 +159,7 @@ class HtmlTextExtractor extends FileTextExtractor { // All done, now make it a reader return new StringReader(stringBuilder.toString()); } catch (IOException ex) { - throw new Ingester.IngesterException("Error extracting HTML from content.", ex); + throw new TextExtractorException("Error extracting HTML from content.", ex); } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java index 2e1c6feb2d..0dc221356c 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java @@ -18,22 +18,17 @@ */ package org.sleuthkit.autopsy.keywordsearch; -import com.google.common.base.Utf8; import java.io.BufferedReader; -import java.io.IOException; import java.util.HashMap; -import java.util.Iterator; import java.util.Map; -import java.util.NoSuchElementException; import java.util.logging.Level; -import javax.annotation.concurrent.NotThreadSafe; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.common.SolrInputDocument; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; -import org.sleuthkit.autopsy.coreutils.TextUtil; import org.sleuthkit.autopsy.datamodel.ContentUtils; import org.sleuthkit.autopsy.ingest.IngestJobContext; +import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.DerivedFile; @@ -149,8 +144,8 @@ class 
Ingester { int numChunks = 0; //unknown until chunking is done if (extractor.isDisabled()) { - /* some Extrctors, notable the strings extractor, have options which - * can be configured such that no extraction should be done */ + /* some Extractors, notable the strings extractor, have options + * which can be configured such that no extraction should be done */ return true; } @@ -171,13 +166,12 @@ class Ingester { + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS throw ingEx; //need to rethrow to signal error and move on - } catch (Exception ex) { - throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex); } } - } catch (IOException ex) { - extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS - return false; + if (chunker.hasException()) { + extractor.logWarning("Error chunking content from " + sourceID + ": " + sourceName, chunker.getException()); + return false; + } } catch (Exception ex) { extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS return false; @@ -192,7 +186,7 @@ class Ingester { } /** - * Add one chunk as to the Solr index as a seperate sold document. + * Add one chunk as to the Solr index as a separate Solr document. * * TODO see if can use a byte or string streaming way to add content to * /update handler e.g. with XMLUpdateRequestHandler (deprecated in SOlr @@ -232,7 +226,7 @@ class Ingester { uncommitedIngests = true; } catch (KeywordSearchModuleException ex) { - //JMTODO: does this need to ne internationalized? + //JMTODO: does this need to be internationalized? throw new IngesterException( NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex); } @@ -370,211 +364,3 @@ class Ingester { } } } - -/** - * Encapsulates the content chunking algorithm in an implementation of the - * Iterator interface. 
Also implements Iterable so it can be used directly in a - * for loop. The base chunk is the part of the chunk before the overlapping - * window. The window will be included at the end of the current chunk as well - * as at the beginning of the next chunk. - */ -@NotThreadSafe -class Chunker implements Iterator, Iterable { - - //Chunking algorithm paramaters-------------------------------------// - /** the maximum size of a chunk, including the window. */ - private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes - /** the minimum to read before we start the process of looking for - * whitespace to break at and creating an overlapping window. */ - private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes - /** The maximum size of the chunk, before the overlapping window, even if we - * couldn't find whitespace to break at. */ - private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes - /** The amount of text we will read through before we give up on finding - * whitespace to break the chunk/window at. */ - private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes - /** The number of characters to read in one go from the Reader. */ - private static final int READ_CHARS_BUFFER_SIZE = 512; //chars - - ////chunker state--------------------------------------------/// - /** The Reader that this chunk reads from, and divides into chunks. It must - * be a buffered reader to ensure that mark/reset are supported. */ - private final BufferedReader reader; - /** The local buffer of characters read from the Reader. */ - private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE]; - /** number of chars read in the most recent read operation. */ - private int charsRead = 0; - - /** The text of the current chunk (so far). */ - private StringBuilder currentChunk; - /** the size in bytes of the chunk (so far). */ - private int chunkSizeBytes = 0; - /** the size in chars of the (base) chunk (so far). 
*/ - private int baseChunkSizeChars; - - /** has the chunker found whitespace to break on? */ - private boolean whitespaceFound = false; - /** has the chunker reached the end of the Reader? If so, there are no more - * chunks, and the current chunk does not need a window. */ - private boolean endOfReaderReached = false; - - /** - * Create a Chunker that will chunk the content of the given Reader. - * - * @param reader The content to chunk. - */ - Chunker(BufferedReader reader) { - this.reader = reader; - } - - @Override - public Iterator iterator() { - return this; - } - - @Override - public boolean hasNext() { - return endOfReaderReached == false; - } - - /** - * Sanitize the given StringBuilder by replacing non-UTF-8 characters with - * caret '^' - * - * @param sb the StringBuilder to sanitize - * - * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping - * function? - */ - private static StringBuilder sanitizeToUTF8(StringBuilder sb) { - final int length = sb.length(); - for (int i = 0; i < length; i++) { - if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) { - sb.replace(i, i + 1, "^"); - } - } - return sb; - } - - @Override - public Chunk next() { - if (endOfReaderReached) { - throw new NoSuchElementException("There are no more chunks."); - } - //reset state for the next chunk - currentChunk = new StringBuilder(); - chunkSizeBytes = 0; - baseChunkSizeChars = 0; - - try { - readBaseChunk(); - baseChunkSizeChars = currentChunk.length(); - reader.mark(2048); //mark the reader so we can rewind the reader here to begin the next chunk - readWindow(); - } catch (IOException ioEx) { - throw new RuntimeException("IOException while reading chunk.", ioEx); - } - try { - reader.reset(); //reset the reader the so the next chunk can begin at the position marked above - } catch (IOException ex) { - throw new RuntimeException("IOException while resetting chunk reader.", ex); - } - - if (endOfReaderReached) { - /* if we have reached the end of the content,we 
won't make another - * overlapping chunk, so the base chunk can be extended to the end. */ - baseChunkSizeChars = currentChunk.length(); - } - //sanitize the text and return a Chunk object, that includes the base chunk length. - return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars); - } - - /** - * Read the base chunk from the reader, and attempt to break at whitespace. - * - * @throws IOException if there is a problem reading from the reader. - */ - private void readBaseChunk() throws IOException { - //read the chunk until the minimum base chunk size - readHelper(MINIMUM_BASE_CHUNK_SIZE, false); - //keep reading until the maximum base chunk size or white space is reached. - whitespaceFound = false; - readHelper(MAXIMUM_BASE_CHUNK_SIZE, true); - - } - - /** - * Read the window from the reader, and attempt to break at whitespace. - * - * @throws IOException if there is a problem reading from the reader. - */ - private void readWindow() throws IOException { - //read the window, leaving some room to look for white space to break at. - int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, chunkSizeBytes + 1024); - readHelper(windowEnd, false); - whitespaceFound = false; - //keep reading until the max chunk size, or until whitespace is reached. - windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE, chunkSizeBytes + 1024); - readHelper(windowEnd, true); - } - - /** Helper method that implements reading in a loop. - * - * @param maxBytes The max cummulative length of the content,in - * bytes, to read from the Reader. That is, when - * chunkSizeBytes >= maxBytes stop reading. - * @param inWhiteSpaceBuffer Should the current read stop once whitespace is - * found? - * - * @throws IOException If there is a problem reading from the Reader. - */ - private void readHelper(int maxBytes, boolean inWhiteSpaceBuffer) throws IOException { - //only read one character at a time if we are looking for whitespace. - final int readSize = inWhiteSpaceBuffer ? 
1 : READ_CHARS_BUFFER_SIZE; - - //read chars up to maxBytes, whitespaceFound if also inWhiteSpaceBuffer, or we reach the end of the reader. - while ((chunkSizeBytes < maxBytes) - && (false == (inWhiteSpaceBuffer && whitespaceFound)) - && (endOfReaderReached == false)) { - charsRead = reader.read(tempChunkBuf, 0, readSize); - if (-1 == charsRead) { - //this is the last chunk - endOfReaderReached = true; - } else { - if (inWhiteSpaceBuffer) { - //chec for whitespace. - whitespaceFound = Character.isWhitespace(tempChunkBuf[0]); - } - - //add read chars to the chunk and update the length. - String chunkSegment = new String(tempChunkBuf, 0, charsRead); - chunkSizeBytes += Utf8.encodedLength(chunkSegment); - currentChunk.append(chunkSegment); - } - } - } -} - -/** - * Represents one chunk as the text in it and the length of the base chunk, in - * chars. - */ -class Chunk { - - private final StringBuilder sb; - private final int chunksize; - - Chunk(StringBuilder sb, int baseChunkLength) { - this.sb = sb; - this.chunksize = baseChunkLength; - } - - @Override - public String toString() { - return sb.toString(); - } - - int getBaseChunkLength() { - return chunksize; - } -} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java index 23c49c255f..919510332b 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java @@ -107,7 +107,7 @@ class StringsTextExtractor extends FileTextExtractor { } @Override - public InputStreamReader getReader(AbstractFile sourceFile) throws Ingester.IngesterException { + public InputStreamReader getReader(AbstractFile sourceFile) throws TextExtractorException { InputStream stringStream = getInputStream(sourceFile); return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET); } diff --git 
a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java index 6ea27e733b..94abb940eb 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java @@ -30,7 +30,6 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem; */ interface TextExtractor< TextSource extends SleuthkitVisitableItem> { - /** * Is this extractor configured such that no extraction will/should be done? * @@ -46,7 +45,6 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> { */ abstract void logWarning(String msg, Exception ex); - /** * Get a reader that over the text extracted from the given source. * @@ -57,7 +55,7 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> { * * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException */ - abstract Reader getReader(TextSource source) throws Ingester.IngesterException; + abstract Reader getReader(TextSource source) throws TextExtractorException; /** * Get the 'object' id of the given source. 
@@ -76,4 +74,15 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> { * @return */ abstract String getName(TextSource source); + + class TextExtractorException extends Exception { + + public TextExtractorException(String message) { + super(message); + } + + public TextExtractorException(String message, Throwable cause) { + super(message, cause); + } + } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java index 7ac5392016..3a494be9c6 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java @@ -36,7 +36,6 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; -import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.ReadContentInputStream; @@ -67,7 +66,7 @@ class TikaTextExtractor extends FileTextExtractor { } @Override - public Reader getReader(AbstractFile sourceFile) throws IngesterException, MissingResourceException { + public Reader getReader(AbstractFile sourceFile) throws TextExtractorException, MissingResourceException { ReadContentInputStream stream = new ReadContentInputStream(sourceFile); Metadata metadata = new Metadata(); @@ -81,12 +80,12 @@ class TikaTextExtractor extends FileTextExtractor { } catch (TimeoutException te) { final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName()); logWarning(msg, te); - throw new IngesterException(msg); + throw new TextExtractorException(msg, te); } catch (Exception ex) { KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + 
sourceFile.getId() + ": " + sourceFile.getName(), ex.getCause()); //NON-NLS final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName()); logWarning(msg, ex); - throw new IngesterException(msg, ex); + throw new TextExtractorException(msg, ex); } }