Merge branch 'develop' of https://github.com/sleuthkit/autopsy into 2197-ProfileOptionsPanel

William Schaefer 2017-01-11 14:39:30 -05:00
commit 35bf21eefe
11 changed files with 468 additions and 247 deletions

View File

@ -19,6 +19,8 @@
package org.sleuthkit.autopsy.datamodel;
import java.awt.event.ActionEvent;
import java.beans.PropertyChangeEvent;
import java.beans.PropertyChangeListener;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
@ -27,6 +29,7 @@ import java.util.List;
import java.util.logging.Level;
import javax.swing.AbstractAction;
import javax.swing.Action;
import org.openide.nodes.Children;
import org.openide.nodes.Sheet;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
@ -35,11 +38,14 @@ import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.directorytree.ExplorerNodeActionVisitor;
import org.sleuthkit.autopsy.directorytree.FileSearchAction;
import org.sleuthkit.autopsy.directorytree.NewWindowViewAction;
import org.sleuthkit.autopsy.ingest.IngestManager;
import org.sleuthkit.autopsy.ingest.ModuleContentEvent;
import org.sleuthkit.autopsy.ingest.RunIngestModulesDialog;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.Image;
import org.sleuthkit.datamodel.SleuthkitCase.CaseDbQuery;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.VirtualDirectory;
/**
* This class is used to represent the "Node" for the image. The children of
@ -71,6 +77,16 @@ public class ImageNode extends AbstractContentNode<Image> {
String imgName = nameForImage(img);
this.setDisplayName(imgName);
this.setIconBaseWithExtension("org/sleuthkit/autopsy/images/hard-drive-icon.jpg"); //NON-NLS
// Listen for ingest events so that we can detect newly added files (e.g. carved files)
IngestManager.getInstance().addIngestModuleEventListener(pcl);
// Listen for case events so that we can detect when the case is closed
Case.addPropertyChangeListener(pcl);
}
private void removeListeners() {
IngestManager.getInstance().removeIngestModuleEventListener(pcl);
Case.removePropertyChangeListener(pcl);
}
/**
@ -199,4 +215,46 @@ public class ImageNode extends AbstractContentNode<Image> {
public String getItemType() {
return getClass().getName();
}
private final PropertyChangeListener pcl = (PropertyChangeEvent evt) -> {
String eventType = evt.getPropertyName();
// See if the new file is a child of ours
if (eventType.equals(IngestManager.IngestModuleEvent.CONTENT_CHANGED.toString())) {
if ((evt.getOldValue() instanceof ModuleContentEvent) == false) {
return;
}
ModuleContentEvent moduleContentEvent = (ModuleContentEvent) evt.getOldValue();
if ((moduleContentEvent.getSource() instanceof Content) == false) {
return;
}
Content newContent = (Content) moduleContentEvent.getSource();
try {
Content parent = newContent.getParent();
if (parent != null) {
// Is this a new carved file?
if (parent.getName().equals(VirtualDirectory.NAME_CARVED)) {
// Was this new carved file produced from this image?
if (parent.getParent().getId() == getContent().getId()) {
Children children = getChildren();
if (children != null) {
((ContentChildren) children).refreshChildren();
children.getNodesCount();
}
}
}
}
} catch (TskCoreException ex) {
// Do nothing.
}
} else if (eventType.equals(Case.Events.CURRENT_CASE.toString())) {
if (evt.getNewValue() == null) {
// case was closed. Remove listeners so that we don't get called with a stale case handle
removeListeners();
}
}
};
}
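The listener wiring above is the heart of this change: ImageNode now subscribes to ingest-module and case events so it can refresh its children when carved files appear, and unsubscribes when the case closes. Below is a minimal sketch of that lifecycle, using only the classes and event names visible in this diff; the surrounding node-like class is hypothetical, not part of the commit.

    import java.beans.PropertyChangeEvent;
    import java.beans.PropertyChangeListener;
    import org.sleuthkit.autopsy.casemodule.Case;
    import org.sleuthkit.autopsy.ingest.IngestManager;

    // Hypothetical class illustrating the listener lifecycle used by ImageNode.
    class ExampleEventAwareNode {

        private final PropertyChangeListener pcl = (PropertyChangeEvent evt) -> {
            if (evt.getPropertyName().equals(
                    IngestManager.IngestModuleEvent.CONTENT_CHANGED.toString())) {
                // refresh children here, as ImageNode does for newly carved files
            } else if (evt.getPropertyName().equals(Case.Events.CURRENT_CASE.toString())
                    && evt.getNewValue() == null) {
                removeListeners(); // case closed: avoid callbacks with a stale case handle
            }
        };

        ExampleEventAwareNode() {
            IngestManager.getInstance().addIngestModuleEventListener(pcl);
            Case.addPropertyChangeListener(pcl);
        }

        private void removeListeners() {
            IngestManager.getInstance().removeIngestModuleEventListener(pcl);
            Case.removePropertyChangeListener(pcl);
        }
    }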

View File

@ -82,7 +82,13 @@ ConfirmationDialog.Exit=Exit
ConfirmationDialog.DoNotExit=Do Not Exit
ConfirmationDialog.ConfirmExit=All incomplete copy jobs will be cancelled. Are you sure?
ConfirmationDialog.ConfirmExitHeader=Confirm Exit
OpenIDE-Module-Long-Description=\
This module contains features that are being developed by Basis Technology and are not part of the default Autopsy distribution. \
You can enable this module to use the new features. \
The features should be stable, but their exact behavior and API are subject to change. \n\n\
We make no guarantee that the API of this module will not change, so developers should be careful when relying on it.
OpenIDE-Module-Name=Experimental
OpenIDE-Module-Short-Description=This module contains features that are being developed by Basis Technology and are not part of the default Autopsy distribution.
ReviewModeCasePanel.bnRefresh.text=&Refresh
ReviewModeCasePanel.bnOpen.text=&Open
ReviewModeCasePanel.rbGroupLabel.text=Show Last 10:

View File

@ -526,6 +526,7 @@
<!-- file chunk-specific fields (optional for others) -->
<!-- for a parent file with no content, number of chunks are specified -->
<field name="num_chunks" type="int" indexed="true" stored="true" required="false" />
<field name="chunk_size" type="int" indexed="true" stored="true" required="false" />
<!-- Common metadata fields, named specifically to match up with
SolrCell metadata when parsing rich documents such as Word, PDF.
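The new chunk_size field records the length of a chunk's base text, i.e. the part before the overlapping window produced by the Chunker added in Ingester.java below. A rough sketch of how a chunk document might carry the field via SolrJ; in the real code the id comes from Server.getChunkIdString() and the field name from Server.Schema.CHUNK_SIZE, so the id format and literal values here are assumptions.

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.solr.common.SolrInputDocument;

    class ChunkFieldExample {
        // Builds a chunk document carrying the new chunk_size field; the id format
        // ("<source id>_<chunk number>") and the values are illustrative only.
        static SolrInputDocument buildChunkDoc(long sourceId, int chunkNumber, int baseChunkLength) {
            Map<String, String> fields = new HashMap<>();
            fields.put("id", sourceId + "_" + chunkNumber);
            fields.put("chunk_size", String.valueOf(baseChunkLength));
            SolrInputDocument doc = new SolrInputDocument();
            for (Map.Entry<String, String> field : fields.entrySet()) {
                doc.addField(field.getKey(), field.getValue());
            }
            return doc;
        }
    }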

View File

@ -38,7 +38,7 @@ import org.sleuthkit.datamodel.TskCoreException;
* Extracts text from artifacts by concatenating the values of all of the
* artifact's attributes.
*/
public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
/**
@ -82,13 +82,16 @@ public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
}
@Override
boolean isDisabled() {
public boolean isDisabled() {
return false;
}
@Override
public void logWarning(final String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS
}
@Override
InputStream getInputStream(BlackboardArtifact artifact) {
private InputStream getInputStream(BlackboardArtifact artifact) {
// Concatenate the string values of all attributes into a single
// "content" string to be indexed.
StringBuilder artifactContents = new StringBuilder();
@ -127,17 +130,17 @@ public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
}
@Override
Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
return new InputStreamReader(stream, StandardCharsets.UTF_8);
public Reader getReader(BlackboardArtifact source) throws Ingester.IngesterException {
return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
}
@Override
long getID(BlackboardArtifact source) {
public long getID(BlackboardArtifact source) {
return source.getArtifactID();
}
@Override
String getName(BlackboardArtifact source) {
public String getName(BlackboardArtifact source) {
return source.getDisplayName() + "_" + source.getArtifactID();
}
}
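With this refactoring the extractor owns its InputStream internally, so callers only ever see a Reader. A small sketch of consuming the new single-argument getReader() contract; it assumes the helper lives in the same keywordsearch package (the classes involved are package-private) and that the extractor and artifact are supplied by the caller.

    import java.io.BufferedReader;
    import java.io.IOException;
    import org.sleuthkit.datamodel.BlackboardArtifact;

    class ArtifactReaderExample {
        // Reads the artifact's concatenated attribute text through the new getReader(source) method.
        static String readAll(ArtifactTextExtractor extractor, BlackboardArtifact artifact)
                throws Ingester.IngesterException, IOException {
            StringBuilder text = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(extractor.getReader(artifact))) {
                char[] buf = new char[1024];
                int read;
                while ((read = reader.read(buf)) != -1) {
                    text.append(buf, 0, read);
                }
            }
            return text.toString();
        }
    }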

View File

@ -18,7 +18,6 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.InputStream;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;
@ -28,7 +27,7 @@ import org.sleuthkit.datamodel.AbstractFile;
* Common methods for utilities that extract text from content and divide it
* into chunks
*/
abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
abstract class FileTextExtractor implements TextExtractor< AbstractFile> {
static final List<String> BLOB_MIME_TYPES
@ -96,17 +95,16 @@ abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
abstract boolean isSupported(AbstractFile file, String detectedFormat);
@Override
abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;
public abstract Reader getReader(AbstractFile source) throws Ingester.IngesterException;
@Override
long getID(AbstractFile source) {
public long getID(AbstractFile source) {
return source.getId();
}
@Override
String getName(AbstractFile source) {
public String getName(AbstractFile source) {
return source.getName();
}
}
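FileTextExtractor keeps the per-file concerns (MIME-type support checks, ids, names) while TextExtractor is now a plain interface. A hedged sketch of how a caller might pick an extractor using isSupported() and the now-public isDisabled(); this selection loop is an illustration with an assumed fallback, not the module's actual dispatch code, and it assumes same-package access to the package-private isSupported() method.

    import java.util.List;
    import org.sleuthkit.datamodel.AbstractFile;

    class ExtractorSelectionExample {
        // Picks the first enabled file extractor that claims support for the detected MIME type.
        // The extractor list and detected format are assumed to be supplied by the caller.
        static FileTextExtractor pickExtractor(List<FileTextExtractor> extractors,
                AbstractFile file, String detectedFormat) {
            for (FileTextExtractor extractor : extractors) {
                if (extractor.isDisabled() == false && extractor.isSupported(file, detectedFormat)) {
                    return extractor;
                }
            }
            return null; // caller would presumably fall back to the strings extractor
        }
    }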

View File

@ -19,26 +19,28 @@
package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
/**
* Extractor of text from HTML supported AbstractFile content. Extracted text
* will be divided into chunks and indexed with Solr. If HTML extraction succeeds,
* chunks are indexed with Solr.
* will be divided into chunks and indexed with Solr. If HTML extraction
* succeeds, chunks are indexed with Solr.
*/
class HtmlTextExtractor extends FileTextExtractor {
static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
private static final int MAX_SIZE = 50_000_000; //50MB
static final List<String> WEB_MIME_TYPES = Arrays.asList(
@ -63,7 +65,9 @@ class HtmlTextExtractor extends FileTextExtractor {
}
@Override
Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
public Reader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
//Parse the stream with Jericho and put the results in a Reader
try {
StringBuilder scripts = new StringBuilder();
@ -77,7 +81,7 @@ class HtmlTextExtractor extends FileTextExtractor {
int numComments = 0;
int numOthers = 0;
Source source = new Source(in);
Source source = new Source(stream);
source.fullSequentialParse();
Renderer renderer = source.getRenderer();
renderer.setNewLine("\n");
@ -160,12 +164,11 @@ class HtmlTextExtractor extends FileTextExtractor {
}
@Override
InputStream getInputStream(AbstractFile sourceFile1) {
return new ReadContentInputStream(sourceFile1);
}
@Override
boolean isDisabled() {
public boolean isDisabled() {
return false;
}
public void logWarning(final String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS
}
}
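For reference, the Jericho calls used by getReader() above, stripped of the script/link/comment bookkeeping: parse the stream, render it to plain text, and hand back a Reader. This is a simplified sketch, not the full extractor.

    import java.io.IOException;
    import java.io.InputStream;
    import java.io.Reader;
    import java.io.StringReader;
    import net.htmlparser.jericho.Renderer;
    import net.htmlparser.jericho.Source;

    class JerichoRenderExample {
        // Parses HTML from a stream and returns a Reader over the rendered plain text.
        static Reader renderHtml(InputStream in) throws IOException {
            Source source = new Source(in);
            source.fullSequentialParse();
            Renderer renderer = source.getRenderer();
            renderer.setNewLine("\n");
            return new StringReader(renderer.toString());
        }
    }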

View File

@ -18,12 +18,15 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import com.google.common.base.Utf8;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.logging.Level;
import javax.annotation.concurrent.NotThreadSafe;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrInputDocument;
import org.openide.util.NbBundle;
@ -55,9 +58,7 @@ class Ingester {
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
private static Ingester instance;
private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
private static final int SINGLE_READ_CHARS = 1024;
private static final int EXTRA_CHARS = 128;
private static final int SINGLE_READ_CHARS = 512;
private Ingester() {
}
@ -121,6 +122,136 @@ class Ingester {
return item.accept(SOLR_FIELDS_VISITOR);
}
/**
* Use the given TextExtractor to extract text from the given source. The
* text will be chunked and each chunk passed to Solr to add to the index.
*
*
* @param <A> The type of the Appendix provider that provides
* additional text to append to the final chunk.
* @param <T> A subclass of SleuthkitVisitableItem.
* @param extractor The TextExtractor that will be used to extract text from
* the given source.
* @param source The source from which text will be extracted, chunked,
* and indexed.
* @param context The ingest job context that can be used to cancel this
* process.
*
* @return True if this method executed normally, or False if there was an
* unexpected exception. //JMTODO: This policy needs to be reviewed.
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
final long sourceID = extractor.getID(source);
final String sourceName = extractor.getName(source);
int numChunks = 0; //unknown until chunking is done
if (extractor.isDisabled()) {
/* some Extractors, notably the strings extractor, have options which
* can be configured such that no extraction should be done */
return true;
}
Map<String, String> fields = getContentFields(source);
//Get a reader for the content of the given source
try (BufferedReader reader = new BufferedReader(extractor.getReader(source));) {
Chunker chunker = new Chunker(reader);
for (Chunk chunk : chunker) {
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
fields.put(Server.Schema.ID.toString(), chunkId);
fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
try {
//add the chunk text to Solr index
indexChunk(chunk.toString(), sourceName, fields);
numChunks++;
} catch (Ingester.IngesterException ingEx) {
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
throw ingEx; //need to rethrow to signal error and move on
} catch (Exception ex) {
throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
}
}
} catch (IOException ex) {
extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false;
} catch (Exception ex) {
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false;
} finally {
//after all chunks, index just the meta data, including the numChunks, of the parent file
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
indexChunk(null, sourceName, fields);
}
return true;
}
/**
* Add one chunk to the Solr index as a separate Solr document.
*
* TODO see if can use a byte or string streaming way to add content to
* /update handler e.g. with XMLUpdateRequestHandler (deprecated in Solr
* 4.0.0), see if possible to stream with UpdateRequestHandler
*
* @param chunk The chunk content as a string
* @param fields
* @param size
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
//JMTODO: actually if we couldn't get the image id it is set to -1,
// but does this really mean we don't want to index it?
//skip the file, image id unknown
//JMTODO: does this need to be internationalized?
String msg = NbBundle.getMessage(Ingester.class,
"Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to be internationalized?
logger.log(Level.SEVERE, msg);
throw new IngesterException(msg);
}
//Make a SolrInputDocument out of the field map
SolrInputDocument updateDoc = new SolrInputDocument();
for (String key : fields.keySet()) {
updateDoc.addField(key, fields.get(key));
}
//add the content to the SolrInputDocument
//JMTODO: can we just add it to the field map before passing that in?
updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
try {
//TODO: consider timeout thread, or vary socket timeout based on size of indexed content
solrServer.addDocument(updateDoc);
uncommitedIngests = true;
} catch (KeywordSearchModuleException ex) {
//JMTODO: does this need to be internationalized?
throw new IngesterException(
NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
}
}
/**
* Tells Solr to commit (necessary before ingested files will appear in
* searches)
*/
void commit() {
try {
solrServer.commit();
uncommitedIngests = false;
} catch (NoOpenCoreException | SolrServerException ex) {
logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
}
}
/**
* Visitor used to create fields to send to SOLR index.
*/
@ -222,192 +353,6 @@ class Ingester {
}
}
/**
* Use the given TextExtractor to extract text from the given source. The
* text will be chunked and each chunk passed to Solr to add to the index.
*
*
* @param <A> The type of the Appendix provider that provides
* additional text to append to the final chunk.
* @param <T> A subclass of SleuthkitVisitableItem.
* @param extractor The TextExtractor that will be used to extract text from
* the given source.
* @param source The source from which text will be extracted, chunked,
* and indexed.
* @param context The ingest job context that can be used to cancel this
* process.
*
* @return True if this method executed normally, or False if there was an
* unexpected exception. //JMTODO: This policy needs to be reviewed.
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
final long sourceID = extractor.getID(source);
final String sourceName = extractor.getName(source);
int numChunks = 0; //unknown until chunking is done
if (extractor.isDisabled()) {
/* some Extractors, notably the strings extractor, have options which
* can be configured such that no extraction should be done */
return true;
}
Map<String, String> fields = getContentFields(source);
//Get a stream and a reader for that stream
try (final InputStream stream = extractor.getInputStream(source);
Reader reader = extractor.getReader(stream, source);) {
//we read max 1024 chars at a time, this seems to be the max that some Readers would return
char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
boolean eof = false; //have we read until the end of the file yet
while (!eof) {
int chunkSizeInChars = 0; // the size in chars of the chunk (so far)
if (context != null && context.fileIngestIsCancelled()) {
return true;
}
long charsRead = 0; // number of chars read in the most recent read operation
//consume bytes to fill entire chunk (but leave EXTRA_CHARS to end the word)
while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
&& (charsRead = reader.read(textChunkBuf, chunkSizeInChars, SINGLE_READ_CHARS)) != -1) {
chunkSizeInChars += charsRead;
}
if (charsRead == -1) {
//this is the last chunk
eof = true;
} else {
chunkSizeInChars += charsRead;
//if we haven't reached the end of the file,
//try to read char-by-char until whitespace to not break words
while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - 1)
&& (Character.isWhitespace(textChunkBuf[chunkSizeInChars - 1]) == false)
&& (charsRead = reader.read(textChunkBuf, chunkSizeInChars, 1)) != -1) {
chunkSizeInChars += charsRead;
}
if (charsRead == -1) {
//this is the last chunk
eof = true;
}
}
StringBuilder sb = new StringBuilder(chunkSizeInChars)
.append(textChunkBuf, 0, chunkSizeInChars);
sanitizeToUTF8(sb); //replace non UTF8 chars with '^'
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
fields.put(Server.Schema.ID.toString(), chunkId);
try {
//pass the chunk to method that adds it to Solr index
indexChunk(sb.toString(), sourceName, fields);
numChunks++;
} catch (Ingester.IngesterException ingEx) {
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
throw ingEx; //need to rethrow to signal error and move on
} catch (Exception ex) {
throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
}
}
} catch (IOException ex) {
extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false;
} catch (Exception ex) {
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false;
} finally {
//after all chunks, index just the meta data, including the numChunks, of the parent file
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
indexChunk(null, sourceName, fields);
}
return true;
}
/**
* Sanitize the given StringBuilder by replacing non-UTF-8 characters with
* caret '^'
*
* @param sb the StringBuilder to sanitize
*
* //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
* function?
*/
private static void sanitizeToUTF8(StringBuilder sb) {
final int length = sb.length();
// Sanitize by replacing non-UTF-8 characters with caret '^'
for (int i = 0; i < length; i++) {
if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
sb.replace(i, i + 1, "^");
}
}
}
/**
* Add one chunk to the Solr index as a separate Solr document.
*
* TODO see if can use a byte or string streaming way to add content to
* /update handler e.g. with XMLUpdateRequestHandler (deprecated in Solr
* 4.0.0), see if possible to stream with UpdateRequestHandler
*
* @param chunk The chunk content as a string
* @param fields
* @param size
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
//JMTODO: actually if we couldn't get the image id it is set to -1,
// but does this really mean we don't want to index it?
//skip the file, image id unknown
//JMTODO: does this need to be internationalized?
String msg = NbBundle.getMessage(Ingester.class,
"Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to be internationalized?
logger.log(Level.SEVERE, msg);
throw new IngesterException(msg);
}
//Make a SolrInputDocument out of the field map
SolrInputDocument updateDoc = new SolrInputDocument();
for (String key : fields.keySet()) {
updateDoc.addField(key, fields.get(key));
}
//add the content to the SolrInputDocument
//JMTODO: can we just add it to the field map before passing that in?
updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
try {
//TODO: consider timeout thread, or vary socket timeout based on size of indexed content
solrServer.addDocument(updateDoc);
uncommitedIngests = true;
} catch (KeywordSearchModuleException ex) {
//JMTODO: does this need to be internationalized?
throw new IngesterException(
NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
}
}
/**
* Tells Solr to commit (necessary before ingested files will appear in
* searches)
*/
void commit() {
try {
solrServer.commit();
uncommitedIngests = false;
} catch (NoOpenCoreException | SolrServerException ex) {
logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
}
}
/**
* Indicates that there was an error with the specific ingest operation, but
* it's still okay to continue ingesting files.
@ -425,3 +370,211 @@ class Ingester {
}
}
}
/**
* Encapsulates the content chunking algorithm in an implementation of the
* Iterator interface. Also implements Iterable so it can be used directly in a
* for loop. The base chunk is the part of the chunk before the overlapping
* window. The window will be included at the end of the current chunk as well
* as at the beginning of the next chunk.
*/
@NotThreadSafe
class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
//Chunking algorithm parameters-------------------------------------//
/** the maximum size of a chunk, including the window. */
private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes
/** the minimum to read before we start the process of looking for
* whitespace to break at and creating an overlapping window. */
private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
/** The maximum size of the chunk, before the overlapping window, even if we
* couldn't find whitespace to break at. */
private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
/** The amount of text we will read through before we give up on finding
* whitespace to break the chunk/window at. */
private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
/** The number of characters to read in one go from the Reader. */
private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
////chunker state--------------------------------------------///
/** The Reader that this chunker reads from, and divides into chunks. It must
* be a buffered reader to ensure that mark/reset are supported. */
private final BufferedReader reader;
/** The local buffer of characters read from the Reader. */
private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
/** number of chars read in the most recent read operation. */
private int charsRead = 0;
/** The text of the current chunk (so far). */
private StringBuilder currentChunk;
/** the size in bytes of the chunk (so far). */
private int chunkSizeBytes = 0;
/** the size in chars of the (base) chunk (so far). */
private int baseChunkSizeChars;
/** has the chunker found whitespace to break on? */
private boolean whitespaceFound = false;
/** has the chunker reached the end of the Reader? If so, there are no more
* chunks, and the current chunk does not need a window. */
private boolean endOfReaderReached = false;
/**
* Create a Chunker that will chunk the content of the given Reader.
*
* @param reader The content to chunk.
*/
Chunker(BufferedReader reader) {
this.reader = reader;
}
@Override
public Iterator<Chunk> iterator() {
return this;
}
@Override
public boolean hasNext() {
return endOfReaderReached == false;
}
/**
* Sanitize the given StringBuilder by replacing non-UTF-8 characters with
* caret '^'
*
* @param sb the StringBuilder to sanitize
*
* //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
* function?
*/
private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
final int length = sb.length();
for (int i = 0; i < length; i++) {
if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
sb.replace(i, i + 1, "^");
}
}
return sb;
}
@Override
public Chunk next() {
if (endOfReaderReached) {
throw new NoSuchElementException("There are no more chunks.");
}
//reset state for the next chunk
currentChunk = new StringBuilder();
chunkSizeBytes = 0;
baseChunkSizeChars = 0;
try {
readBaseChunk();
baseChunkSizeChars = currentChunk.length();
reader.mark(2048); //mark the reader so we can rewind the reader here to begin the next chunk
readWindow();
} catch (IOException ioEx) {
throw new RuntimeException("IOException while reading chunk.", ioEx);
}
try {
reader.reset(); //reset the reader so the next chunk can begin at the position marked above
} catch (IOException ex) {
throw new RuntimeException("IOException while resetting chunk reader.", ex);
}
if (endOfReaderReached) {
/* if we have reached the end of the content, we won't make another
* overlapping chunk, so the base chunk can be extended to the end. */
baseChunkSizeChars = currentChunk.length();
}
//sanitize the text and return a Chunk object that includes the base chunk length.
return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
}
/**
* Read the base chunk from the reader, and attempt to break at whitespace.
*
* @throws IOException if there is a problem reading from the reader.
*/
private void readBaseChunk() throws IOException {
//read the chunk until the minimum base chunk size
readHelper(MINIMUM_BASE_CHUNK_SIZE, false);
//keep reading until the maximum base chunk size or white space is reached.
whitespaceFound = false;
readHelper(MAXIMUM_BASE_CHUNK_SIZE, true);
}
/**
* Read the window from the reader, and attempt to break at whitespace.
*
* @throws IOException if there is a problem reading from the reader.
*/
private void readWindow() throws IOException {
//read the window, leaving some room to look for white space to break at.
int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, chunkSizeBytes + 1024);
readHelper(windowEnd, false);
whitespaceFound = false;
//keep reading until the max chunk size, or until whitespace is reached.
windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE, chunkSizeBytes + 1024);
readHelper(windowEnd, true);
}
/** Helper method that implements reading in a loop.
*
* @param maxBytes The maximum cumulative length of the content, in
* bytes, to read from the Reader. That is, stop reading
* when chunkSizeBytes >= maxBytes.
* @param inWhiteSpaceBuffer Should the current read stop once whitespace is
* found?
*
* @throws IOException If there is a problem reading from the Reader.
*/
private void readHelper(int maxBytes, boolean inWhiteSpaceBuffer) throws IOException {
//only read one character at a time if we are looking for whitespace.
final int readSize = inWhiteSpaceBuffer ? 1 : READ_CHARS_BUFFER_SIZE;
//read chars until we hit maxBytes, find whitespace (if inWhiteSpaceBuffer), or reach the end of the reader.
while ((chunkSizeBytes < maxBytes)
&& (false == (inWhiteSpaceBuffer && whitespaceFound))
&& (endOfReaderReached == false)) {
charsRead = reader.read(tempChunkBuf, 0, readSize);
if (-1 == charsRead) {
//this is the last chunk
endOfReaderReached = true;
} else {
if (inWhiteSpaceBuffer) {
//check for whitespace.
whitespaceFound = Character.isWhitespace(tempChunkBuf[0]);
}
//add read chars to the chunk and update the length.
String chunkSegment = new String(tempChunkBuf, 0, charsRead);
chunkSizeBytes += Utf8.encodedLength(chunkSegment);
currentChunk.append(chunkSegment);
}
}
}
}
/**
* Represents one chunk as the text in it and the length of the base chunk, in
* chars.
*/
class Chunk {
private final StringBuilder sb;
private final int chunksize;
Chunk(StringBuilder sb, int baseChunkLength) {
this.sb = sb;
this.chunksize = baseChunkLength;
}
@Override
public String toString() {
return sb.toString();
}
int getBaseChunkLength() {
return chunksize;
}
}
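The new Chunker replaces the hand-rolled read loop deleted further up: it implements Iterator and Iterable, so indexText() reduces to a for-each over chunks, each of which knows its base-chunk length (the text after that length is the overlapping window that will also begin the next chunk). A minimal sketch of driving it directly over an in-memory string; it assumes same-package access to the package-private Chunker and Chunk classes.

    import java.io.BufferedReader;
    import java.io.StringReader;

    class ChunkerExample {
        // Splits in-memory text into overlapping chunks, as indexText() does for extracted text.
        static void printChunks(String text) {
            Chunker chunker = new Chunker(new BufferedReader(new StringReader(text)));
            int chunkNumber = 0;
            for (Chunk chunk : chunker) {
                chunkNumber++;
                System.out.println("chunk " + chunkNumber
                        + ": total chars=" + chunk.toString().length()
                        + ", base chunk chars=" + chunk.getBaseChunkLength());
            }
        }
    }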

View File

@ -149,6 +149,12 @@ public class Server {
return "num_chunks"; //NON-NLS
}
},
CHUNK_SIZE {
@Override
public String toString() {
return "chunk_size"; //NON-NLS
}
}
};
public static final String HL_ANALYZE_CHARS_UNLIMITED = "500000"; //max 1MB in a chunk. use -1 for unlimited, but -1 option may not be supported (not documented)

View File

@ -25,6 +25,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
@ -37,6 +38,8 @@ import org.sleuthkit.datamodel.TskException;
*/
class StringsTextExtractor extends FileTextExtractor {
static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
/**
* Options for this extractor
*/
@ -91,7 +94,12 @@ class StringsTextExtractor extends FileTextExtractor {
}
@Override
boolean isDisabled() {
public void logWarning(final String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS
}
@Override
public boolean isDisabled() {
boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
@ -99,11 +107,11 @@ class StringsTextExtractor extends FileTextExtractor {
}
@Override
InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
public InputStreamReader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
InputStream stringStream = getInputStream(sourceFile);
return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
}
@Override
InputStream getInputStream(AbstractFile sourceFile) {
//check which extract stream to use
if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
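The truncated hunk above parses the EXTRACT_UTF8 and EXTRACT_UTF16 options; presumably isDisabled() returns true only when both are off. A standalone sketch of that check; the literal option keys and the values in main() are assumptions, since the real code goes through the ExtractOptions enum.

    import java.util.HashMap;
    import java.util.Map;

    class StringsOptionsExample {
        // The strings extractor is disabled only when neither UTF-8 nor UTF-16 extraction is requested.
        static boolean isStringsExtractionDisabled(Map<String, String> extractOptions) {
            boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get("EXTRACT_UTF8"));
            boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get("EXTRACT_UTF16"));
            return extractUTF8 == false && extractUTF16 == false;
        }

        public static void main(String[] args) {
            Map<String, String> options = new HashMap<>();
            options.put("EXTRACT_UTF8", "true");   // illustrative values only
            options.put("EXTRACT_UTF16", "false");
            System.out.println(isStringsExtractionDisabled(options)); // prints false
        }
    }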

View File

@ -18,10 +18,7 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.InputStream;
import java.io.Reader;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
/**
@ -31,9 +28,8 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
* @param <TextSource> The subtype of SleuthkitVisitableItem an implementation
* is able to process.
*/
abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());
/**
* Is this extractor configured such that no extraction will/should be done?
@ -48,18 +44,8 @@ abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
* @param msg
* @param ex
*/
void logWarning(String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS
}
abstract void logWarning(String msg, Exception ex);
/**
* Get an input stream over the content of the given source.
*
* @param source
*
* @return
*/
abstract InputStream getInputStream(TextSource source);
/**
* Get a reader over the text extracted from the given source.
@ -71,7 +57,7 @@ abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;
abstract Reader getReader(TextSource source) throws Ingester.IngesterException;
/**
* Get the 'object' id of the given source.
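Turning TextExtractor into an interface means a new source type only has to supply these methods. Below is a hypothetical implementation over a generic Content source, assuming the interface declares exactly the five methods visible in this diff (isDisabled, logWarning, getReader, getID, getName) and that the class sits in the keywordsearch package.

    import java.io.Reader;
    import java.io.StringReader;
    import java.util.logging.Level;
    import org.sleuthkit.autopsy.coreutils.Logger;
    import org.sleuthkit.datamodel.Content;

    // Hypothetical extractor showing the shape of the new interface; it "extracts" only the content's name.
    class NameOnlyTextExtractor implements TextExtractor<Content> {

        private static final Logger logger = Logger.getLogger(NameOnlyTextExtractor.class.getName());

        @Override
        public boolean isDisabled() {
            return false;
        }

        @Override
        public void logWarning(String msg, Exception ex) {
            logger.log(Level.WARNING, msg, ex);
        }

        @Override
        public Reader getReader(Content source) throws Ingester.IngesterException {
            return new StringReader(source.getName());
        }

        @Override
        public long getID(Content source) {
            return source.getId();
        }

        @Override
        public String getName(Content source) {
            return source.getName();
        }
    }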

View File

@ -20,7 +20,6 @@ package org.sleuthkit.autopsy.keywordsearch;
import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.List;
import java.util.MissingResourceException;
@ -36,6 +35,7 @@ import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
@ -51,22 +51,25 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
*/
class TikaTextExtractor extends FileTextExtractor {
static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
private static final List<String> TIKA_SUPPORTED_TYPES
= new Tika().getParser().getSupportedTypes(new ParseContext())
.parallelStream()
.stream()
.map(mt -> mt.getType() + "/" + mt.getSubtype())
.collect(Collectors.toList());
@Override
void logWarning(final String msg, Exception ex) {
public void logWarning(final String msg, Exception ex) {
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
super.logWarning(msg, ex);
logger.log(Level.WARNING, msg, ex); //NON-NLS
}
@Override
Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
public Reader getReader(AbstractFile sourceFile) throws IngesterException, MissingResourceException {
ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
Metadata metadata = new Metadata();
//Parse the file in a task, a convenient way to have a timeout...
final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
@ -125,13 +128,9 @@ class TikaTextExtractor extends FileTextExtractor {
return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
}
@Override
InputStream getInputStream(AbstractFile sourceFile1) {
return new ReadContentInputStream(sourceFile1);
}
@Override
boolean isDisabled() {
public boolean isDisabled() {
return false;
}
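The Tika path wraps parsing in a single-thread executor purely to impose a timeout so a hung parser cannot stall ingest. A stripped-down sketch of that pattern using the same Tika call as above; the timeout handling and value are illustrative, not the extractor's exact code.

    import java.io.InputStream;
    import java.io.Reader;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;
    import org.apache.tika.Tika;
    import org.apache.tika.metadata.Metadata;

    class TikaTimeoutExample {
        private static final ExecutorService PARSE_EXECUTOR = Executors.newSingleThreadExecutor();

        // Parses a stream with Tika, giving up after a fixed timeout rather than blocking indefinitely.
        static Reader parseWithTimeout(InputStream stream, long timeoutSeconds) throws Exception {
            Metadata metadata = new Metadata();
            Future<Reader> future = PARSE_EXECUTOR.submit(() -> new Tika().parse(stream, metadata));
            return future.get(timeoutSeconds, TimeUnit.SECONDS); // throws TimeoutException if Tika hangs
        }
    }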