Commit 35bf21eefe
Merge branch 'develop' of https://github.com/sleuthkit/autopsy into 2197-ProfileOptionsPanel
@@ -19,6 +19,8 @@
package org.sleuthkit.autopsy.datamodel;

import java.awt.event.ActionEvent;
import java.beans.PropertyChangeEvent;
import java.beans.PropertyChangeListener;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
@@ -27,6 +29,7 @@ import java.util.List;
import java.util.logging.Level;
import javax.swing.AbstractAction;
import javax.swing.Action;
import org.openide.nodes.Children;
import org.openide.nodes.Sheet;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
@@ -35,11 +38,14 @@ import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.directorytree.ExplorerNodeActionVisitor;
import org.sleuthkit.autopsy.directorytree.FileSearchAction;
import org.sleuthkit.autopsy.directorytree.NewWindowViewAction;
import org.sleuthkit.autopsy.ingest.IngestManager;
import org.sleuthkit.autopsy.ingest.ModuleContentEvent;
import org.sleuthkit.autopsy.ingest.RunIngestModulesDialog;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.Image;
import org.sleuthkit.datamodel.SleuthkitCase.CaseDbQuery;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.VirtualDirectory;

/**
 * This class is used to represent the "Node" for the image. The children of
@@ -71,6 +77,16 @@ public class ImageNode extends AbstractContentNode<Image> {
        String imgName = nameForImage(img);
        this.setDisplayName(imgName);
        this.setIconBaseWithExtension("org/sleuthkit/autopsy/images/hard-drive-icon.jpg"); //NON-NLS

        // Listen for ingest events so that we can detect new added files (e.g. carved)
        IngestManager.getInstance().addIngestModuleEventListener(pcl);
        // Listen for case events so that we can detect when case is closed
        Case.addPropertyChangeListener(pcl);
    }

    private void removeListeners() {
        IngestManager.getInstance().removeIngestModuleEventListener(pcl);
        Case.removePropertyChangeListener(pcl);
    }

    /**
@@ -199,4 +215,46 @@ public class ImageNode extends AbstractContentNode<Image> {
    public String getItemType() {
        return getClass().getName();
    }

    private final PropertyChangeListener pcl = (PropertyChangeEvent evt) -> {
        String eventType = evt.getPropertyName();

        // See if the new file is a child of ours
        if (eventType.equals(IngestManager.IngestModuleEvent.CONTENT_CHANGED.toString())) {
            if ((evt.getOldValue() instanceof ModuleContentEvent) == false) {
                return;
            }
            ModuleContentEvent moduleContentEvent = (ModuleContentEvent) evt.getOldValue();
            if ((moduleContentEvent.getSource() instanceof Content) == false) {
                return;
            }
            Content newContent = (Content) moduleContentEvent.getSource();

            try {
                Content parent = newContent.getParent();
                if (parent != null) {
                    // Is this a new carved file?
                    if (parent.getName().equals(VirtualDirectory.NAME_CARVED)) {
                        // Was this new carved file produced from this image?
                        if (parent.getParent().getId() == getContent().getId()) {
                            Children children = getChildren();
                            if (children != null) {
                                ((ContentChildren) children).refreshChildren();
                                children.getNodesCount();
                            }
                        }
                    }
                }
            } catch (TskCoreException ex) {
                // Do nothing.
            }
        } else if (eventType.equals(Case.Events.CURRENT_CASE.toString())) {
            if (evt.getNewValue() == null) {
                // case was closed. Remove listeners so that we don't get called with a stale case handle
                removeListeners();
            }
        }
    };

}
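For readers unfamiliar with the listener wiring added to ImageNode above, the following is a minimal standalone sketch of the same pattern using only java.beans. The class name ListenerDemo and the string property names are hypothetical stand-ins, not the Autopsy IngestManager/Case API: one PropertyChangeListener is registered, dispatches on the property name, and unregisters itself when it sees the "case closed" style event.

import java.beans.PropertyChangeEvent;
import java.beans.PropertyChangeListener;
import java.beans.PropertyChangeSupport;

// Standalone illustration of the pattern the node uses: one listener, registered with an
// event source, that switches on the property name and removes itself on a "closed" event.
public class ListenerDemo {
    public static void main(String[] args) {
        PropertyChangeSupport events = new PropertyChangeSupport(new Object());

        PropertyChangeListener pcl = new PropertyChangeListener() {
            @Override
            public void propertyChange(PropertyChangeEvent evt) {
                if ("CONTENT_CHANGED".equals(evt.getPropertyName())) {
                    System.out.println("refresh children for: " + evt.getNewValue());
                } else if ("CURRENT_CASE".equals(evt.getPropertyName()) && evt.getNewValue() == null) {
                    // the "case" closed: drop the listener so no stale handle is used later
                    events.removePropertyChangeListener(this);
                    System.out.println("listener removed");
                }
            }
        };

        events.addPropertyChangeListener(pcl);
        events.firePropertyChange("CONTENT_CHANGED", null, "carved-file-1");
        events.firePropertyChange("CURRENT_CASE", "case-1", null);
        events.firePropertyChange("CONTENT_CHANGED", null, "ignored"); // no output: listener is gone
    }
}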
@@ -82,7 +82,13 @@ ConfirmationDialog.Exit=Exit
ConfirmationDialog.DoNotExit=Do Not Exit
ConfirmationDialog.ConfirmExit=All incomplete copy jobs will be cancelled. Are you sure?
ConfirmationDialog.ConfirmExitHeader=Confirm Exit
OpenIDE-Module-Long-Description=\
    This module contains features that are being developed by Basis Technology and are not part of the default Autopsy distribution. \
    You can enable this module to use the new features. \
    The features should be stable, but their exact behavior and API are subject to change. \n\n\
    We make no guarantee that the API of this module will not change, so developers should be careful when relying on it.
OpenIDE-Module-Name=Experimental
OpenIDE-Module-Short-Description=This module contains features that are being developed by Basis Technology and are not part of the default Autopsy distribution.
ReviewModeCasePanel.bnRefresh.text=&Refresh
ReviewModeCasePanel.bnOpen.text=&Open
ReviewModeCasePanel.rbGroupLabel.text=Show Last 10:
@@ -526,6 +526,7 @@
    <!-- file chunk-specific fields (optional for others) -->
    <!-- for a parent file with no content, number of chunks are specified -->
    <field name="num_chunks" type="int" indexed="true" stored="true" required="false" />
    <field name="chunk_size" type="int" indexed="true" stored="true" required="false" />

    <!-- Common metadata fields, named specifically to match up with
         SolrCell metadata when parsing rich documents such as Word, PDF.
@@ -38,7 +38,7 @@ import org.sleuthkit.datamodel.TskCoreException;
 * Extracts text from artifacts by concatenating the values of all of the
 * artifact's attributes.
 */
public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
    static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());

    /**
@@ -82,13 +82,16 @@ public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
    }

    @Override
    boolean isDisabled() {
    public boolean isDisabled() {
        return false;
    }

    @Override
    public void logWarning(final String msg, Exception ex) {
        logger.log(Level.WARNING, msg, ex); //NON-NLS }
    }


    @Override
    InputStream getInputStream(BlackboardArtifact artifact) {
    private InputStream getInputStream(BlackboardArtifact artifact) {
        // Concatenate the string values of all attributes into a single
        // "content" string to be indexed.
        StringBuilder artifactContents = new StringBuilder();
@@ -127,17 +130,17 @@ public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
    }

    @Override
    Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
        return new InputStreamReader(stream, StandardCharsets.UTF_8);
    public Reader getReader(BlackboardArtifact source) throws Ingester.IngesterException {
        return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
    }

    @Override
    long getID(BlackboardArtifact source) {
    public long getID(BlackboardArtifact source) {
        return source.getArtifactID();
    }

    @Override
    String getName(BlackboardArtifact source) {
    public String getName(BlackboardArtifact source) {
        return source.getDisplayName() + "_" + source.getArtifactID();
    }
}
@@ -18,7 +18,6 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.InputStream;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;
@@ -28,7 +27,7 @@ import org.sleuthkit.datamodel.AbstractFile;
 * Common methods for utilities that extract text and content and divide into
 * chunks
 */
abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
abstract class FileTextExtractor implements TextExtractor< AbstractFile> {


    static final List<String> BLOB_MIME_TYPES
@@ -96,17 +95,16 @@ abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
    abstract boolean isSupported(AbstractFile file, String detectedFormat);

    @Override
    abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;
    public abstract Reader getReader(AbstractFile source) throws Ingester.IngesterException;

    @Override
    long getID(AbstractFile source) {
    public long getID(AbstractFile source) {
        return source.getId();
    }


    @Override
    String getName(AbstractFile source) {
    public String getName(AbstractFile source) {
        return source.getName();
    }

}
@@ -19,26 +19,28 @@
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;

/**
 * Extractor of text from HTML supported AbstractFile content. Extracted text
 * will be * divided into chunks and indexed with Solr. If HTML extraction succeeds,
 * chunks are indexed with Solr.
 * will be * divided into chunks and indexed with Solr. If HTML extraction
 * succeeds, chunks are indexed with Solr.
 */
class HtmlTextExtractor extends FileTextExtractor {

    static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
    private static final int MAX_SIZE = 50_000_000; //50MB

    static final List<String> WEB_MIME_TYPES = Arrays.asList(
@@ -63,7 +65,9 @@ class HtmlTextExtractor extends FileTextExtractor {
    }

    @Override
    Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
    public Reader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
        ReadContentInputStream stream = new ReadContentInputStream(sourceFile);

        //Parse the stream with Jericho and put the results in a Reader
        try {
            StringBuilder scripts = new StringBuilder();
@@ -77,7 +81,7 @@ class HtmlTextExtractor extends FileTextExtractor {
            int numComments = 0;
            int numOthers = 0;

            Source source = new Source(in);
            Source source = new Source(stream);
            source.fullSequentialParse();
            Renderer renderer = source.getRenderer();
            renderer.setNewLine("\n");
@@ -160,12 +164,11 @@ class HtmlTextExtractor extends FileTextExtractor {
    }

    @Override
    InputStream getInputStream(AbstractFile sourceFile1) {
        return new ReadContentInputStream(sourceFile1);
    }

    @Override
    boolean isDisabled() {
    public boolean isDisabled() {
        return false;
    }

    public void logWarning(final String msg, Exception ex) {
        logger.log(Level.WARNING, msg, ex); //NON-NLS }
    }
}
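The Jericho calls relied on in getReader above (Source, fullSequentialParse, getRenderer, setNewLine) can be exercised outside Autopsy. Below is a small sketch assuming only the jericho-html library, with an in-memory HTML string standing in for the ReadContentInputStream over an AbstractFile; JerichoDemo is a hypothetical class name.

import java.io.StringReader;
import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;

// Standalone sketch of the Jericho parse-and-render steps used by the extractor.
public class JerichoDemo {
    public static void main(String[] args) throws Exception {
        String html = "<html><body><h1>Title</h1><p>Hello <b>world</b></p></body></html>";
        Source source = new Source(new StringReader(html));
        source.fullSequentialParse();            // parse the whole document up front
        Renderer renderer = source.getRenderer();
        renderer.setNewLine("\n");               // normalize line endings, as in the diff
        System.out.println(renderer.toString()); // plain-text rendering of the HTML
    }
}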
@@ -18,12 +18,15 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

import com.google.common.base.Utf8;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.logging.Level;
import javax.annotation.concurrent.NotThreadSafe;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrInputDocument;
import org.openide.util.NbBundle;
@@ -55,9 +58,7 @@ class Ingester {
    private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
    private static Ingester instance;

    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
    private static final int SINGLE_READ_CHARS = 1024;
    private static final int EXTRA_CHARS = 128;
    private static final int SINGLE_READ_CHARS = 512;

    private Ingester() {
    }
@@ -121,6 +122,136 @@ class Ingester {
        return item.accept(SOLR_FIELDS_VISITOR);
    }

    /**
     * Use the given TextExtractor to extract text from the given source. The
     * text will be chunked and each chunk passed to Solr to add to the index.
     *
     *
     * @param <A>       The type of the Appendix provider that provides
     *                  additional text to append to the final chunk.
     * @param <T>       A subclass of SleuthkitVisitableItem.
     * @param extractor The TextExtractor that will be used to extract text from
     *                  the given source.
     * @param source    The source from which text will be extracted, chunked,
     *                  and indexed.
     * @param context   The ingest job context that can be used to cancel this
     *                  process.
     *
     * @return True if this method executed normally, or False if there was an
     *         unexpected exception. //JMTODO: This policy needs to be reviewed.
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    < T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
        final long sourceID = extractor.getID(source);
        final String sourceName = extractor.getName(source);

        int numChunks = 0; //unknown until chunking is done

        if (extractor.isDisabled()) {
            /* some Extractors, notably the strings extractor, have options which
             * can be configured such that no extraction should be done */
            return true;
        }

        Map<String, String> fields = getContentFields(source);
        //Get a reader for the content of the given source
        try (BufferedReader reader = new BufferedReader(extractor.getReader(source));) {
            Chunker chunker = new Chunker(reader);
            for (Chunk chunk : chunker) {
                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                fields.put(Server.Schema.ID.toString(), chunkId);
                fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
                try {
                    //add the chunk text to Solr index
                    indexChunk(chunk.toString(), sourceName, fields);
                    numChunks++;
                } catch (Ingester.IngesterException ingEx) {
                    extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS

                    throw ingEx; //need to rethrow to signal error and move on
                } catch (Exception ex) {
                    throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
                }
            }
        } catch (IOException ex) {
            extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } catch (Exception ex) {
            extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } finally {
            //after all chunks, index just the meta data, including the numChunks, of the parent file
            fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
            fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
            indexChunk(null, sourceName, fields);
        }

        return true;
    }

    /**
     * Add one chunk to the Solr index as a separate Solr document.
     *
     * TODO see if can use a byte or string streaming way to add content to
     * /update handler e.g. with XMLUpdateRequestHandler (deprecated in Solr
     * 4.0.0), see if possible to stream with UpdateRequestHandler
     *
     * @param chunk The chunk content as a string
     * @param fields
     * @param size
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
        if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
            //JMTODO: actually if we couldn't get the image id it is set to -1,
            // but does this really mean we don't want to index it?

            //skip the file, image id unknown
            //JMTODO: does this need to be internationalized?
            String msg = NbBundle.getMessage(Ingester.class,
                    "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to be internationalized?
            logger.log(Level.SEVERE, msg);
            throw new IngesterException(msg);
        }

        //Make a SolrInputDocument out of the field map
        SolrInputDocument updateDoc = new SolrInputDocument();
        for (String key : fields.keySet()) {
            updateDoc.addField(key, fields.get(key));
        }
        //add the content to the SolrInputDocument
        //JMTODO: can we just add it to the field map before passing that in?
        updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);

        try {
            //TODO: consider timeout thread, or vary socket timeout based on size of indexed content
            solrServer.addDocument(updateDoc);
            uncommitedIngests = true;

        } catch (KeywordSearchModuleException ex) {
            //JMTODO: does this need to be internationalized?
            throw new IngesterException(
                    NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
        }
    }

    /**
     * Tells Solr to commit (necessary before ingested files will appear in
     * searches)
     */
    void commit() {
        try {
            solrServer.commit();
            uncommitedIngests = false;
        } catch (NoOpenCoreException | SolrServerException ex) {
            logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS

        }
    }

    /**
     * Visitor used to create fields to send to SOLR index.
     */
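A minimal sketch of the field-map-to-document step that indexChunk performs, assuming only SolrJ on the classpath. The num_chunks and chunk_size field names come from the schema hunk earlier in this diff; the "id" value, the "content" field name, and the ChunkDocDemo class name are illustrative placeholders, and Autopsy's own Server/solrServer wrapper is not shown.

import java.util.HashMap;
import java.util.Map;
import org.apache.solr.common.SolrInputDocument;

// Copy a metadata field map into a SolrInputDocument and attach the chunk text,
// mirroring the shape of indexChunk above.
public class ChunkDocDemo {
    public static void main(String[] args) {
        Map<String, String> fields = new HashMap<>();
        fields.put("id", "12345_1");      // illustrative; the real value comes from Server.getChunkIdString
        fields.put("num_chunks", "1");
        fields.put("chunk_size", "27");

        SolrInputDocument updateDoc = new SolrInputDocument();
        for (Map.Entry<String, String> entry : fields.entrySet()) {
            updateDoc.addField(entry.getKey(), entry.getValue());
        }
        updateDoc.addField("content", "the extracted chunk text..."); // field name illustrative

        // In the diff this document is handed to Autopsy's Solr wrapper
        // (solrServer.addDocument(updateDoc)) and committed later.
        System.out.println(updateDoc);
    }
}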
@@ -222,192 +353,6 @@ class Ingester {
        }
    }

    /**
     * Use the given TextExtractor to extract text from the given source. The
     * text will be chunked and each chunk passed to Solr to add to the index.
     *
     *
     * @param <A>       The type of the Appendix provider that provides
     *                  additional text to append to the final chunk.
     * @param <T>       A subclass of SleuthkitVisitableItem.
     * @param extractor The TextExtractor that will be used to extract text from
     *                  the given source.
     * @param source    The source from which text will be extracted, chunked,
     *                  and indexed.
     * @param context   The ingest job context that can be used to cancel this
     *                  process.
     *
     * @return True if this method executed normally, or False if there was an
     *         unexpected exception. //JMTODO: This policy needs to be reviewed.
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    < T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
        final long sourceID = extractor.getID(source);
        final String sourceName = extractor.getName(source);

        int numChunks = 0; //unknown until chunking is done

        if (extractor.isDisabled()) {
            /* some Extractors, notably the strings extractor, have options which
             * can be configured such that no extraction should be done */
            return true;
        }

        Map<String, String> fields = getContentFields(source);
        //Get a stream and a reader for that stream
        try (final InputStream stream = extractor.getInputStream(source);
                Reader reader = extractor.getReader(stream, source);) {

            //we read max 1024 chars at time, this seems to max what some Readers would return
            char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];

            boolean eof = false; //have we read until the end of the file yet
            while (!eof) {
                int chunkSizeInChars = 0; // the size in chars of the chunk (so far)
                if (context != null && context.fileIngestIsCancelled()) {
                    return true;
                }
                long charsRead = 0; // number of chars read in the most recent read operation
                //consume bytes to fill entire chunk (but leave EXTRA_CHARS to end the word)
                while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
                        && (charsRead = reader.read(textChunkBuf, chunkSizeInChars, SINGLE_READ_CHARS)) != -1) {
                    chunkSizeInChars += charsRead;
                }

                if (charsRead == -1) {
                    //this is the last chunk
                    eof = true;
                } else {
                    chunkSizeInChars += charsRead;

                    //if we haven't reached the end of the file,
                    //try to read char-by-char until whitespace to not break words
                    while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - 1)
                            && (Character.isWhitespace(textChunkBuf[chunkSizeInChars - 1]) == false)
                            && (charsRead = reader.read(textChunkBuf, chunkSizeInChars, 1)) != -1) {
                        chunkSizeInChars += charsRead;
                    }
                    if (charsRead == -1) {
                        //this is the last chunk
                        eof = true;
                    }
                }

                StringBuilder sb = new StringBuilder(chunkSizeInChars)
                        .append(textChunkBuf, 0, chunkSizeInChars);

                sanitizeToUTF8(sb); //replace non UTF8 chars with '^'

                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                fields.put(Server.Schema.ID.toString(), chunkId);
                try {
                    //pass the chunk to method that adds it to Solr index
                    indexChunk(sb.toString(), sourceName, fields);
                    numChunks++;
                } catch (Ingester.IngesterException ingEx) {
                    extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS

                    throw ingEx; //need to rethrow to signal error and move on
                } catch (Exception ex) {
                    throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
                }
            }
        } catch (IOException ex) {
            extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } catch (Exception ex) {
            extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } finally {
            //after all chunks, index just the meta data, including the numChunks, of the parent file
            fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
            fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
            indexChunk(null, sourceName, fields);
        }
        return true;
    }

    /**
     * Sanitize the given StringBuilder by replacing non-UTF-8 characters with
     * caret '^'
     *
     * @param sb the StringBuilder to sanitize
     *
     * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
     * function?
     */
    private static void sanitizeToUTF8(StringBuilder sb) {
        final int length = sb.length();

        // Sanitize by replacing non-UTF-8 characters with caret '^'
        for (int i = 0; i < length; i++) {
            if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
                sb.replace(i, i + 1, "^");
            }
        }
    }

    /**
     * Add one chunk to the Solr index as a separate Solr document.
     *
     * TODO see if can use a byte or string streaming way to add content to
     * /update handler e.g. with XMLUpdateRequestHandler (deprecated in Solr
     * 4.0.0), see if possible to stream with UpdateRequestHandler
     *
     * @param chunk The chunk content as a string
     * @param fields
     * @param size
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
        if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
            //JMTODO: actually if we couldn't get the image id it is set to -1,
            // but does this really mean we don't want to index it?

            //skip the file, image id unknown
            //JMTODO: does this need to be internationalized?
            String msg = NbBundle.getMessage(Ingester.class,
                    "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to be internationalized?
            logger.log(Level.SEVERE, msg);
            throw new IngesterException(msg);
        }

        //Make a SolrInputDocument out of the field map
        SolrInputDocument updateDoc = new SolrInputDocument();
        for (String key : fields.keySet()) {
            updateDoc.addField(key, fields.get(key));
        }
        //add the content to the SolrInputDocument
        //JMTODO: can we just add it to the field map before passing that in?
        updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);

        try {
            //TODO: consider timeout thread, or vary socket timeout based on size of indexed content
            solrServer.addDocument(updateDoc);
            uncommitedIngests = true;
        } catch (KeywordSearchModuleException ex) {
            //JMTODO: does this need to be internationalized?
            throw new IngesterException(
                    NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
        }
    }

    /**
     * Tells Solr to commit (necessary before ingested files will appear in
     * searches)
     */
    void commit() {
        try {
            solrServer.commit();
            uncommitedIngests = false;
        } catch (NoOpenCoreException | SolrServerException ex) {
            logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
        }
    }

    /**
     * Indicates that there was an error with the specific ingest operation, but
     * it's still okay to continue ingesting files.
@@ -425,3 +370,211 @@ class Ingester {
        }
    }
}

/**
 * Encapsulates the content chunking algorithm in an implementation of the
 * Iterator interface. Also implements Iterable so it can be used directly in a
 * for loop. The base chunk is the part of the chunk before the overlapping
 * window. The window will be included at the end of the current chunk as well
 * as at the beginning of the next chunk.
 */
@NotThreadSafe
class Chunker implements Iterator<Chunk>, Iterable<Chunk> {

    //Chunking algorithm parameters-------------------------------------//
    /** the maximum size of a chunk, including the window. */
    private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes
    /** the minimum to read before we start the process of looking for
     * whitespace to break at and creating an overlapping window. */
    private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
    /** The maximum size of the chunk, before the overlapping window, even if we
     * couldn't find whitespace to break at. */
    private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
    /** The amount of text we will read through before we give up on finding
     * whitespace to break the chunk/window at. */
    private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
    /** The number of characters to read in one go from the Reader. */
    private static final int READ_CHARS_BUFFER_SIZE = 512; //chars

    ////chunker state--------------------------------------------///
    /** The Reader that this chunker reads from, and divides into chunks. It must
     * be a buffered reader to ensure that mark/reset are supported. */
    private final BufferedReader reader;
    /** The local buffer of characters read from the Reader. */
    private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
    /** number of chars read in the most recent read operation. */
    private int charsRead = 0;

    /** The text of the current chunk (so far). */
    private StringBuilder currentChunk;
    /** the size in bytes of the chunk (so far). */
    private int chunkSizeBytes = 0;
    /** the size in chars of the (base) chunk (so far). */
    private int baseChunkSizeChars;

    /** has the chunker found whitespace to break on? */
    private boolean whitespaceFound = false;
    /** has the chunker reached the end of the Reader? If so, there are no more
     * chunks, and the current chunk does not need a window. */
    private boolean endOfReaderReached = false;

    /**
     * Create a Chunker that will chunk the content of the given Reader.
     *
     * @param reader The content to chunk.
     */
    Chunker(BufferedReader reader) {
        this.reader = reader;
    }

    @Override
    public Iterator<Chunk> iterator() {
        return this;
    }

    @Override
    public boolean hasNext() {
        return endOfReaderReached == false;
    }

    /**
     * Sanitize the given StringBuilder by replacing non-UTF-8 characters with
     * caret '^'
     *
     * @param sb the StringBuilder to sanitize
     *
     * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
     * function?
     */
    private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
        final int length = sb.length();
        for (int i = 0; i < length; i++) {
            if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
                sb.replace(i, i + 1, "^");
            }
        }
        return sb;
    }

    @Override
    public Chunk next() {
        if (endOfReaderReached) {
            throw new NoSuchElementException("There are no more chunks.");
        }
        //reset state for the next chunk
        currentChunk = new StringBuilder();
        chunkSizeBytes = 0;
        baseChunkSizeChars = 0;

        try {
            readBaseChunk();
            baseChunkSizeChars = currentChunk.length();
            reader.mark(2048); //mark the reader so we can rewind the reader here to begin the next chunk
            readWindow();
        } catch (IOException ioEx) {
            throw new RuntimeException("IOException while reading chunk.", ioEx);
        }
        try {
            reader.reset(); //reset the reader so the next chunk can begin at the position marked above
        } catch (IOException ex) {
            throw new RuntimeException("IOException while resetting chunk reader.", ex);
        }

        if (endOfReaderReached) {
            /* if we have reached the end of the content, we won't make another
             * overlapping chunk, so the base chunk can be extended to the end. */
            baseChunkSizeChars = currentChunk.length();
        }
        //sanitize the text and return a Chunk object, that includes the base chunk length.
        return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars);
    }

    /**
     * Read the base chunk from the reader, and attempt to break at whitespace.
     *
     * @throws IOException if there is a problem reading from the reader.
     */
    private void readBaseChunk() throws IOException {
        //read the chunk until the minimum base chunk size
        readHelper(MINIMUM_BASE_CHUNK_SIZE, false);
        //keep reading until the maximum base chunk size or white space is reached.
        whitespaceFound = false;
        readHelper(MAXIMUM_BASE_CHUNK_SIZE, true);

    }

    /**
     * Read the window from the reader, and attempt to break at whitespace.
     *
     * @throws IOException if there is a problem reading from the reader.
     */
    private void readWindow() throws IOException {
        //read the window, leaving some room to look for white space to break at.
        int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, chunkSizeBytes + 1024);
        readHelper(windowEnd, false);
        whitespaceFound = false;
        //keep reading until the max chunk size, or until whitespace is reached.
        windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE, chunkSizeBytes + 1024);
        readHelper(windowEnd, true);
    }

    /** Helper method that implements reading in a loop.
     *
     * @param maxBytes           The max cumulative length of the content, in
     *                           bytes, to read from the Reader. That is, when
     *                           chunkSizeBytes >= maxBytes stop reading.
     * @param inWhiteSpaceBuffer Should the current read stop once whitespace is
     *                           found?
     *
     * @throws IOException If there is a problem reading from the Reader.
     */
    private void readHelper(int maxBytes, boolean inWhiteSpaceBuffer) throws IOException {
        //only read one character at a time if we are looking for whitespace.
        final int readSize = inWhiteSpaceBuffer ? 1 : READ_CHARS_BUFFER_SIZE;

        //read chars up to maxBytes, whitespaceFound if also inWhiteSpaceBuffer, or we reach the end of the reader.
        while ((chunkSizeBytes < maxBytes)
                && (false == (inWhiteSpaceBuffer && whitespaceFound))
                && (endOfReaderReached == false)) {
            charsRead = reader.read(tempChunkBuf, 0, readSize);
            if (-1 == charsRead) {
                //this is the last chunk
                endOfReaderReached = true;
            } else {
                if (inWhiteSpaceBuffer) {
                    //check for whitespace.
                    whitespaceFound = Character.isWhitespace(tempChunkBuf[0]);
                }

                //add read chars to the chunk and update the length.
                String chunkSegment = new String(tempChunkBuf, 0, charsRead);
                chunkSizeBytes += Utf8.encodedLength(chunkSegment);
                currentChunk.append(chunkSegment);
            }
        }
    }
}

/**
 * Represents one chunk as the text in it and the length of the base chunk, in
 * chars.
 */
class Chunk {

    private final StringBuilder sb;
    private final int chunksize;

    Chunk(StringBuilder sb, int baseChunkLength) {
        this.sb = sb;
        this.chunksize = baseChunkLength;
    }

    @Override
    public String toString() {
        return sb.toString();
    }

    int getBaseChunkLength() {
        return chunksize;
    }
}
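To make the base-plus-overlapping-window idea concrete, here is a toy, self-contained sketch of the mark/reset technique Chunker.next() uses. The BASE_SIZE and WINDOW_SIZE constants and the OverlapDemo class are tiny hypothetical stand-ins for the production byte limits, and none of the whitespace or UTF-8 handling is reproduced.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;

// Each chunk is a "base" plus an overlapping "window"; the reader is rewound to the
// end of the base so the next chunk starts inside the window.
public class OverlapDemo {

    private static final int BASE_SIZE = 16;   // toy stand-in for the base chunk size
    private static final int WINDOW_SIZE = 8;  // toy stand-in for the overlapping window

    public static void main(String[] args) throws IOException {
        String text = "the quick brown fox jumps over the lazy dog and keeps on running";
        try (BufferedReader reader = new BufferedReader(new StringReader(text))) {
            char[] buf = new char[BASE_SIZE + WINDOW_SIZE];
            while (true) {
                int baseRead = reader.read(buf, 0, BASE_SIZE);            // read the base chunk
                if (baseRead == -1) {
                    break;                                                // nothing left to chunk
                }
                reader.mark(WINDOW_SIZE + 1);                             // remember where the next chunk should start
                int windowRead = reader.read(buf, baseRead, WINDOW_SIZE); // read the overlapping window
                int total = baseRead + Math.max(windowRead, 0);
                System.out.println("chunk: [" + new String(buf, 0, total) + "]");
                if (windowRead == -1) {
                    break;                                                // end of reader: no window, no next chunk
                }
                reader.reset();                                           // rewind so the window is re-read as the next base
            }
        }
    }
}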
@@ -149,6 +149,12 @@ public class Server {
                return "num_chunks"; //NON-NLS
            }
        },
        CHUNK_SIZE {
            @Override
            public String toString() {
                return "chunk_size"; //NON-NLS
            }
        }
    };

    public static final String HL_ANALYZE_CHARS_UNLIMITED = "500000"; //max 1MB in a chunk. use -1 for unlimited, but -1 option may not be supported (not documented)
@@ -25,6 +25,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
@@ -37,6 +38,8 @@ import org.sleuthkit.datamodel.TskException;
 */
class StringsTextExtractor extends FileTextExtractor {

    static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());

    /**
     * Options for this extractor
     */
@@ -91,7 +94,12 @@ class StringsTextExtractor extends FileTextExtractor {
    }

    @Override
    boolean isDisabled() {
    public void logWarning(final String msg, Exception ex) {
        logger.log(Level.WARNING, msg, ex); //NON-NLS }
    }

    @Override
    public boolean isDisabled() {
        boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
        boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));

@@ -99,11 +107,11 @@ class StringsTextExtractor extends FileTextExtractor {
    }

    @Override
    InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
    public InputStreamReader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
        InputStream stringStream = getInputStream(sourceFile);
        return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
    }

    @Override
    InputStream getInputStream(AbstractFile sourceFile) {
        //check which extract stream to use
        if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
@@ -18,10 +18,7 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.InputStream;
import java.io.Reader;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;

/**
@@ -31,9 +28,8 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 * @param <TextSource> The subtype of SleuthkitVisitableItem an implementation
 *                     is able to process.
 */
abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
interface TextExtractor< TextSource extends SleuthkitVisitableItem> {

    static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());

    /**
     * Is this extractor configured such that no extraction will/should be done?
@@ -48,18 +44,8 @@ abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
     * @param msg
     * @param ex
     */
    void logWarning(String msg, Exception ex) {
        logger.log(Level.WARNING, msg, ex); //NON-NLS }
    }
    abstract void logWarning(String msg, Exception ex);

    /**
     * Get an input stream over the content of the given source.
     *
     * @param source
     *
     * @return
     */
    abstract InputStream getInputStream(TextSource source);

    /**
     * Get a reader over the text extracted from the given source.
@@ -71,7 +57,7 @@ abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;
    abstract Reader getReader(TextSource source) throws Ingester.IngesterException;

    /**
     * Get the 'object' id of the given source.
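The shift above, from an abstract class with getInputStream plus getReader(stream, source) to an interface whose getReader(source) hides stream acquisition, can be illustrated in isolation. The SimpleTextExtractor and StringSourceExtractor names below are hypothetical, not the Autopsy types; this is a sketch of the design shape only.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

// Callers ask an extractor for a Reader over a source and never see the underlying stream.
interface SimpleTextExtractor<S> {
    Reader getReader(S source) throws IOException;
    String getName(S source);
}

class StringSourceExtractor implements SimpleTextExtractor<String> {
    @Override
    public Reader getReader(String source) {
        return new StringReader(source); // stream acquisition is an implementation detail
    }

    @Override
    public String getName(String source) {
        return "in-memory-string";
    }
}

public class ExtractorDemo {
    public static void main(String[] args) throws IOException {
        SimpleTextExtractor<String> extractor = new StringSourceExtractor();
        try (BufferedReader reader = new BufferedReader(extractor.getReader("hello extractor"))) {
            System.out.println(extractor.getName("hello extractor") + ": " + reader.readLine());
        }
    }
}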
@@ -20,7 +20,6 @@ package org.sleuthkit.autopsy.keywordsearch;

import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.List;
import java.util.MissingResourceException;
@@ -36,6 +35,7 @@ import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
@@ -51,22 +51,25 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 */
class TikaTextExtractor extends FileTextExtractor {

    static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
    private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();

    private static final List<String> TIKA_SUPPORTED_TYPES
            = new Tika().getParser().getSupportedTypes(new ParseContext())
                    .parallelStream()
                    .stream()
                    .map(mt -> mt.getType() + "/" + mt.getSubtype())
                    .collect(Collectors.toList());

    @Override
    void logWarning(final String msg, Exception ex) {
    public void logWarning(final String msg, Exception ex) {
        KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
        super.logWarning(msg, ex);
        logger.log(Level.WARNING, msg, ex); //NON-NLS }
    }

    @Override
    Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
    public Reader getReader(AbstractFile sourceFile) throws IngesterException, MissingResourceException {
        ReadContentInputStream stream = new ReadContentInputStream(sourceFile);

        Metadata metadata = new Metadata();
        //Parse the file in a task, a convenient way to have a timeout...
        final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
@@ -125,13 +128,9 @@ class TikaTextExtractor extends FileTextExtractor {
        return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
    }

    @Override
    InputStream getInputStream(AbstractFile sourceFile1) {
        return new ReadContentInputStream(sourceFile1);
    }

    @Override
    boolean isDisabled() {
    public boolean isDisabled() {
        return false;
    }

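The comment in getReader above notes that the parse runs in a task as "a convenient way to have a timeout". Below is a standalone sketch of that executor/Future pattern, with a sleeping Callable standing in for the Tika parse; TimeoutDemo and the timing values are hypothetical.

import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

// Submit the work to an executor and bound the wait on the Future; cancel if it runs long.
public class TimeoutDemo {
    public static void main(String[] args) throws Exception {
        ExecutorService executor = Executors.newSingleThreadExecutor();
        Callable<String> slowParse = () -> {
            Thread.sleep(2_000);            // stand-in for an expensive parse
            return "parsed text";
        };
        Future<String> future = executor.submit(slowParse);
        try {
            String result = future.get(500, TimeUnit.MILLISECONDS); // bound the wait
            System.out.println(result);
        } catch (TimeoutException ex) {
            future.cancel(true);            // interrupt the runaway task
            System.out.println("parse timed out; task cancelled");
        } finally {
            executor.shutdownNow();
        }
    }
}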