Mirror of https://github.com/overcuriousity/autopsy-flatpak.git, synced 2025-07-17 10:17:41 +00:00

Commit eb8422ca1e: Merge develop into search_improvements with 32K chunks

ArtifactTextExtractor.java

@@ -38,7 +38,7 @@ import org.sleuthkit.datamodel.TskCoreException;
 * Extracts text from artifacts by concatenating the values of all of the
 * artifact's attributes.
 */
-public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
+class ArtifactTextExtractor implements TextExtractor<BlackboardArtifact> {
 static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());

 /**

@@ -82,13 +82,16 @@ public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
 }

 @Override
-boolean isDisabled() {
+public boolean isDisabled() {
 return false;
 }
+
+@Override
+public void logWarning(final String msg, Exception ex) {
+logger.log(Level.WARNING, msg, ex); //NON-NLS }
+}

-@Override
-InputStream getInputStream(BlackboardArtifact artifact) {
+private InputStream getInputStream(BlackboardArtifact artifact) {
 // Concatenate the string values of all attributes into a single
 // "content" string to be indexed.
 StringBuilder artifactContents = new StringBuilder();

@@ -127,17 +130,17 @@ public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
 }

 @Override
-Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
+public Reader getReader(BlackboardArtifact source) throws Ingester.IngesterException {
-return new InputStreamReader(stream, StandardCharsets.UTF_8);
+return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
 }

 @Override
-long getID(BlackboardArtifact source) {
+public long getID(BlackboardArtifact source) {
 return source.getArtifactID();
 }

 @Override
-String getName(BlackboardArtifact source) {
+public String getName(BlackboardArtifact source) {
 return source.getDisplayName() + "_" + source.getArtifactID();
 }
 }

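A note on the API change visible in this hunk: getReader() no longer takes a caller-supplied InputStream; the extractor now opens its own stream internally (here via getInputStream(source)). A minimal caller-side sketch of the new contract (the artifact variable and the plain read loop are illustrative only, not part of this commit):

    ArtifactTextExtractor extractor = new ArtifactTextExtractor();
    try (Reader reader = extractor.getReader(artifact)) { // stream creation now happens inside the extractor
        char[] buf = new char[1024];
        int n;
        while ((n = reader.read(buf)) != -1) {
            // consume the extracted text, e.g. feed it to the Chunker shown in Ingester.java below
        }
    } catch (Ingester.IngesterException | IOException ex) {
        // extraction or read failure; real callers report this via the extractor's logWarning()
    }
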
FileTextExtractor.java

@@ -18,7 +18,6 @@
 */
 package org.sleuthkit.autopsy.keywordsearch;

-import java.io.InputStream;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.List;

@@ -28,7 +27,7 @@ import org.sleuthkit.datamodel.AbstractFile;
 * Common methods for utilities that extract text and content and divide into
 * chunks
 */
-abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
+abstract class FileTextExtractor implements TextExtractor< AbstractFile> {


 static final List<String> BLOB_MIME_TYPES

@@ -96,17 +95,16 @@ abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
 abstract boolean isSupported(AbstractFile file, String detectedFormat);

 @Override
-abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;
+public abstract Reader getReader(AbstractFile source) throws Ingester.IngesterException;

 @Override
-long getID(AbstractFile source) {
+public long getID(AbstractFile source) {
 return source.getId();
 }

-
 @Override
-String getName(AbstractFile source) {
+public String getName(AbstractFile source) {
 return source.getName();
 }

 }

HtmlTextExtractor.java

@@ -19,16 +19,17 @@
 package org.sleuthkit.autopsy.keywordsearch;

 import java.io.IOException;
-import java.io.InputStream;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 import java.util.List;
+import java.util.logging.Level;
 import net.htmlparser.jericho.Attributes;
 import net.htmlparser.jericho.Renderer;
 import net.htmlparser.jericho.Source;
 import net.htmlparser.jericho.StartTag;
 import net.htmlparser.jericho.StartTagType;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;

@@ -37,6 +38,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 */
 class HtmlTextExtractor extends FileTextExtractor {

+static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
 private static final int MAX_SIZE = 50_000_000; //50MB

 static final List<String> WEB_MIME_TYPES = Arrays.asList(

@@ -61,7 +63,9 @@ class HtmlTextExtractor extends FileTextExtractor {
 }

 @Override
-Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
+public Reader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
+ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
+
 //Parse the stream with Jericho and put the results in a Reader
 try {
 StringBuilder scripts = new StringBuilder();

@@ -75,7 +79,7 @@ class HtmlTextExtractor extends FileTextExtractor {
 int numComments = 0;
 int numOthers = 0;

-Source source = new Source(in);
+Source source = new Source(stream);
 source.fullSequentialParse();
 Renderer renderer = source.getRenderer();
 renderer.setNewLine("\n");

@@ -158,12 +162,11 @@ class HtmlTextExtractor extends FileTextExtractor {
 }

 @Override
-InputStream getInputStream(AbstractFile sourceFile1) {
-return new ReadContentInputStream(sourceFile1);
-}
-
-@Override
-boolean isDisabled() {
+public boolean isDisabled() {
 return false;
 }

+public void logWarning(final String msg, Exception ex) {
+logger.log(Level.WARNING, msg, ex); //NON-NLS }
+}
 }

Ingester.java

@@ -18,11 +18,13 @@
 */
 package org.sleuthkit.autopsy.keywordsearch;

+import com.google.common.base.Utf8;
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.Reader;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.Map;
+import java.util.NoSuchElementException;
 import java.util.logging.Level;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.common.SolrInputDocument;

@@ -54,9 +56,6 @@ class Ingester {
 private final Server solrServer = KeywordSearch.getServer();
 private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
 private static Ingester instance;
-private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
-private static final int SINGLE_READ_CHARS = 1024;
-private static final int EXTRA_CHARS = 128;

 private Ingester() {
 }

@@ -120,6 +119,136 @@ class Ingester {
 return item.accept(SOLR_FIELDS_VISITOR);
 }

+/**
+* Use the given TextExtractor to extract text from the given source. The
+* text will be chunked and each chunk passed to Solr to add to the index.
+*
+*
+* @param <A> The type of the Appendix provider that provides
+* additional text to append to the final chunk.
+* @param <T> A subclass of SleuthkitVisibleItem.
+* @param extractor The TextExtractor that will be used to extract text from
+* the given source.
+* @param source The source from which text will be extracted, chunked,
+* and indexed.
+* @param context The ingest job context that can be used to cancel this
+* process.
+*
+* @return True if this method executed normally. or False if there was an
+* unexpected exception. //JMTODO: This policy needs to be reviewed.
+*
+* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
+*/
+< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
+final long sourceID = extractor.getID(source);
+final String sourceName = extractor.getName(source);
+
+int numChunks = 0; //unknown until chunking is done
+
+if (extractor.isDisabled()) {
+/* some Extrctors, notable the strings extractor, have options which
+* can be configured such that no extraction should be done */
+return true;
+}
+
+Map<String, String> fields = getContentFields(source);
+//Get a reader for the content of the given source
+try (Reader reader = extractor.getReader(source);) {
+Chunker chunker = new Chunker(reader);
+
+for (Chunk chunk : chunker) {
+String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
+fields.put(Server.Schema.ID.toString(), chunkId);
+try {
+//add the chunk text to Solr index
+indexChunk(chunk.getText().toString(), sourceName, fields);
+numChunks++;
+} catch (Ingester.IngesterException ingEx) {
+extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
++ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
+
+throw ingEx; //need to rethrow to signal error and move on
+} catch (Exception ex) {
+throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
+}
+}
+} catch (IOException ex) {
+extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
+return false;
+} catch (Exception ex) {
+extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
+return false;
+} finally {
+//after all chunks, index just the meta data, including the numChunks, of the parent file
+fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
+fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
+indexChunk(null, sourceName, fields);
+}
+
+return true;
+}
+
+/**
+* Add one chunk as to the Solr index as a seperate sold document.
+*
+* TODO see if can use a byte or string streaming way to add content to
+* /update handler e.g. with XMLUpdateRequestHandler (deprecated in SOlr
+* 4.0.0), see if possible to stream with UpdateRequestHandler
+*
+* @param chunk The chunk content as a string
+* @param fields
+* @param size
+*
+* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
+*/
+private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
+if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
+//JMTODO: actually if the we couldn't get the image id it is set to -1,
+// but does this really mean we don't want to index it?
+
+//skip the file, image id unknown
+//JMTODO: does this need to ne internationalized?
+String msg = NbBundle.getMessage(Ingester.class,
+"Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
+logger.log(Level.SEVERE, msg);
+throw new IngesterException(msg);
+}
+
+//Make a SolrInputDocument out of the field map
+SolrInputDocument updateDoc = new SolrInputDocument();
+for (String key : fields.keySet()) {
+updateDoc.addField(key, fields.get(key));
+}
+//add the content to the SolrInputDocument
+//JMTODO: can we just add it to the field map before passing that in?
+updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
+
+try {
+//TODO: consider timeout thread, or vary socket timeout based on size of indexed content
+solrServer.addDocument(updateDoc);
+uncommitedIngests = true;
+
+} catch (KeywordSearchModuleException ex) {
+//JMTODO: does this need to ne internationalized?
+throw new IngesterException(
+NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
+}
+}
+
+/**
+* Tells Solr to commit (necessary before ingested files will appear in
+* searches)
+*/
+void commit() {
+try {
+solrServer.commit();
+uncommitedIngests = false;
+} catch (NoOpenCoreException | SolrServerException ex) {
+logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
+
+}
+}
+
 /**
 * Visitor used to create fields to send to SOLR index.
 */

@@ -221,192 +350,6 @@ class Ingester {
 }
 }

-/**
-* Use the given TextExtractor to extract text from the given source. The
-* text will be chunked and each chunk passed to Solr to add to the index.
-*
-*
-* @param <A> The type of the Appendix provider that provides
-* additional text to append to the final chunk.
-* @param <T> A subclass of SleuthkitVisibleItem.
-* @param extractor The TextExtractor that will be used to extract text from
-* the given source.
-* @param source The source from which text will be extracted, chunked,
-* and indexed.
-* @param context The ingest job context that can be used to cancel this
-* process.
-*
-* @return True if this method executed normally. or False if there was an
-* unexpected exception. //JMTODO: This policy needs to be reviewed.
-*
-* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
-*/
-< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
-final long sourceID = extractor.getID(source);
-final String sourceName = extractor.getName(source);
-
-int numChunks = 0; //unknown until chunking is done
-
-if (extractor.isDisabled()) {
-/* some Extrctors, notable the strings extractor, have options which
-* can be configured such that no extraction should be done */
-return true;
-}
-
-Map<String, String> fields = getContentFields(source);
-//Get a stream and a reader for that stream
-try (final InputStream stream = extractor.getInputStream(source);
-Reader reader = extractor.getReader(stream, source);) {
-
-//we read max 1024 chars at time, this seems to max what some Readers would return
-char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
-
-boolean eof = false; //have we read until the end of the file yet
-while (!eof) {
-int chunkSizeInChars = 0; // the size in chars of the chunk (so far)
-if (context != null && context.fileIngestIsCancelled()) {
-return true;
-}
-long charsRead = 0; // number of chars read in the most recent read operation
-//consume bytes to fill entire chunk (but leave EXTRA_CHARS to end the word)
-while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
-&& (charsRead = reader.read(textChunkBuf, chunkSizeInChars, SINGLE_READ_CHARS)) != -1) {
-chunkSizeInChars += charsRead;
-}
-
-if (charsRead == -1) {
-//this is the last chunk
-eof = true;
-} else {
-chunkSizeInChars += charsRead;
-
-//if we haven't reached the end of the file,
-//try to read char-by-char until whitespace to not break words
-while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - 1)
-&& (Character.isWhitespace(textChunkBuf[chunkSizeInChars - 1]) == false)
-&& (charsRead = reader.read(textChunkBuf, chunkSizeInChars, 1)) != -1) {
-chunkSizeInChars += charsRead;
-}
-if (charsRead == -1) {
-//this is the last chunk
-eof = true;
-}
-}
-
-StringBuilder sb = new StringBuilder(chunkSizeInChars)
-.append(textChunkBuf, 0, chunkSizeInChars);
-
-sanitizeToUTF8(sb); //replace non UTF8 chars with '^'
-
-String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
-fields.put(Server.Schema.ID.toString(), chunkId);
-try {
-//pass the chunk to method that adds it to Solr index
-indexChunk(sb.toString(), sourceName, fields);
-numChunks++;
-} catch (Ingester.IngesterException ingEx) {
-extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
-+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
-
-throw ingEx; //need to rethrow to signal error and move on
-} catch (Exception ex) {
-throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
-}
-}
-} catch (IOException ex) {
-extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
-return false;
-} catch (Exception ex) {
-extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
-return false;
-} finally {
-//after all chunks, index just the meta data, including the numChunks, of the parent file
-fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
-fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
-indexChunk(null, sourceName, fields);
-}
-return true;
-}
-
-/**
-* Sanitize the given StringBuilder by replacing non-UTF-8 characters with
-* caret '^'
-*
-* @param sb the StringBuilder to sanitize
-*
-* //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
-* function?
-*/
-private static void sanitizeToUTF8(StringBuilder sb) {
-final int length = sb.length();
-
-// Sanitize by replacing non-UTF-8 characters with caret '^'
-for (int i = 0; i < length; i++) {
-if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
-sb.replace(i, i + 1, "^");
-}
-}
-}
-
-/**
-* Add one chunk as to the Solr index as a seperate sold document.
-*
-* TODO see if can use a byte or string streaming way to add content to
-* /update handler e.g. with XMLUpdateRequestHandler (deprecated in SOlr
-* 4.0.0), see if possible to stream with UpdateRequestHandler
-*
-* @param chunk The chunk content as a string
-* @param fields
-* @param size
-*
-* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
-*/
-private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
-if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
-//JMTODO: actually if the we couldn't get the image id it is set to -1,
-// but does this really mean we don't want to index it?
-
-//skip the file, image id unknown
-//JMTODO: does this need to ne internationalized?
-String msg = NbBundle.getMessage(Ingester.class,
-"Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
-logger.log(Level.SEVERE, msg);
-throw new IngesterException(msg);
-}
-
-//Make a SolrInputDocument out of the field map
-SolrInputDocument updateDoc = new SolrInputDocument();
-for (String key : fields.keySet()) {
-updateDoc.addField(key, fields.get(key));
-}
-//add the content to the SolrInputDocument
-//JMTODO: can we just add it to the field map before passing that in?
-updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
-
-try {
-//TODO: consider timeout thread, or vary socket timeout based on size of indexed content
-solrServer.addDocument(updateDoc);
-uncommitedIngests = true;
-} catch (KeywordSearchModuleException ex) {
-//JMTODO: does this need to ne internationalized?
-throw new IngesterException(
-NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
-}
-}
-
-/**
-* Tells Solr to commit (necessary before ingested files will appear in
-* searches)
-*/
-void commit() {
-try {
-solrServer.commit();
-uncommitedIngests = false;
-} catch (NoOpenCoreException | SolrServerException ex) {
-logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
-}
-}
-
 /**
 * Indicates that there was an error with the specific ingest operation, but
 * it's still okay to continue ingesting files.

@@ -423,4 +366,146 @@ class Ingester {
 super(message);
 }
 }
+
+}
+
+class Chunk {
+private final StringBuilder sb;
+private final int chunksize;
+
+Chunk(StringBuilder sb, int chunksize) {
+this.sb = sb;
+this.chunksize = chunksize;
+}
+
+StringBuilder getText() {
+return sb;
+}
+
+int getSize() {
+return chunksize;
+}
+}
+
+/**
+* Encapsulates the content chunking algorithm in implementation of the Iterator
+* interface.
+*/
+class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
+
+private static final int INITIAL_CHUNK_SIZE = 32 * 1024; //bytes
+private static final int SINGLE_READ_CHARS = 1024;
+
+private int chunkSizeBytes = 0; // the size in bytes of chunk (so far)
+private int charsRead = 0; // number of chars read in the most recent read operation
+private boolean whitespace = false;
+private char[] tempChunkBuf;
+private StringBuilder chunkText;
+private boolean endOfContent = false;
+private final Reader reader;
+
+/**
+* Create a Chunker that will chunk the content of the given Reader.
+*
+* @param reader The content to chunk.
+*/
+Chunker(Reader reader) {
+this.reader = reader;
+}
+
+@Override
+public Iterator<Chunk> iterator() {
+return this;
+}
+
+/**
+* Are there any more chunks available from this chunker?
+*
+*
+* @return true if there are more chunks available.
+*/
+@Override
+public boolean hasNext() {
+return endOfContent == false;
+}
+
+@Override
+public Chunk next() {
+if (hasNext()) {
+chunkText = new StringBuilder();
+tempChunkBuf = new char[SINGLE_READ_CHARS];
+chunkSizeBytes = 0;
+//read chars up to initial chunk size
+while (chunkSizeBytes < INITIAL_CHUNK_SIZE && endOfContent == false) {
+try {
+charsRead = reader.read(tempChunkBuf, 0, SINGLE_READ_CHARS);
+} catch (IOException ex) {
+throw new RuntimeException("IOException while attempting to read chunk.", ex);
+}
+if (-1 == charsRead) {
+//this is the last chunk
+endOfContent = true;
+} else {
+String chunkSegment = new String(tempChunkBuf, 0, charsRead);
+chunkSizeBytes += Utf8.encodedLength(chunkSegment);
+chunkText.append(chunkSegment);
+}
+
+}
+if (false == endOfContent) {
+endOfContent = readChunkUntilWhiteSpace();
+}
+return new Chunk(sanitizeToUTF8(chunkText), chunkSizeBytes);
+} else {
+throw new NoSuchElementException("There are no more chunks.");
+}
+}
+
+
+private boolean readChunkUntilWhiteSpace() {
+charsRead = 0;
+whitespace = false;
+//if we haven't reached the end of the file,
+//try to read char-by-char until whitespace to not break words
+while ((chunkSizeBytes < INITIAL_CHUNK_SIZE)
+&& (false == whitespace)) {
+try {
+charsRead = reader.read(tempChunkBuf, 0, 1);
+} catch (IOException ex) {
+throw new RuntimeException("IOException while attempting to read chunk until whitespace.", ex);
+}
+if (-1 == charsRead) {
+//this is the last chunk
+return true;
+} else {
+whitespace = Character.isWhitespace(tempChunkBuf[0]);
+String chunkSegment = new String(tempChunkBuf, 0, 1);
+chunkSizeBytes += Utf8.encodedLength(chunkSegment);
+chunkText.append(chunkSegment);
+}
+}
+return false;
+}
+
+/**
+* Sanitize the given StringBuilder by replacing non-UTF-8 characters with
+* caret '^'
+*
+* @param sb the StringBuilder to sanitize
+*
+* //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
+* function?
+*/
+private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
+final int length = sb.length();
+
+// Sanitize by replacing non-UTF-8 characters with caret '^'
+for (int i = 0; i < length; i++) {
+if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
+sb.replace(i, i + 1, "^");
+
+}
+}
+return sb;
+}
+}

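The Chunker introduced above is the "32K chunks" of the commit message: it accumulates roughly 32 KB of UTF-8 encoded bytes per chunk (INITIAL_CHUNK_SIZE, tracked with Guava's Utf8.encodedLength), then reads one character at a time until the next whitespace so words are not split, and sanitizes the buffer before returning it. Because it implements Iterable<Chunk>, the Ingester can drive it with a plain for-each loop, as indexText() does above. A standalone usage sketch, separate from this commit (assumes the same package as the classes above; the StringReader input and the printout are illustrative only):

    import java.io.Reader;
    import java.io.StringReader;

    public class ChunkerDemo {
        public static void main(String[] args) {
            Reader reader = new StringReader("some long run of extracted text ...");
            int i = 0;
            // Chunker is Iterable<Chunk>, so it can drive a for-each loop directly.
            for (Chunk chunk : new Chunker(reader)) {
                // getSize() reports the UTF-8 byte count of the chunk, getText() the sanitized text.
                System.out.println("chunk " + (++i) + ": " + chunk.getSize() + " bytes, "
                        + chunk.getText().length() + " chars");
            }
        }
    }
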
Server.java

@@ -157,7 +157,7 @@ public class Server {
 public String toString() {
 return "num_chunks"; //NON-NLS
 }
-},
+}
 };

 public static final String HL_ANALYZE_CHARS_UNLIMITED = "500000"; //max 1MB in a chunk. use -1 for unlimited, but -1 option may not be supported (not documented)

SolrSearchService.java

@@ -19,21 +19,21 @@
 package org.sleuthkit.autopsy.keywordsearch;

 import java.io.IOException;
-import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.impl.HttpSolrClient;
-import org.openide.util.NbBundle;
 import java.net.InetAddress;
 import java.util.List;
 import java.util.MissingResourceException;
-import org.sleuthkit.autopsy.core.RuntimeProperties;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.impl.HttpSolrClient;
+import org.openide.util.NbBundle;
 import org.openide.util.lookup.ServiceProvider;
 import org.openide.util.lookup.ServiceProviders;
 import org.sleuthkit.autopsy.casemodule.Case;
+import org.sleuthkit.autopsy.core.RuntimeProperties;
+import org.sleuthkit.autopsy.corecomponentinterfaces.AutopsyService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.TskCoreException;
-import org.sleuthkit.autopsy.corecomponentinterfaces.AutopsyService;

 /**
 * An implementation of the KeywordSearchService interface that uses Solr for

@@ -48,6 +48,7 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService
 private static final String BAD_IP_ADDRESS_FORMAT = "ioexception occurred when talking to server"; //NON-NLS
 private static final String SERVER_REFUSED_CONNECTION = "server refused connection"; //NON-NLS
 private static final int IS_REACHABLE_TIMEOUT_MS = 1000;
+private static final String SERVICE_NAME = "Solr Keyword Search Service";

 ArtifactTextExtractor extractor = new ArtifactTextExtractor();

@@ -210,4 +211,9 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService
 * Autopsy service providers may not have case-level resources.
 */
 }
+
+@Override
+public String getServiceName() {
+return SERVICE_NAME;
+}
 }

StringsTextExtractor.java

@@ -25,6 +25,7 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;

@@ -37,6 +38,8 @@ import org.sleuthkit.datamodel.TskException;
 */
 class StringsTextExtractor extends FileTextExtractor {

+static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
+
 /**
 * Options for this extractor
 */

@@ -92,7 +95,12 @@ class StringsTextExtractor extends FileTextExtractor {
 }

 @Override
-boolean isDisabled() {
+public void logWarning(final String msg, Exception ex) {
+logger.log(Level.WARNING, msg, ex); //NON-NLS }
+}
+
+@Override
+public boolean isDisabled() {
 boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
 boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));

@@ -100,11 +108,11 @@ class StringsTextExtractor extends FileTextExtractor {
 }

 @Override
-InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
+public InputStreamReader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
+InputStream stringStream = getInputStream(sourceFile);
 return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
 }

-@Override
 InputStream getInputStream(AbstractFile sourceFile) {
 //check which extract stream to use
 if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {

TextExtractor.java

@@ -18,10 +18,7 @@
 */
 package org.sleuthkit.autopsy.keywordsearch;

-import java.io.InputStream;
 import java.io.Reader;
-import java.util.logging.Level;
-import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.SleuthkitVisitableItem;

 /**

@@ -31,9 +28,8 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 * @param <TextSource> The subtype of SleuthkitVisitableItem an implementation
 * is able to process.
 */
-abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
+interface TextExtractor< TextSource extends SleuthkitVisitableItem> {

-static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());

 /**
 * Is this extractor configured such that no extraction will/should be done?

@@ -48,18 +44,8 @@ abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
 * @param msg
 * @param ex
 */
-void logWarning(String msg, Exception ex) {
-logger.log(Level.WARNING, msg, ex); //NON-NLS }
-}
+abstract void logWarning(String msg, Exception ex);

-/**
-* Get an input stream over the content of the given source.
-*
-* @param source
-*
-* @return
-*/
-abstract InputStream getInputStream(TextSource source);

 /**
 * Get a reader that over the text extracted from the given source.

@@ -71,7 +57,7 @@ abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
 *
 * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
 */
-abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;
+abstract Reader getReader(TextSource source) throws Ingester.IngesterException;

 /**
 * Get the 'object' id of the given source.

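TextExtractor is now an interface, so each concrete extractor supplies its own Logger, its own stream handling, and public implementations of the methods visible in these hunks. A hypothetical skeleton showing the shape of the contract (class name is invented; imports as in the extractor files above; the interface may declare members not shown in this diff):

    class ExampleFileTextExtractor implements TextExtractor<AbstractFile> {

        private static final Logger logger = Logger.getLogger(ExampleFileTextExtractor.class.getName());

        @Override
        public boolean isDisabled() {
            return false; // nothing configurable disables this example
        }

        @Override
        public void logWarning(String msg, Exception ex) {
            logger.log(Level.WARNING, msg, ex);
        }

        @Override
        public Reader getReader(AbstractFile source) throws Ingester.IngesterException {
            // the extractor opens its own stream now, instead of receiving one from the caller
            return new InputStreamReader(new ReadContentInputStream(source), StandardCharsets.UTF_8);
        }

        @Override
        public long getID(AbstractFile source) {
            return source.getId();
        }

        @Override
        public String getName(AbstractFile source) {
            return source.getName();
        }
    }
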
TikaTextExtractor.java

@@ -20,7 +20,6 @@ package org.sleuthkit.autopsy.keywordsearch;

 import com.google.common.io.CharSource;
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.Reader;
 import java.util.List;
 import java.util.MissingResourceException;

@@ -36,6 +35,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.openide.util.NbBundle;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;

@@ -46,6 +46,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 */
 class TikaTextExtractor extends FileTextExtractor {

+static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
 private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();

 private static final List<String> TIKA_SUPPORTED_TYPES

@@ -55,13 +56,15 @@ class TikaTextExtractor extends FileTextExtractor {
 .collect(Collectors.toList());

 @Override
-void logWarning(final String msg, Exception ex) {
+public void logWarning(final String msg, Exception ex) {
 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
-super.logWarning(msg, ex);
+logger.log(Level.WARNING, msg, ex); //NON-NLS }
 }

 @Override
-Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
+public Reader getReader(AbstractFile sourceFile) throws IngesterException, MissingResourceException {
+ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
+
 Metadata metadata = new Metadata();
 //Parse the file in a task, a convenient way to have a timeout...
 final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));

@@ -117,13 +120,9 @@ class TikaTextExtractor extends FileTextExtractor {
 return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
 }

-@Override
-InputStream getInputStream(AbstractFile sourceFile1) {
-return new ReadContentInputStream(sourceFile1);
-}
-
 @Override
-boolean isDisabled() {
+public boolean isDisabled() {
 return false;
 }

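As before the refactor, the Tika parse in getReader() is submitted to a single-thread executor so the parse can be bounded in time; only the submit call appears in this hunk. A hedged sketch of how such a Future is typically consumed (the timeout value and the error handling here are illustrative, not taken from this commit):

    try {
        Reader tikaReader = future.get(120, TimeUnit.SECONDS); // give the parse a fixed time budget
        // hand tikaReader back as the extractor's result
    } catch (TimeoutException te) {
        future.cancel(true); // abandon a parse that runs too long
        logWarning("Tika parse timed out for " + sourceFile.getName(), te);
    } catch (InterruptedException | ExecutionException ex) {
        logWarning("Tika parse failed for " + sourceFile.getName(), ex);
    }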