diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
index fc8ef12b05..07657f9646 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
@@ -38,7 +38,7 @@ import org.sleuthkit.datamodel.TskCoreException;
  * Extracts text from artifacts by concatenating the values of all of the
  * artifact's attributes.
  */
-public class ArtifactTextExtractor extends TextExtractor {
+class ArtifactTextExtractor implements TextExtractor {
     static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
 
     /**
@@ -82,13 +82,16 @@ public class ArtifactTextExtractor extends TextExtractor {
     }
 
     @Override
-    boolean isDisabled() {
+    public boolean isDisabled() {
         return false;
+    }
+
+    @Override
+    public void logWarning(final String msg, Exception ex) {
+        logger.log(Level.WARNING, msg, ex); //NON-NLS
     }
 
-
-    @Override
-    InputStream getInputStream(BlackboardArtifact artifact) {
+    private InputStream getInputStream(BlackboardArtifact artifact) {
         // Concatenate the string values of all attributes into a single
         // "content" string to be indexed.
         StringBuilder artifactContents = new StringBuilder();
@@ -127,17 +130,17 @@ public class ArtifactTextExtractor extends TextExtractor {
     }
 
     @Override
-    Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
-        return new InputStreamReader(stream, StandardCharsets.UTF_8);
+    public Reader getReader(BlackboardArtifact source) throws Ingester.IngesterException {
+        return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8);
     }
 
     @Override
-    long getID(BlackboardArtifact source) {
+    public long getID(BlackboardArtifact source) {
         return source.getArtifactID();
     }
 
     @Override
-    String getName(BlackboardArtifact source) {
+    public String getName(BlackboardArtifact source) {
         return source.getDisplayName() + "_" + source.getArtifactID();
     }
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java
index cf268dc1e4..55838f4e7f 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java
@@ -18,7 +18,6 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
-import java.io.InputStream;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.List;
@@ -28,7 +27,7 @@ import org.sleuthkit.datamodel.AbstractFile;
  * Common methods for utilities that extract text and content and divide into
  * chunks
  */
-abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
+abstract class FileTextExtractor implements TextExtractor< AbstractFile> {
 
     static final List BLOB_MIME_TYPES
@@ -96,17 +95,16 @@ abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
     abstract boolean isSupported(AbstractFile file, String detectedFormat);
 
     @Override
-    abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;
+    public abstract Reader getReader(AbstractFile source) throws Ingester.IngesterException;
 
     @Override
-    long getID(AbstractFile source) {
+    public long getID(AbstractFile source) {
         return source.getId();
     }
 
     @Override
-    String getName(AbstractFile source) {
+    public String getName(AbstractFile source) {
         return source.getName();
     }
-
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
index e42663d54a..f1eb0ea100 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
@@ -19,16 +19,17 @@
 package org.sleuthkit.autopsy.keywordsearch;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 import java.util.List;
+import java.util.logging.Level;
 import net.htmlparser.jericho.Attributes;
 import net.htmlparser.jericho.Renderer;
 import net.htmlparser.jericho.Source;
 import net.htmlparser.jericho.StartTag;
 import net.htmlparser.jericho.StartTagType;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 
@@ -37,6 +38,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  */
 class HtmlTextExtractor extends FileTextExtractor {
 
+    static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
    private static final int MAX_SIZE = 50_000_000; //50MB
 
     static final List WEB_MIME_TYPES = Arrays.asList(
@@ -61,7 +63,9 @@ class HtmlTextExtractor extends FileTextExtractor {
     }
 
     @Override
-    Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
+    public Reader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
+        ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
+
         //Parse the stream with Jericho and put the results in a Reader
         try {
             StringBuilder scripts = new StringBuilder();
@@ -75,7 +79,7 @@ class HtmlTextExtractor extends FileTextExtractor {
             int numComments = 0;
             int numOthers = 0;
 
-            Source source = new Source(in);
+            Source source = new Source(stream);
             source.fullSequentialParse();
             Renderer renderer = source.getRenderer();
             renderer.setNewLine("\n");
@@ -158,12 +162,11 @@ class HtmlTextExtractor extends FileTextExtractor {
     }
 
     @Override
-    InputStream getInputStream(AbstractFile sourceFile1) {
-        return new ReadContentInputStream(sourceFile1);
-    }
-
-    @Override
-    boolean isDisabled() {
+    public boolean isDisabled() {
         return false;
     }
+
+    public void logWarning(final String msg, Exception ex) {
+        logger.log(Level.WARNING, msg, ex); //NON-NLS
+    }
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index 312bff73cc..d46cf91981 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -18,11 +18,13 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
+import com.google.common.base.Utf8;
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.Reader;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.Map;
+import java.util.NoSuchElementException;
 import java.util.logging.Level;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.common.SolrInputDocument;
@@ -54,9 +56,6 @@ class Ingester {
     private final Server solrServer = KeywordSearch.getServer();
     private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
     private static Ingester instance;
-    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
-    private static final int SINGLE_READ_CHARS = 1024;
-    private static final int EXTRA_CHARS = 128;
 
     private Ingester() {
     }
@@ -120,6 +119,136 @@ class Ingester {
         return item.accept(SOLR_FIELDS_VISITOR);
     }
 
+    /**
+     * Use the given TextExtractor to extract text from the given source. The
+     * text will be chunked and each chunk passed to Solr to add to the index.
+     *
+     *
+     * @param           The type of the Appendix provider that provides
+     *                  additional text to append to the final chunk.
+     * @param           A subclass of SleuthkitVisibleItem.
+     * @param extractor The TextExtractor that will be used to extract text from
+     *                  the given source.
+     * @param source    The source from which text will be extracted, chunked,
+     *                  and indexed.
+     * @param context   The ingest job context that can be used to cancel this
+     *                  process.
+     *
+     * @return True if this method executed normally. or False if there was an
+     *         unexpected exception. //JMTODO: This policy needs to be reviewed.
+     *
+     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
+     */
+    < T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
+        final long sourceID = extractor.getID(source);
+        final String sourceName = extractor.getName(source);
+
+        int numChunks = 0; //unknown until chunking is done
+
+        if (extractor.isDisabled()) {
+            /* some Extrctors, notable the strings extractor, have options which
+             * can be configured such that no extraction should be done */
+            return true;
+        }
+
+        Map fields = getContentFields(source);
+        //Get a reader for the content of the given source
+        try (Reader reader = extractor.getReader(source);) {
+            Chunker chunker = new Chunker(reader);
+
+            for (Chunk chunk : chunker) {
+                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
+                fields.put(Server.Schema.ID.toString(), chunkId);
+                try {
+                    //add the chunk text to Solr index
+                    indexChunk(chunk.getText().toString(), sourceName, fields);
+                    numChunks++;
+                } catch (Ingester.IngesterException ingEx) {
+                    extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
+                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
+
+                    throw ingEx; //need to rethrow to signal error and move on
+                } catch (Exception ex) {
+                    throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
+                }
+            }
+        } catch (IOException ex) {
+            extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
+            return false;
+        } catch (Exception ex) {
+            extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
+            return false;
+        } finally {
+            //after all chunks, index just the meta data, including the numChunks, of the parent file
+            fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
+            fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
+            indexChunk(null, sourceName, fields);
+        }
+
+        return true;
+    }
+
+    /**
+     * Add one chunk as to the Solr index as a seperate sold document.
+     *
+     * TODO see if can use a byte or string streaming way to add content to
+     * /update handler e.g. with XMLUpdateRequestHandler (deprecated in SOlr
+     * 4.0.0), see if possible to stream with UpdateRequestHandler
+     *
+     * @param chunk The chunk content as a string
+     * @param fields
+     * @param size
+     *
+     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
+     */
+    private void indexChunk(String chunk, String sourceName, Map fields) throws IngesterException {
+        if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
+            //JMTODO: actually if the we couldn't get the image id it is set to -1,
+            // but does this really mean we don't want to index it?
+
+            //skip the file, image id unknown
+            //JMTODO: does this need to ne internationalized?
+            String msg = NbBundle.getMessage(Ingester.class,
+                    "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
+            logger.log(Level.SEVERE, msg);
+            throw new IngesterException(msg);
+        }
+
+        //Make a SolrInputDocument out of the field map
+        SolrInputDocument updateDoc = new SolrInputDocument();
+        for (String key : fields.keySet()) {
+            updateDoc.addField(key, fields.get(key));
+        }
+        //add the content to the SolrInputDocument
+        //JMTODO: can we just add it to the field map before passing that in?
+        updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
+
+        try {
+            //TODO: consider timeout thread, or vary socket timeout based on size of indexed content
+            solrServer.addDocument(updateDoc);
+            uncommitedIngests = true;
+
+        } catch (KeywordSearchModuleException ex) {
+            //JMTODO: does this need to ne internationalized?
+            throw new IngesterException(
+                    NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
+        }
+    }
+
+    /**
+     * Tells Solr to commit (necessary before ingested files will appear in
+     * searches)
+     */
+    void commit() {
+        try {
+            solrServer.commit();
+            uncommitedIngests = false;
+        } catch (NoOpenCoreException | SolrServerException ex) {
+            logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
+
+        }
+    }
+
     /**
      * Visitor used to create fields to send to SOLR index.
      */
@@ -221,192 +350,6 @@ class Ingester {
         }
     }
 
-    /**
-     * Use the given TextExtractor to extract text from the given source. The
-     * text will be chunked and each chunk passed to Solr to add to the index.
-     *
-     *
-     * @param           The type of the Appendix provider that provides
-     *                  additional text to append to the final chunk.
-     * @param           A subclass of SleuthkitVisibleItem.
-     * @param extractor The TextExtractor that will be used to extract text from
-     *                  the given source.
-     * @param source    The source from which text will be extracted, chunked,
-     *                  and indexed.
-     * @param context   The ingest job context that can be used to cancel this
-     *                  process.
-     *
-     * @return True if this method executed normally. or False if there was an
-     *         unexpected exception. //JMTODO: This policy needs to be reviewed.
-     *
-     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
-     */
-    < T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
-        final long sourceID = extractor.getID(source);
-        final String sourceName = extractor.getName(source);
-
-        int numChunks = 0; //unknown until chunking is done
-
-        if (extractor.isDisabled()) {
-            /* some Extrctors, notable the strings extractor, have options which
-             * can be configured such that no extraction should be done */
-            return true;
-        }
-
-        Map fields = getContentFields(source);
-        //Get a stream and a reader for that stream
-        try (final InputStream stream = extractor.getInputStream(source);
-                Reader reader = extractor.getReader(stream, source);) {
-
-            //we read max 1024 chars at time, this seems to max what some Readers would return
-            char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
-
-            boolean eof = false; //have we read until the end of the file yet
-            while (!eof) {
-                int chunkSizeInChars = 0; // the size in chars of the chunk (so far)
-                if (context != null && context.fileIngestIsCancelled()) {
-                    return true;
-                }
-                long charsRead = 0; // number of chars read in the most recent read operation
-                //consume bytes to fill entire chunk (but leave EXTRA_CHARS to end the word)
-                while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
-                        && (charsRead = reader.read(textChunkBuf, chunkSizeInChars, SINGLE_READ_CHARS)) != -1) {
-                    chunkSizeInChars += charsRead;
-                }
-
-                if (charsRead == -1) {
-                    //this is the last chunk
-                    eof = true;
-                } else {
-                    chunkSizeInChars += charsRead;
-
-                    //if we haven't reached the end of the file,
-                    //try to read char-by-char until whitespace to not break words
-                    while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - 1)
-                            && (Character.isWhitespace(textChunkBuf[chunkSizeInChars - 1]) == false)
-                            && (charsRead = reader.read(textChunkBuf, chunkSizeInChars, 1)) != -1) {
-                        chunkSizeInChars += charsRead;
-                    }
-                    if (charsRead == -1) {
-                        //this is the last chunk
-                        eof = true;
-                    }
-                }
-
-                StringBuilder sb = new StringBuilder(chunkSizeInChars)
-                        .append(textChunkBuf, 0, chunkSizeInChars);
-
-                sanitizeToUTF8(sb); //replace non UTF8 chars with '^'
-
-                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
-                fields.put(Server.Schema.ID.toString(), chunkId);
-                try {
-                    //pass the chunk to method that adds it to Solr index
-                    indexChunk(sb.toString(), sourceName, fields);
-                    numChunks++;
-                } catch (Ingester.IngesterException ingEx) {
-                    extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
-                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
-
-                    throw ingEx; //need to rethrow to signal error and move on
-                } catch (Exception ex) {
-                    throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
-                }
-            }
-        } catch (IOException ex) {
-            extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
-            return false;
-        } catch (Exception ex) {
-            extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
-            return false;
-        } finally {
-            //after all chunks, index just the meta data, including the numChunks, of the parent file
-            fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
-            fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
-            indexChunk(null, sourceName, fields);
-        }
-        return true;
-    }
-
-    /**
-     * Sanitize the given StringBuilder by replacing non-UTF-8 characters with
-     * caret '^'
-     *
-     * @param sb the StringBuilder to sanitize
-     *
-     * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
-     * function?
-     */
-    private static void sanitizeToUTF8(StringBuilder sb) {
-        final int length = sb.length();
-
-        // Sanitize by replacing non-UTF-8 characters with caret '^'
-        for (int i = 0; i < length; i++) {
-            if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
-                sb.replace(i, i + 1, "^");
-            }
-        }
-    }
-
-    /**
-     * Add one chunk as to the Solr index as a seperate sold document.
-     *
-     * TODO see if can use a byte or string streaming way to add content to
-     * /update handler e.g. with XMLUpdateRequestHandler (deprecated in SOlr
-     * 4.0.0), see if possible to stream with UpdateRequestHandler
-     *
-     * @param chunk The chunk content as a string
-     * @param fields
-     * @param size
-     *
-     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
-     */
-    private void indexChunk(String chunk, String sourceName, Map fields) throws IngesterException {
-        if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
-            //JMTODO: actually if the we couldn't get the image id it is set to -1,
-            // but does this really mean we don't want to index it?
-
-            //skip the file, image id unknown
-            //JMTODO: does this need to ne internationalized?
-            String msg = NbBundle.getMessage(Ingester.class,
-                    "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
-            logger.log(Level.SEVERE, msg);
-            throw new IngesterException(msg);
-        }
-
-        //Make a SolrInputDocument out of the field map
-        SolrInputDocument updateDoc = new SolrInputDocument();
-        for (String key : fields.keySet()) {
-            updateDoc.addField(key, fields.get(key));
-        }
-        //add the content to the SolrInputDocument
-        //JMTODO: can we just add it to the field map before passing that in?
-        updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
-
-        try {
-            //TODO: consider timeout thread, or vary socket timeout based on size of indexed content
-            solrServer.addDocument(updateDoc);
-            uncommitedIngests = true;
-        } catch (KeywordSearchModuleException ex) {
-            //JMTODO: does this need to ne internationalized?
-            throw new IngesterException(
-                    NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
-        }
-    }
-
-    /**
-     * Tells Solr to commit (necessary before ingested files will appear in
-     * searches)
-     */
-    void commit() {
-        try {
-            solrServer.commit();
-            uncommitedIngests = false;
-        } catch (NoOpenCoreException | SolrServerException ex) {
-            logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
-        }
-    }
-
     /**
      * Indicates that there was an error with the specific ingest operation, but
      * it's still okay to continue ingesting files.
@@ -423,4 +366,146 @@ class Ingester {
             super(message);
         }
     }
+
+}
+
+class Chunk {
+    private final StringBuilder sb;
+    private final int chunksize;
+
+    Chunk(StringBuilder sb, int chunksize) {
+        this.sb = sb;
+        this.chunksize = chunksize;
+    }
+
+    StringBuilder getText() {
+        return sb;
+    }
+
+    int getSize() {
+        return chunksize;
+    }
+}
+
+/**
+ * Encapsulates the content chunking algorithm in implementation of the Iterator
+ * interface.
+ */
+class Chunker implements Iterator, Iterable {
+
+    private static final int INITIAL_CHUNK_SIZE = 32 * 1024; //bytes
+    private static final int SINGLE_READ_CHARS = 1024;
+
+    private int chunkSizeBytes = 0;  // the size in bytes of chunk (so far)
+    private int charsRead = 0;  // number of chars read in the most recent read operation
+    private boolean whitespace = false;
+    private char[] tempChunkBuf;
+    private StringBuilder chunkText;
+    private boolean endOfContent = false;
+    private final Reader reader;
+
+    /**
+     * Create a Chunker that will chunk the content of the given Reader.
+     *
+     * @param reader The content to chunk.
+     */
+    Chunker(Reader reader) {
+        this.reader = reader;
+    }
+
+    @Override
+    public Iterator iterator() {
+        return this;
+    }
+
+    /**
+     * Are there any more chunks available from this chunker?
+     *
+     *
+     * @return true if there are more chunks available.
+     */
+    @Override
+    public boolean hasNext() {
+        return endOfContent == false;
+    }
+
+    @Override
+    public Chunk next() {
+        if (hasNext()) {
+            chunkText = new StringBuilder();
+            tempChunkBuf = new char[SINGLE_READ_CHARS];
+            chunkSizeBytes = 0;
+            //read chars up to initial chunk size
+            while (chunkSizeBytes < INITIAL_CHUNK_SIZE && endOfContent == false) {
+                try {
+                    charsRead = reader.read(tempChunkBuf, 0, SINGLE_READ_CHARS);
+                } catch (IOException ex) {
+                    throw new RuntimeException("IOException while attempting to read chunk.", ex);
+                }
+                if (-1 == charsRead) {
+                    //this is the last chunk
+                    endOfContent = true;
+                } else {
+                    String chunkSegment = new String(tempChunkBuf, 0, charsRead);
+                    chunkSizeBytes += Utf8.encodedLength(chunkSegment);
+                    chunkText.append(chunkSegment);
+                }
+
+            }
+            if (false == endOfContent) {
+                endOfContent = readChunkUntilWhiteSpace();
+            }
+            return new Chunk(sanitizeToUTF8(chunkText), chunkSizeBytes);
+        } else {
+            throw new NoSuchElementException("There are no more chunks.");
+        }
+    }
+
+
+    private boolean readChunkUntilWhiteSpace() {
+        charsRead = 0;
+        whitespace = false;
+        //if we haven't reached the end of the file,
+        //try to read char-by-char until whitespace to not break words
+        while ((chunkSizeBytes < INITIAL_CHUNK_SIZE)
+                && (false == whitespace)) {
+            try {
+                charsRead = reader.read(tempChunkBuf, 0, 1);
+            } catch (IOException ex) {
+                throw new RuntimeException("IOException while attempting to read chunk until whitespace.", ex);
+            }
+            if (-1 == charsRead) {
+                //this is the last chunk
+                return true;
+            } else {
+                whitespace = Character.isWhitespace(tempChunkBuf[0]);
+                String chunkSegment = new String(tempChunkBuf, 0, 1);
+                chunkSizeBytes += Utf8.encodedLength(chunkSegment);
+                chunkText.append(chunkSegment);
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Sanitize the given StringBuilder by replacing non-UTF-8 characters with
+     * caret '^'
+     *
+     * @param sb the StringBuilder to sanitize
+     *
+     * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping
+     * function?
+     */
+    private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
+        final int length = sb.length();
+
+        // Sanitize by replacing non-UTF-8 characters with caret '^'
+        for (int i = 0; i < length; i++) {
+            if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
+                sb.replace(i, i + 1, "^");
+
+            }
+        }
+        return sb;
+    }
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java
index 5d675edfa0..489dd52af4 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java
@@ -157,7 +157,7 @@ public class Server {
         public String toString() {
             return "num_chunks"; //NON-NLS
         }
-    },
+    }
     };
 
     public static final String HL_ANALYZE_CHARS_UNLIMITED = "500000"; //max 1MB in a chunk. use -1 for unlimited, but -1 option may not be supported (not documented)
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
index fa6b8c0b48..7e2c40f1c4 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
@@ -19,21 +19,21 @@
 package org.sleuthkit.autopsy.keywordsearch;
 
 import java.io.IOException;
-import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.impl.HttpSolrClient;
-import org.openide.util.NbBundle;
 import java.net.InetAddress;
 import java.util.List;
 import java.util.MissingResourceException;
-import org.sleuthkit.autopsy.core.RuntimeProperties;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.impl.HttpSolrClient;
+import org.openide.util.NbBundle;
 import org.openide.util.lookup.ServiceProvider;
 import org.openide.util.lookup.ServiceProviders;
 import org.sleuthkit.autopsy.casemodule.Case;
+import org.sleuthkit.autopsy.core.RuntimeProperties;
+import org.sleuthkit.autopsy.corecomponentinterfaces.AutopsyService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
 import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.TskCoreException;
-import org.sleuthkit.autopsy.corecomponentinterfaces.AutopsyService;
 
 /**
  * An implementation of the KeywordSearchService interface that uses Solr for
@@ -48,6 +48,7 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService
     private static final String BAD_IP_ADDRESS_FORMAT = "ioexception occurred when talking to server"; //NON-NLS
     private static final String SERVER_REFUSED_CONNECTION = "server refused connection"; //NON-NLS
     private static final int IS_REACHABLE_TIMEOUT_MS = 1000;
+    private static final String SERVICE_NAME = "Solr Keyword Search Service";
 
     ArtifactTextExtractor extractor = new ArtifactTextExtractor();
 
@@ -210,4 +211,9 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService
      * Autopsy service providers may not have case-level resources.
      */
     }
+
+    @Override
+    public String getServiceName() {
+        return SERVICE_NAME;
+    }
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
index 01e4f2488f..21bd9bf09d 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java
@@ -25,6 +25,7 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.logging.Level;
 import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
@@ -37,6 +38,8 @@ import org.sleuthkit.datamodel.TskException;
  */
 class StringsTextExtractor extends FileTextExtractor {
 
+    static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
+
     /**
      * Options for this extractor
     */
@@ -92,7 +95,12 @@ class StringsTextExtractor extends FileTextExtractor {
     }
 
     @Override
-    boolean isDisabled() {
+    public void logWarning(final String msg, Exception ex) {
+        logger.log(Level.WARNING, msg, ex); //NON-NLS
+    }
+
+    @Override
+    public boolean isDisabled() {
         boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
         boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
 
@@ -100,11 +108,11 @@ class StringsTextExtractor extends FileTextExtractor {
     }
 
     @Override
-    InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
+    public InputStreamReader getReader(AbstractFile sourceFile) throws Ingester.IngesterException {
+        InputStream stringStream = getInputStream(sourceFile);
         return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
     }
 
-    @Override
     InputStream getInputStream(AbstractFile sourceFile) {
         //check which extract stream to use
         if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java
index 1dcfd9c361..6ea27e733b 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java
@@ -18,10 +18,7 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
-import java.io.InputStream;
 import java.io.Reader;
-import java.util.logging.Level;
-import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 
 /**
@@ -31,9 +28,8 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem;
  * @param The subtype of SleuthkitVisitableItem an implementation
 *        is able to process.
 */
-abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
+interface TextExtractor< TextSource extends SleuthkitVisitableItem> {
 
-    static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());
 
     /**
      * Is this extractor configured such that no extraction will/should be done?
@@ -48,18 +44,8 @@ abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
      * @param msg
      * @param ex
      */
-    void logWarning(String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex); //NON-NLS
-    }
+    abstract void logWarning(String msg, Exception ex);
 
-    /**
-     * Get an input stream over the content of the given source.
-     *
-     * @param source
-     *
-     * @return
-     */
-    abstract InputStream getInputStream(TextSource source);
 
     /**
      * Get a reader that over the text extracted from the given source.
@@ -71,7 +57,7 @@ abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
      *
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
-    abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;
+    abstract Reader getReader(TextSource source) throws Ingester.IngesterException;
 
     /**
      * Get the 'object' id of the given source.
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
index 8e23d0abe8..052cb5e2e1 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
@@ -20,7 +20,6 @@ package org.sleuthkit.autopsy.keywordsearch;
 
 import com.google.common.io.CharSource;
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.Reader;
 import java.util.List;
 import java.util.MissingResourceException;
@@ -36,6 +35,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.openide.util.NbBundle;
+import org.sleuthkit.autopsy.coreutils.Logger;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
@@ -46,6 +46,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  */
 class TikaTextExtractor extends FileTextExtractor {
 
+    static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
 
     private static final List TIKA_SUPPORTED_TYPES
@@ -55,13 +56,15 @@ class TikaTextExtractor extends FileTextExtractor {
             .collect(Collectors.toList());
 
     @Override
-    void logWarning(final String msg, Exception ex) {
+    public void logWarning(final String msg, Exception ex) {
         KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
-        super.logWarning(msg, ex);
+        logger.log(Level.WARNING, msg, ex); //NON-NLS
     }
 
     @Override
-    Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
+    public Reader getReader(AbstractFile sourceFile) throws IngesterException, MissingResourceException {
+        ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
+
         Metadata metadata = new Metadata();
         //Parse the file in a task, a convenient way to have a timeout...
         final Future future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
@@ -117,13 +120,9 @@ class TikaTextExtractor extends FileTextExtractor {
         return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
     }
 
-    @Override
-    InputStream getInputStream(AbstractFile sourceFile1) {
-        return new ReadContentInputStream(sourceFile1);
-    }
 
     @Override
-    boolean isDisabled() {
+    public boolean isDisabled() {
         return false;
     }
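
Note (illustration only, not part of the patch): the Chunker added to Ingester.java is both an Iterator and an Iterable, which is why the new indexText() can simply walk it with a for-each loop and hand each Chunk to indexChunk(). Below is a minimal, hypothetical sketch of that same consumption pattern against a plain StringReader; the ChunkerDemo class, its main() method, and the sample text are invented for illustration, and the sketch assumes it is compiled inside the org.sleuthkit.autopsy.keywordsearch package so the package-private Chunker and Chunk classes are visible.

    // Hypothetical demo class -- not part of this change. Mirrors the loop in
    // Ingester.indexText(); assumes it lives in org.sleuthkit.autopsy.keywordsearch
    // so the package-private Chunker and Chunk classes added above are visible.
    import java.io.Reader;
    import java.io.StringReader;

    class ChunkerDemo {

        public static void main(String[] args) {
            Reader reader = new StringReader("text that would normally come from a TextExtractor ...");
            Chunker chunker = new Chunker(reader);
            int numChunks = 0;
            for (Chunk chunk : chunker) {
                // Each chunk targets roughly INITIAL_CHUNK_SIZE (32 KB) of text, tries to
                // stop at whitespace so words are not split, and has non-UTF-8 characters
                // replaced with '^' by sanitizeToUTF8().
                numChunks++;
                System.out.println("chunk " + numChunks + ": " + chunk.getSize() + " bytes");
            }
            // Ingester.indexText() does the same walk, sending chunk.getText() to Solr and
            // finally indexing the parent document's metadata along with the chunk count.
        }
    }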