From c42f687bfbd37e94db632a5093757fab952c29f3 Mon Sep 17 00:00:00 2001
From: millmanorama
Date: Wed, 14 Dec 2016 15:27:55 +0100
Subject: [PATCH] more cleanup

more cleanup
---
 ...ractor.java => ArtifactTextExtractor.java} |   9 +-
 .../autopsy/keywordsearch/Ingester.java       | 159 ++++++++----------
 .../keywordsearch/SolrSearchService.java      |  14 +-
 .../keywordsearch/TikaTextExtractor.java      |  24 ++-
 4 files changed, 103 insertions(+), 103 deletions(-)
 rename KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/{ArtifactExtractor.java => ArtifactTextExtractor.java} (94%)

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
similarity index 94%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactExtractor.java
rename to KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
index 712d551cc5..0c1caeebe2 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java
@@ -21,7 +21,6 @@ package org.sleuthkit.autopsy.keywordsearch;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.logging.Level;
 import org.apache.commons.io.IOUtils;
 import org.openide.util.Exceptions;
 import org.sleuthkit.autopsy.casemodule.Case;
@@ -34,8 +33,8 @@ import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.SleuthkitCase;
 import org.sleuthkit.datamodel.TskCoreException;
 
-public class ArtifactExtractor extends TextExtractor {
-    static final private Logger logger = Logger.getLogger(ArtifactExtractor.class.getName());
+public class ArtifactTextExtractor extends TextExtractor {
+    static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
 
     static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
         Content dataSource;
@@ -71,10 +70,6 @@ public class ArtifactExtractor extends TextExtractor {
         return false;
     }
 
-    @Override
-    void logWarning(String msg, Exception ex) {
-        logger.log(Level.WARNING, msg, ex); //NON-NLS }
-    }
 
     @Override
     Void newAppendixProvider() {
         return null;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index b0e24d4570..566461c185 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -33,7 +33,6 @@ import org.sleuthkit.autopsy.datamodel.ContentUtils;
 import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
-import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.DerivedFile;
 import org.sleuthkit.datamodel.Directory;
 import org.sleuthkit.datamodel.File;
@@ -47,16 +46,17 @@ import org.sleuthkit.datamodel.TskCoreException;
 /**
  * Handles indexing files on a Solr core.
  */
+//JMTODO: Should this class really be a singleton?
 class Ingester {

     private static final Logger logger = Logger.getLogger(Ingester.class.getName());
     private volatile boolean uncommitedIngests = false;
     private final Server solrServer = KeywordSearch.getServer();
-    private static final GetContentFieldsV getContentFieldsV = new GetContentFieldsV();
+    private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
     private static Ingester instance;
 
-    //for ingesting chunk as SolrInputDocument (non-content-streaming, by-pass tika)
-    //TODO use a streaming way to add content to /update handler
+    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
+    private static final int SINGLE_READ_CHARS = 1024;
     private static final int MAX_DOC_CHUNK_SIZE = 1024 * 1024;
 
     private Ingester() {
@@ -69,6 +69,7 @@ class Ingester {
         return instance;
     }
 
+    //JMTODO: this is probably useless
     @Override
     @SuppressWarnings("FinalizeDeclaration")
     protected void finalize() throws Throwable {
@@ -81,14 +82,11 @@ class Ingester {
     }
 
     /**
-     * Sends a file to Solr to have its content extracted and added to the
-     * index. commit() should be called once you're done ingesting files. If the
-     * file is a directory or ingestContent is set to false, the file name is
-     * indexed only.
+     * Sends the metadata (name, MAC times, image id, etc) for the given file to
+     * Solr to be added to the index. commit() should be called once you're done
+     * indexing.
+     *
-     * @param file          File to ingest
-     * @param ingestContent if true, index the file and the content, otherwise
-     *                      index metadata only
+     * @param file File to index.
      *
      * @throws IngesterException if there was an error processing a specific
     *                           file, but the Solr server is probably fine.
@@ -97,25 +95,35 @@ class Ingester {
         indexChunk(null, file.getName(), getContentFields(file), 0);
     }
 
+    /**
+     * Sends the metadata (artifact id, image id, etc) for the given artifact to
+     * Solr to be added to the index. commit() should be called once you're done
+     * indexing.
+     *
+     * @param artifact The artifact to index.
+     *
+     * @throws IngesterException if there was an error processing a specific
+     *                           artifact, but the Solr server is probably fine.
+     */
     void indexMetaDataOnly(BlackboardArtifact artifact) throws IngesterException {
-        indexChunk(null, artifact.getDisplayName() + "_" + artifact.getArtifactID(), getContentFields(artifact), 0);
+        indexChunk(null, new ArtifactTextExtractor().getName(artifact), getContentFields(artifact), 0);
     }
 
     /**
      * Creates a field map from FsContent, that is later sent to Solr
      *
-     * @param fsc FsContent to get fields from
+     * @param item SleuthkitVisitableItem to get fields from
      *
      * @return the map
      */
-    Map<String, String> getContentFields(SleuthkitVisitableItem fsc) {
-        return fsc.accept(getContentFieldsV);
+    Map<String, String> getContentFields(SleuthkitVisitableItem item) {
+        return item.accept(SOLR_FIELDS_VISITOR);
     }
 
     /**
-     * Visitor used to create param list to send to SOLR index.
+     * Visitor used to create fields to send to SOLR index.
      */
-    static private class GetContentFieldsV extends SleuthkitItemVisitor.Default<Map<String, String>> {
+    static private class SolrFieldsVisitor extends SleuthkitItemVisitor.Default<Map<String, String>> {
 
         @Override
         protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
@@ -124,17 +132,17 @@
 
         @Override
         public Map<String, String> visit(File f) {
-            return getCommonFileContentFields(f);
+            return getFileFields(f);
         }
 
         @Override
         public Map<String, String> visit(DerivedFile df) {
-            return getCommonFileContentFields(df);
+            return getFileFields(df);
         }
 
         @Override
         public Map<String, String> visit(Directory d) {
-            return getCommonFileContentFields(d);
+            return getFileFields(d);
         }
 
         @Override
@@ -145,15 +153,15 @@
 
         @Override
         public Map<String, String> visit(LocalFile lf) {
-            return getCommonFileContentFields(lf);
+            return getFileFields(lf);
         }
 
         @Override
         public Map<String, String> visit(SlackFile f) {
-            return getCommonFileContentFields(f);
+            return getFileFields(f);
         }
 
-        private Map<String, String> getCommonFileContentFields(AbstractFile file) {
+        private Map<String, String> getFileFields(AbstractFile file) {
             Map<String, String> params = getCommonFields(file);
             params.put(Server.Schema.CTIME.toString(), ContentUtils.getStringTimeISO8601(file.getCtime(), file));
             params.put(Server.Schema.ATIME.toString(), ContentUtils.getStringTimeISO8601(file.getAtime(), file));
@@ -166,10 +174,9 @@
             Map<String, String> params = new HashMap<>();
             params.put(Server.Schema.ID.toString(), Long.toString(af.getId()));
             try {
-                long dataSourceId = af.getDataSource().getId();
-                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSourceId));
+                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(af.getDataSource().getId()));
             } catch (TskCoreException ex) {
-                logger.log(Level.SEVERE, "Could not get data source id to properly index the file {0}", af.getId()); //NON-NLS
+                logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + af.getId(), ex); //NON-NLS
                 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
             }
             params.put(Server.Schema.FILE_NAME.toString(), af.getName());
@@ -181,29 +188,26 @@
             Map<String, String> params = new HashMap<>();
             params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
             try {
-                Content dataSource = ArtifactExtractor.getDataSource(artifact);
-                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
+                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(ArtifactTextExtractor.getDataSource(artifact).getId()));
             } catch (TskCoreException ex) {
-                logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact {0}", artifact.getArtifactID()); //NON-NLS
+                logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
                 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
             }
-
             return params;
         }
     }
 
-    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
-    private static final int SINGLE_READ_CHARS = 1024;
-
     private static final int EXTRA_CHARS = 128; //for whitespace
 
     public <A, T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<A, T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
+        final long sourceID = extractor.getID(source);
+        final String sourceName = extractor.getName(source);
+
         int numChunks = 0; //unknown until chunking is done
 
         if (extractor.noExtractionOptionsAreEnabled()) {
             return true;
        }
-        final long sourceID = extractor.getID(source);
-        final String sourceName = extractor.getName(source);
+
         Map<String, String> fields = getContentFields(source);
         A appendix = extractor.newAppendixProvider();
@@ -212,64 +216,64 @@
             //we read max 1024 chars at time, this seems to max what this Reader would return
             char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
-            long readSize;
             boolean eof = false;
             while (!eof) {
-                int totalRead = 0;
+                int chunkSizeInChars = 0;
                 if (context != null && context.fileIngestIsCancelled()) {
                     return true;
                 }
-                if ((readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) == -1) {
-                    eof = true;
-                } else {
-                    totalRead += readSize;
+                long charsRead = 0;
+                //consume bytes to fill entire chunk (but leave EXTRA_CHARS to end the word)
+                while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
+                        && (charsRead = reader.read(textChunkBuf, chunkSizeInChars, SINGLE_READ_CHARS)) != -1) {
+                    chunkSizeInChars += charsRead;
                 }
-                //consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
-                while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
-                        && (readSize = reader.read(textChunkBuf, totalRead, SINGLE_READ_CHARS)) != -1) {
-                    totalRead += readSize;
-                }
-                if (readSize == -1) {
+                if (charsRead == -1) {
                     //this is the last chunk
                     eof = true;
                 } else {
+                    chunkSizeInChars += charsRead;
+
                     //try to read char-by-char until whitespace to not break words
-                    while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
-                            && !Character.isWhitespace(textChunkBuf[totalRead - 1])
-                            && (readSize = reader.read(textChunkBuf, totalRead, 1)) != -1) {
-                        totalRead += readSize;
+                    while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - 1)
+                            && (Character.isWhitespace(textChunkBuf[chunkSizeInChars - 1]) == false)
+                            && (charsRead = reader.read(textChunkBuf, chunkSizeInChars, 1)) != -1) {
+                        chunkSizeInChars += charsRead;
                     }
-                    if (readSize == -1) {
+                    if (charsRead == -1) {
                         //this is the last chunk
                        eof = true;
                     }
                 }
-                StringBuilder sb = new StringBuilder(totalRead + 1000)
-                        .append(textChunkBuf, 0, totalRead);
-
+                StringBuilder sb;
                 if (eof) {
+                    //1000 char buffer is to allow for appendix data without needing to resize the string builder.
+                    sb = new StringBuilder(chunkSizeInChars + 1000)
+                            .append(textChunkBuf, 0, chunkSizeInChars);
                     extractor.appendDataToFinalChunk(sb, appendix);
+                } else {
+                    sb = new StringBuilder(chunkSizeInChars)
+                            .append(textChunkBuf, 0, chunkSizeInChars);
                 }
 
                 sanitizeToUTF8(sb);
-                final String chunkString = sb.toString();
 
                 String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                 fields.put(Server.Schema.ID.toString(), chunkId);
                 try {
-                    try {
-                        indexChunk(chunkString, sourceName, fields, chunkString.length());
-                    } catch (Exception ex) {
-                        throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
-                    }
+                    indexChunk(sb.toString(), sourceName, fields, sb.length());
                     numChunks++;
                 } catch (Ingester.IngesterException ingEx) {
                     extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
                             + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
                     throw ingEx; //need to rethrow to signal error and move on
+                } catch (Exception ex) {
+                    throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
                 }
             }
         } catch (IOException ex) {
@@ -325,15 +329,18 @@
      */
     void indexChunk(String chunk, String sourceName, Map<String, String> fields, int size) throws IngesterException {
         if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
+            //JMTODO: actually if we couldn't get the image id it is set to -1,
+            // but does this really mean we don't want to index it?
             //skip the file, image id unknown
+            //JMTODO: does this need to be internationalized?
             String msg = NbBundle.getMessage(Ingester.class,
-                    "Ingester.ingest.exception.unknownImgId.msg", sourceName);
+                    "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to be internationalized?
             logger.log(Level.SEVERE, msg);
             throw new IngesterException(msg);
         }
 
         SolrInputDocument updateDoc = new SolrInputDocument();
-
         for (String key : fields.keySet()) {
             updateDoc.addField(key, fields.get(key));
         }
@@ -343,38 +350,16 @@
         updateDoc.addField(Server.Schema.CONTENT.toString(), (size > 0) ? chunk : "");
 
         try {
-            //TODO consider timeout thread, or vary socket timeout based on size of indexed content
+            //TODO: consider timeout thread, or vary socket timeout based on size of indexed content
             solrServer.addDocument(updateDoc);
             uncommitedIngests = true;
         } catch (KeywordSearchModuleException ex) {
+            //JMTODO: does this need to be internationalized?
             throw new IngesterException(
                     NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
         }
     }
 
-    /**
-     * return timeout that should be used to index the content
-     *
-     * @param size size of the content
-     *
-     * @return time in seconds to use a timeout
-     */
-    static int getTimeout(long size) {
-        if (size < 1024 * 1024L) //1MB
-        {
-            return 60;
-        } else if (size < 10 * 1024 * 1024L) //10MB
-        {
-            return 1200;
-        } else if (size < 100 * 1024 * 1024L) //100MB
-        {
-            return 3600;
-        } else {
-            return 3 * 3600;
-        }
-
-    }
-
     /**
      * Tells Solr to commit (necessary before ingested files will appear in
      * searches)
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
index 4ecf65717a..233549caed 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrSearchService.java
@@ -41,7 +41,7 @@ public class SolrSearchService implements KeywordSearchService {
     private static final String SERVER_REFUSED_CONNECTION = "server refused connection"; //NON-NLS
     private static final int IS_REACHABLE_TIMEOUT_MS = 1000;
 
-    ArtifactExtractor extractor = new ArtifactExtractor();
+    ArtifactTextExtractor extractor = new ArtifactTextExtractor();
 
     @Override
     public void indexArtifact(BlackboardArtifact artifact) throws TskCoreException {
@@ -54,17 +54,15 @@ public class SolrSearchService implements KeywordSearchService {
         if (artifact.getArtifactID() > 0) {
             return;
         }
+        final Ingester ingester = Ingester.getDefault();
         try {
-            Ingester.getDefault().indexMetaDataOnly(artifact);
-        } catch (Ingester.IngesterException ex) {
-            throw new TskCoreException(ex.getCause().getMessage(), ex);
-        }
-
-        try {
-            Ingester.getDefault().indexText(extractor, artifact, null);
+            ingester.indexMetaDataOnly(artifact);
+            ingester.indexText(extractor, artifact, null);
         } catch (Ingester.IngesterException ex) {
             throw new TskCoreException(ex.getCause().getMessage(), ex);
+        } finally {
+            ingester.commit();
         }
     }
 
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
index db50ebef49..06d489363c 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java
@@ -74,6 +74,7 @@ class TikaTextExtractor extends FileTextExtractor {
     public void appendDataToFinalChunk(StringBuilder sb, Metadata meta) {
         //TODO: How do we account for this in chunking algorithm...
+        //JM: what if we always append it as a separate chunk?
         sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
         Stream.of(meta.names()).sorted().forEach(key -> {
             sb.append(key).append(": ").append(meta.get(key)).append("\n");
@@ -85,7 +86,7 @@ class TikaTextExtractor extends FileTextExtractor {
         //Parse the file in a task
         final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, meta));
         try {
-            return future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            return future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
         } catch (TimeoutException te) {
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
             logWarning(msg, te);
@@ -129,5 +130,26 @@ class TikaTextExtractor extends FileTextExtractor {
     boolean noExtractionOptionsAreEnabled() {
         return false;
     }
+
+    /**
+     * Return the timeout that should be used to index the content.
+     *
+     * @param size size of the content
+     *
+     * @return timeout in seconds
+     */
+    static int getTimeout(long size) {
+        if (size < 1024 * 1024L) //1MB
+        {
+            return 60;
+        } else if (size < 10 * 1024 * 1024L) //10MB
+        {
+            return 1200;
+        } else if (size < 100 * 1024 * 1024L) //100MB
+        {
+            return 3600;
+        } else {
+            return 3 * 3600;
+        }
+    }
 }
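
Reviewer note (not part of the patch): the reworked Ingester.indexText() above fills a fixed-size buffer in SINGLE_READ_CHARS slices, leaving EXTRA_CHARS of headroom, and then reads one character at a time until whitespace so a chunk does not end in the middle of a word. The following is a minimal, self-contained sketch of that chunking idea only; the class and method names are invented for the example and it is not the Autopsy implementation.

```java
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

/** Stand-alone illustration of whitespace-aligned chunking (names are hypothetical). */
public class ChunkingSketch {

    private static final int MAX_CHUNK_CHARS = 512 * 1024; // analogous to MAX_EXTR_TEXT_CHARS
    private static final int READ_CHARS = 1024;            // analogous to SINGLE_READ_CHARS
    private static final int EXTRA_CHARS = 128;            // headroom so the last word can finish

    static List<String> chunk(Reader reader) throws IOException {
        List<String> chunks = new ArrayList<>();
        char[] buf = new char[MAX_CHUNK_CHARS];
        boolean eof = false;
        while (!eof) {
            int size = 0;
            int read = 0;
            // fill the chunk in READ_CHARS slices, leaving EXTRA_CHARS of headroom
            while (size < MAX_CHUNK_CHARS - READ_CHARS - EXTRA_CHARS
                    && (read = reader.read(buf, size, READ_CHARS)) != -1) {
                size += read;
            }
            if (read == -1) {
                eof = true; // nothing more to read; this is the last chunk
            } else {
                // keep reading one char at a time until whitespace so a word is not split
                while (size < MAX_CHUNK_CHARS - 1
                        && !Character.isWhitespace(buf[size - 1])
                        && (read = reader.read(buf, size, 1)) != -1) {
                    size += read;
                }
                if (read == -1) {
                    eof = true;
                }
            }
            if (size > 0) {
                chunks.add(new String(buf, 0, size));
            }
        }
        return chunks;
    }

    public static void main(String[] args) throws IOException {
        // tiny smoke test with an in-memory reader
        System.out.println(chunk(new StringReader("some text to split into chunks")).size());
    }
}
```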
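A second note on indexChunk(): it copies the field map produced by SolrFieldsVisitor into a SolrInputDocument and adds the chunk text as the content field before handing the document to the Solr server. A rough sketch under those assumptions follows; the field names here are placeholders, the real ones come from Server.Schema.

```java
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.common.SolrInputDocument;

/** Rough sketch of assembling a per-chunk Solr document (field names are stand-ins). */
public class SolrDocSketch {

    static SolrInputDocument buildChunkDocument(Map<String, String> fields, String chunkText) {
        SolrInputDocument doc = new SolrInputDocument();
        // copy the metadata fields produced by the visitor
        for (Map.Entry<String, String> entry : fields.entrySet()) {
            doc.addField(entry.getKey(), entry.getValue());
        }
        // the extracted text of this chunk goes into the content field
        doc.addField("content", chunkText == null ? "" : chunkText);
        return doc;
    }

    public static void main(String[] args) {
        Map<String, String> fields = new HashMap<>();
        fields.put("id", "42_1");       // chunk id: object id + chunk number
        fields.put("image_id", "7");    // data source id, or -1 if unknown
        fields.put("file_name", "report.docx");
        System.out.println(buildChunkDocument(fields, "example chunk text"));
    }
}
```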