diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java
deleted file mode 100644
index 5253e5e240..0000000000
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2011-2016 Basis Technology Corp.
- * Contact: carrier sleuthkit org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.sleuthkit.autopsy.keywordsearch;
-
-import java.nio.charset.Charset;
-import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
-
-/**
- * A representation of a chunk of text from a file that can be used, when
- * supplied with an Ingester, to index the chunk for search.
- */
-final class AbstractFileChunk {
-
-    private final int chunkNumber;
-    private final TextExtractor textExtractor;
-
-    /**
-     * Constructs a representation of a chunk of text from a file that can be
-     * used, when supplied with an Ingester, to index the chunk for search.
-     *
-     * @param textExtractor A TextExtractor for the file.
-     * @param chunkNumber A sequence number for the chunk.
-     */
-    AbstractFileChunk(TextExtractor textExtractor, int chunkNumber) {
-        this.textExtractor = textExtractor;
-        this.chunkNumber = chunkNumber;
-    }
-
-    /**
-     * Gets the TextExtractor for the source file of the text chunk.
-     *
-     * @return A reference to the TextExtractor.
-     */
-    TextExtractor getTextExtractor() {
-        return textExtractor;
-    }
-
-    /**
-     * Gets the sequence number of the text chunk.
-     *
-     * @return The chunk number.
-     */
-    int getChunkNumber() {
-        return chunkNumber;
-    }
-
-    /**
-     * Gets the id of the text chunk.
-     *
-     * @return An id of the form [source file object id]_[chunk number]
-     */
-    String getChunkId() {
-        return Server.getChunkIdString(this.textExtractor.getSourceFile().getId(), this.chunkNumber);
-    }
-
-    /**
-     * Indexes the text chunk.
-     *
-     * @param ingester An Ingester to do the indexing.
-     * @param chunkBytes The raw bytes of the text chunk.
-     * @param chunkSize The size of the text chunk in bytes.
-     * @param charSet The char set to use during indexing.
- * - * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException - */ - void index(Ingester ingester, byte[] chunkBytes, long chunkSize, Charset charSet) throws IngesterException { - ByteContentStream bcs = new ByteContentStream(chunkBytes, chunkSize, textExtractor.getSourceFile(), charSet); - try { - ingester.ingest(this, bcs, chunkBytes.length); - } catch (Exception ex) { - throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", getChunkId()), ex); - } - } - -} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringContentStream.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringContentStream.java deleted file mode 100644 index e8a7efdde0..0000000000 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringContentStream.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Autopsy Forensic Browser - * - * Copyright 2011-2016 Basis Technology Corp. - * Contact: carrier sleuthkit org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.sleuthkit.autopsy.keywordsearch; - -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.nio.charset.Charset; - -import org.openide.util.NbBundle; -import org.apache.solr.common.util.ContentStream; -import org.sleuthkit.datamodel.AbstractContent; -import org.sleuthkit.datamodel.AbstractFile; - -/** - * Wrapper over InputStream that implements ContentStream to feed to Solr. 
- */ -class AbstractFileStringContentStream implements ContentStream { - //input - - private final AbstractFile content; - private final Charset charset; - //converted - private final InputStream stream; - - public AbstractFileStringContentStream(AbstractFile content, Charset charset, InputStream inputStream) { - this.content = content; - this.charset = charset; - this.stream = inputStream; - } - - public AbstractContent getSourceContent() { - return content; - } - - @Override - public String getContentType() { - return "text/plain;charset=" + charset.name(); //NON-NLS - } - - @Override - public String getName() { - return content.getName(); - } - - @Override - public Reader getReader() throws IOException { - return new InputStreamReader(stream); - - } - - @Override - public Long getSize() { - //return convertedLength; - throw new UnsupportedOperationException( - NbBundle.getMessage(this.getClass(), "AbstractFileStringContentStream.getSize.exception.msg")); - } - - @Override - public String getSourceInfo() { - return NbBundle.getMessage(this.getClass(), "AbstractFileStringContentStream.getSrcInfo.text", content.getId()); - } - - @Override - public InputStream getStream() throws IOException { - return stream; - } - - @Override - protected void finalize() throws Throwable { - super.finalize(); - - stream.close(); - } -} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringIntStream.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringIntStream.java deleted file mode 100644 index 7b6ba6458c..0000000000 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringIntStream.java +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Autopsy Forensic Browser - * - * Copyright 2012 Basis Technology Corp. - * Contact: carrier sleuthkit org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.sleuthkit.autopsy.keywordsearch; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.charset.Charset; -import java.util.List; -import org.sleuthkit.autopsy.coreutils.Logger; -import org.sleuthkit.autopsy.coreutils.StringExtract; -import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractResult; -import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; -import org.sleuthkit.datamodel.AbstractFile; -import org.sleuthkit.datamodel.TskCoreException; - -/** - * Wrapper over StringExtract to provide streaming API Given AbstractFile - * object, extract international strings from the file and read output as a - * stream of UTF-8 strings as encoded bytes. 
- * - */ -class AbstractFileStringIntStream extends InputStream { - - private static final Logger logger = Logger.getLogger(AbstractFileStringIntStream.class.getName()); - private static final int FILE_BUF_SIZE = 1024 * 1024; - private AbstractFile content; - private final byte[] oneCharBuf = new byte[1]; - private final StringExtract stringExtractor; - private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE]; - private long fileReadOffset = 0L; - private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user - private int convertBuffOffset = 0; //offset to start returning data to user on next read() - private int bytesInConvertBuff = 0; //amount of data currently in the buffer - private boolean fileEOF = false; //if file has more bytes to read - private boolean extractUTF8; - private boolean extractUTF16; - private Charset outCharset; - - private StringExtractResult lastExtractResult; - - /** - * Constructs new stream object that does conversion from file, to extracted - * strings, then to byte stream, for specified script, auto-detected - * encoding (UTF8, UTF16LE, UTF16BE), and specified output byte stream - * encoding - * - * @param content input content to process and turn into a stream to - * convert into strings - * @param scripts a list of scripts to consider - * @param extractUTF8 whether to extract utf8 encoding - * @param extractUTF16 whether to extract utf16 encoding - * @param outCharset encoding to use in the output byte stream - */ - public AbstractFileStringIntStream(AbstractFile content, List tags + scripts.append(tag.getElement().getContent()).append("\n"); + + } else if (tag.getName().equals("a")) { + //NON-NLS + numLinks++; + links.append(numLinks).append(") "); + links.append(tag.getTagContent()).append("\n"); + + } else if (tag.getName().equals("img")) { + //NON-NLS + numImages++; + images.append(numImages).append(") "); + images.append(tag.getTagContent()).append("\n"); + + } else if (tag.getTagType().equals(StartTagType.COMMENT)) { + numComments++; + comments.append(numComments).append(") "); + comments.append(tag.getTagContent()).append("\n"); + + } else { + // Make sure it has an attribute + Attributes atts = tag.getAttributes(); + if (atts != null && atts.length() > 0) { + numOthers++; + others.append(numOthers).append(") "); + others.append(tag.getName()).append(":"); + others.append(tag.getTagContent()).append("\n"); + + } + } + } + stringBuilder.append(text).append("\n\n"); + stringBuilder.append("----------NONVISIBLE TEXT----------\n\n"); //NON-NLS + if (numScripts > 0) { + stringBuilder.append("---Scripts---\n"); //NON-NLS + stringBuilder.append(scripts).append("\n"); + } + if (numLinks > 0) { + stringBuilder.append("---Links---\n"); //NON-NLS + stringBuilder.append(links).append("\n"); + } + if (numImages > 0) { + stringBuilder.append("---Images---\n"); //NON-NLS + stringBuilder.append(images).append("\n"); + } + if (numComments > 0) { + stringBuilder.append("---Comments---\n"); //NON-NLS + stringBuilder.append(comments).append("\n"); + } + if (numOthers > 0) { + stringBuilder.append("---Others---\n"); //NON-NLS + stringBuilder.append(others).append("\n"); + } + // All done, now make it a reader + return new StringReader(stringBuilder.toString()); + } catch (IOException ex) { + throw new Ingester.IngesterException("Error extracting HTML from content.", ex); + } + } + + @Override + InputStream getInputStream(AbstractFile sourceFile1) { + return new ReadContentInputStream(sourceFile1); + } + + @Override + boolean 
isDisabled() { + return false; } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java index 0995535538..312bff73cc 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java @@ -18,49 +18,45 @@ */ package org.sleuthkit.autopsy.keywordsearch; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.Reader; -import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.Map; import java.util.logging.Level; import org.apache.solr.client.solrj.SolrServerException; -import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.SolrInputDocument; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.TextUtil; import org.sleuthkit.autopsy.datamodel.ContentUtils; -import org.sleuthkit.datamodel.AbstractContent; +import org.sleuthkit.autopsy.ingest.IngestJobContext; import org.sleuthkit.datamodel.AbstractFile; -import org.sleuthkit.datamodel.Content; -import org.sleuthkit.datamodel.ContentVisitor; +import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.DerivedFile; import org.sleuthkit.datamodel.Directory; import org.sleuthkit.datamodel.File; import org.sleuthkit.datamodel.LayoutFile; import org.sleuthkit.datamodel.LocalFile; -import org.sleuthkit.datamodel.ReadContentInputStream; import org.sleuthkit.datamodel.SlackFile; +import org.sleuthkit.datamodel.SleuthkitItemVisitor; +import org.sleuthkit.datamodel.SleuthkitVisitableItem; import org.sleuthkit.datamodel.TskCoreException; /** * Handles indexing files on a Solr core. */ +//JMTODO: Should this class really be a singleton? class Ingester { private static final Logger logger = Logger.getLogger(Ingester.class.getName()); private volatile boolean uncommitedIngests = false; private final Server solrServer = KeywordSearch.getServer(); - private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV(); + private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor(); private static Ingester instance; - - //for ingesting chunk as SolrInputDocument (non-content-streaming, by-pass tika) - //TODO use a streaming way to add content to /update handler - private static final int MAX_DOC_CHUNK_SIZE = 32 * 1024; - private static final String ENCODING = "UTF-8"; //NON-NLS + private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars + private static final int SINGLE_READ_CHARS = 1024; + private static final int EXTRA_CHARS = 128; private Ingester() { } @@ -72,6 +68,7 @@ class Ingester { return instance; } + //JMTODO: this is probably useless @Override @SuppressWarnings("FinalizeDeclaration") protected void finalize() throws Throwable { @@ -84,123 +81,68 @@ class Ingester { } /** - * Sends a stream to Solr to have its content extracted and added to the - * index. commit() should be called once you're done ingesting files. + * Sends the metadata (name, MAC times, image id, etc) for the given file to + * Solr to be added to the index. commit() should be called once you're done + * indexing. * - * @param afscs File AbstractFileStringContentStream to ingest + * @param file File to index. * * @throws IngesterException if there was an error processing a specific * file, but the Solr server is probably fine. 
*/ - void ingest(AbstractFileStringContentStream afscs) throws IngesterException { - Map params = getContentFields(afscs.getSourceContent()); - ingest(afscs, params, afscs.getSourceContent().getSize()); + void indexMetaDataOnly(AbstractFile file) throws IngesterException { + indexChunk("", file.getName(), getContentFields(file)); } /** - * Sends a TextExtractor to Solr to have its content extracted and added to - * the index. commit() should be called once you're done ingesting files. - * FileExtract represents a parent of extracted file with actual content. - * The parent itself has no content, only meta data and is used to associate - * the extracted AbstractFileChunk + * Sends the metadata (artifact id, image id, etc) for the given artifact to + * Solr to be added to the index. commit() should be called once you're done + * indexing. * - * @param fe TextExtractor to ingest + * @param artifact The artifact to index. * * @throws IngesterException if there was an error processing a specific - * file, but the Solr server is probably fine. + * artifact, but the Solr server is probably fine. */ - void ingest(TextExtractor fe) throws IngesterException { - Map params = getContentFields(fe.getSourceFile()); - - params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(fe.getNumChunks())); - - ingest(new NullContentStream(fe.getSourceFile()), params, 0); + void indexMetaDataOnly(BlackboardArtifact artifact) throws IngesterException { + indexChunk("", new ArtifactTextExtractor().getName(artifact), getContentFields(artifact)); } /** - * Sends a AbstractFileChunk to Solr and its extracted content stream to be - * added to the index. commit() should be called once you're done ingesting - * files. AbstractFileChunk represents a file chunk and its chunk content. + * Creates a field map from a SleuthkitVisitableItem, that is later sent to + * Solr. * - * @param fec AbstractFileChunk to ingest - * @param size approx. size of the stream in bytes, used for timeout - * estimation + * @param item SleuthkitVisitableItem to get fields from * - * @throws IngesterException if there was an error processing a specific - * file, but the Solr server is probably fine. + * @return the map from field name to value (as a string) */ - void ingest(AbstractFileChunk fec, ByteContentStream bcs, int size) throws IngesterException { - AbstractContent sourceContent = bcs.getSourceContent(); - Map params = getContentFields(sourceContent); - - //overwrite id with the chunk id - params.put(Server.Schema.ID.toString(), - Server.getChunkIdString(sourceContent.getId(), fec.getChunkNumber())); - - ingest(bcs, params, size); + private Map getContentFields(SleuthkitVisitableItem item) { + return item.accept(SOLR_FIELDS_VISITOR); } /** - * Sends a file to Solr to have its content extracted and added to the - * index. commit() should be called once you're done ingesting files. If the - * file is a directory or ingestContent is set to false, the file name is - * indexed only. - * - * @param file File to ingest - * @param ingestContent if true, index the file and the content, otherwise - * indesx metadata only - * - * @throws IngesterException if there was an error processing a specific - * file, but the Solr server is probably fine. + * Visitor used to create fields to send to SOLR index. 
*/ - void ingest(AbstractFile file, boolean ingestContent) throws IngesterException { - if (ingestContent == false || file.isDir()) { - ingest(new NullContentStream(file), getContentFields(file), 0); - } else { - ingest(new FscContentStream(file), getContentFields(file), file.getSize()); - } - } - - /** - * Creates a field map from FsContent, that is later sent to Solr - * - * @param fsc FsContent to get fields from - * - * @return the map - */ - private Map getContentFields(AbstractContent fsc) { - return fsc.accept(getContentFieldsV); - } - - /** - * Visitor used to create param list to send to SOLR index. - */ - private class GetContentFieldsV extends ContentVisitor.Default> { + static private class SolrFieldsVisitor extends SleuthkitItemVisitor.Default> { @Override - protected Map defaultVisit(Content cntnt) { + protected Map defaultVisit(SleuthkitVisitableItem svi) { return new HashMap<>(); } @Override public Map visit(File f) { - Map params = getCommonFields(f); - getCommonFileContentFields(params, f); - return params; + return getCommonAndMACTimeFields(f); } @Override public Map visit(DerivedFile df) { - Map params = getCommonFields(df); - getCommonFileContentFields(params, df); - return params; + return getCommonAndMACTimeFields(df); } @Override public Map visit(Directory d) { - Map params = getCommonFields(d); - getCommonFileContentFields(params, d); - return params; + return getCommonAndMACTimeFields(d); } @Override @@ -211,19 +153,25 @@ class Ingester { @Override public Map visit(LocalFile lf) { - Map params = getCommonFields(lf); - getCommonFileContentFields(params, lf); - return params; + return getCommonAndMACTimeFields(lf); } @Override public Map visit(SlackFile f) { - Map params = getCommonFields(f); - getCommonFileContentFields(params, f); - return params; + return getCommonAndMACTimeFields(f); } - private Map getCommonFileContentFields(Map params, AbstractFile file) { + /** + * Get the field map for AbstractFiles that includes MAC times and the + * fields that are common to all file classes. + * + * @param file The file to get fields for + * + * @return The field map, including MAC times and common fields, for the + * give file. + */ + private Map getCommonAndMACTimeFields(AbstractFile file) { + Map params = getCommonFields(file); params.put(Server.Schema.CTIME.toString(), ContentUtils.getStringTimeISO8601(file.getCtime(), file)); params.put(Server.Schema.ATIME.toString(), ContentUtils.getStringTimeISO8601(file.getAtime(), file)); params.put(Server.Schema.MTIME.toString(), ContentUtils.getStringTimeISO8601(file.getMtime(), file)); @@ -231,140 +179,219 @@ class Ingester { return params; } + /** + * Get the field map for AbstractFiles that is common to all file + * classes + * + * @param file The file to get fields for + * + * @return The field map of fields that are common to all file classes. 
+ */ private Map getCommonFields(AbstractFile af) { Map params = new HashMap<>(); params.put(Server.Schema.ID.toString(), Long.toString(af.getId())); try { - long dataSourceId = af.getDataSource().getId(); - params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSourceId)); + params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(af.getDataSource().getId())); } catch (TskCoreException ex) { - logger.log(Level.SEVERE, "Could not get data source id to properly index the file {0}", af.getId()); //NON-NLS + logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + af.getId(), ex); //NON-NLS params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1)); } - params.put(Server.Schema.FILE_NAME.toString(), af.getName()); return params; } + + /** + * Get the field map for artifacts. + * + * @param artifact The artifact to get fields for. + * + * @return The field map for the given artifact. + */ + @Override + public Map visit(BlackboardArtifact artifact) { + Map params = new HashMap<>(); + params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID())); + try { + params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(ArtifactTextExtractor.getDataSource(artifact).getId())); + } catch (TskCoreException ex) { + logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS + params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1)); + } + return params; + } } /** - * Indexing method that bypasses Tika, assumes pure text It reads and - * converts the entire content stream to string, assuming UTF8 since we - * can't use streaming approach for Solr /update handler. This should be - * safe, since all content is now in max 1MB chunks. + * Use the given TextExtractor to extract text from the given source. The + * text will be chunked and each chunk passed to Solr to add to the index. + * + * + * @param The type of the Appendix provider that provides + * additional text to append to the final chunk. + * @param A subclass of SleuthkitVisibleItem. + * @param extractor The TextExtractor that will be used to extract text from + * the given source. + * @param source The source from which text will be extracted, chunked, + * and indexed. + * @param context The ingest job context that can be used to cancel this + * process. + * + * @return True if this method executed normally. or False if there was an + * unexpected exception. //JMTODO: This policy needs to be reviewed. 
+ * + * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException + */ + < T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException { + final long sourceID = extractor.getID(source); + final String sourceName = extractor.getName(source); + + int numChunks = 0; //unknown until chunking is done + + if (extractor.isDisabled()) { + /* some Extrctors, notable the strings extractor, have options which + * can be configured such that no extraction should be done */ + return true; + } + + Map fields = getContentFields(source); + //Get a stream and a reader for that stream + try (final InputStream stream = extractor.getInputStream(source); + Reader reader = extractor.getReader(stream, source);) { + + //we read max 1024 chars at time, this seems to max what some Readers would return + char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS]; + + boolean eof = false; //have we read until the end of the file yet + while (!eof) { + int chunkSizeInChars = 0; // the size in chars of the chunk (so far) + if (context != null && context.fileIngestIsCancelled()) { + return true; + } + long charsRead = 0; // number of chars read in the most recent read operation + //consume bytes to fill entire chunk (but leave EXTRA_CHARS to end the word) + while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS) + && (charsRead = reader.read(textChunkBuf, chunkSizeInChars, SINGLE_READ_CHARS)) != -1) { + chunkSizeInChars += charsRead; + } + + if (charsRead == -1) { + //this is the last chunk + eof = true; + } else { + chunkSizeInChars += charsRead; + + //if we haven't reached the end of the file, + //try to read char-by-char until whitespace to not break words + while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - 1) + && (Character.isWhitespace(textChunkBuf[chunkSizeInChars - 1]) == false) + && (charsRead = reader.read(textChunkBuf, chunkSizeInChars, 1)) != -1) { + chunkSizeInChars += charsRead; + } + if (charsRead == -1) { + //this is the last chunk + eof = true; + } + } + + StringBuilder sb = new StringBuilder(chunkSizeInChars) + .append(textChunkBuf, 0, chunkSizeInChars); + + sanitizeToUTF8(sb); //replace non UTF8 chars with '^' + + String chunkId = Server.getChunkIdString(sourceID, numChunks + 1); + fields.put(Server.Schema.ID.toString(), chunkId); + try { + //pass the chunk to method that adds it to Solr index + indexChunk(sb.toString(), sourceName, fields); + numChunks++; + } catch (Ingester.IngesterException ingEx) { + extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS + + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS + + throw ingEx; //need to rethrow to signal error and move on + } catch (Exception ex) { + throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex); + } + } + } catch (IOException ex) { + extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS + return false; + } catch (Exception ex) { + extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS + return false; + } finally { + //after all chunks, index just the meta data, including the numChunks, of the parent file + fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks)); + fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id + indexChunk(null, sourceName, 
fields); + } + return true; + } + + /** + * Sanitize the given StringBuilder by replacing non-UTF-8 characters with + * caret '^' + * + * @param sb the StringBuilder to sanitize + * + * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping + * function? + */ + private static void sanitizeToUTF8(StringBuilder sb) { + final int length = sb.length(); + + // Sanitize by replacing non-UTF-8 characters with caret '^' + for (int i = 0; i < length; i++) { + if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) { + sb.replace(i, i + 1, "^"); + } + } + } + + /** + * Add one chunk as to the Solr index as a seperate sold document. * * TODO see if can use a byte or string streaming way to add content to * /update handler e.g. with XMLUpdateRequestHandler (deprecated in SOlr * 4.0.0), see if possible to stream with UpdateRequestHandler * - * @param cs + * @param chunk The chunk content as a string * @param fields * @param size * * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException */ - void ingest(ContentStream cs, Map fields, final long size) throws IngesterException { + private void indexChunk(String chunk, String sourceName, Map fields) throws IngesterException { if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) { + //JMTODO: actually if the we couldn't get the image id it is set to -1, + // but does this really mean we don't want to index it? + //skip the file, image id unknown - String msg = NbBundle.getMessage(this.getClass(), - "Ingester.ingest.exception.unknownImgId.msg", cs.getName()); + //JMTODO: does this need to ne internationalized? + String msg = NbBundle.getMessage(Ingester.class, + "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized? logger.log(Level.SEVERE, msg); throw new IngesterException(msg); } - final byte[] docChunkContentBuf = new byte[MAX_DOC_CHUNK_SIZE]; + //Make a SolrInputDocument out of the field map SolrInputDocument updateDoc = new SolrInputDocument(); - for (String key : fields.keySet()) { updateDoc.addField(key, fields.get(key)); } - - //using size here, but we are no longer ingesting entire files - //size is normally a chunk size, up to 1MB - if (size > 0) { - // TODO (RC): Use try with resources, adjust exception messages - InputStream is = null; - int read = 0; - try { - is = cs.getStream(); - read = is.read(docChunkContentBuf); - } catch (IOException ex) { - throw new IngesterException( - NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.cantReadStream.msg", - cs.getName())); - } finally { - if (null != is) { - try { - is.close(); - } catch (IOException ex) { - logger.log(Level.WARNING, "Could not close input stream after reading content, " + cs.getName(), ex); //NON-NLS - } - } - } - - if (read != 0) { - String s = ""; - try { - s = new String(docChunkContentBuf, 0, read, ENCODING); - // Sanitize by replacing non-UTF-8 characters with caret '^' before adding to index - char[] chars = null; - for (int i = 0; i < s.length(); i++) { - if (!TextUtil.isValidSolrUTF8(s.charAt(i))) { - // only convert string to char[] if there is a non-UTF8 character - if (chars == null) { - chars = s.toCharArray(); - } - chars[i] = '^'; - } - } - // check if the string was modified (i.e. 
there was a non-UTF8 character found) - if (chars != null) { - s = new String(chars); - } - } catch (UnsupportedEncodingException ex) { - logger.log(Level.SEVERE, "Unsupported encoding", ex); //NON-NLS - } - updateDoc.addField(Server.Schema.CONTENT.toString(), s); - } else { - updateDoc.addField(Server.Schema.CONTENT.toString(), ""); - } - } else { - //no content, such as case when 0th chunk indexed - updateDoc.addField(Server.Schema.CONTENT.toString(), ""); - } + //add the content to the SolrInputDocument + //JMTODO: can we just add it to the field map before passing that in? + updateDoc.addField(Server.Schema.CONTENT.toString(), chunk); try { - //TODO consider timeout thread, or vary socket timeout based on size of indexed content + //TODO: consider timeout thread, or vary socket timeout based on size of indexed content solrServer.addDocument(updateDoc); uncommitedIngests = true; } catch (KeywordSearchModuleException ex) { + //JMTODO: does this need to ne internationalized? throw new IngesterException( - NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.err.msg", cs.getName()), ex); + NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex); } - - } - - /** - * return timeout that should be used to index the content - * - * @param size size of the content - * - * @return time in seconds to use a timeout - */ - static int getTimeout(long size) { - if (size < 1024 * 1024L) //1MB - { - return 60; - } else if (size < 10 * 1024 * 1024L) //10MB - { - return 1200; - } else if (size < 100 * 1024 * 1024L) //100MB - { - return 3600; - } else { - return 3 * 3600; - } - } /** @@ -380,92 +407,6 @@ class Ingester { } } - /** - * ContentStream to read() the data from a FsContent object - */ - private static class FscContentStream implements ContentStream { - - private AbstractFile f; - - FscContentStream(AbstractFile f) { - this.f = f; - } - - @Override - public String getName() { - return f.getName(); - } - - @Override - public String getSourceInfo() { - return NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getSrcInfo", f.getId()); - } - - @Override - public String getContentType() { - return null; - } - - @Override - public Long getSize() { - return f.getSize(); - } - - @Override - public InputStream getStream() throws IOException { - return new ReadContentInputStream(f); - } - - @Override - public Reader getReader() throws IOException { - throw new UnsupportedOperationException( - NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getReader")); - } - } - - /** - * ContentStream associated with FsContent, but forced with no content - */ - private static class NullContentStream implements ContentStream { - - AbstractContent aContent; - - NullContentStream(AbstractContent aContent) { - this.aContent = aContent; - } - - @Override - public String getName() { - return aContent.getName(); - } - - @Override - public String getSourceInfo() { - return NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getSrcInfo.text", aContent.getId()); - } - - @Override - public String getContentType() { - return null; - } - - @Override - public Long getSize() { - return 0L; - } - - @Override - public InputStream getStream() throws IOException { - return new ByteArrayInputStream(new byte[0]); - } - - @Override - public Reader getReader() throws IOException { - throw new UnsupportedOperationException( - NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getReader")); - } - } - /** * Indicates that there was an error with 
the specific ingest operation, but
      * it's still okay to continue ingesting files.
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
index 2a590e6862..9e58235318 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
@@ -103,12 +103,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
     private void reloadScriptsCheckBoxes() {
         boolean utf16
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
         enableUTF16Checkbox.setSelected(utf16);
         boolean utf8
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
         enableUTF8Checkbox.setSelected(utf8);
         final List