diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactExtractor.java
index f91ea5bca0..629c71936f 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactExtractor.java
@@ -6,10 +6,51 @@ package org.sleuthkit.autopsy.keywordsearch;
 
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.HashMap;
+import org.apache.commons.io.IOUtils;
+import org.apache.solr.common.util.ContentStream;
+import org.openide.util.Exceptions;
+import org.sleuthkit.autopsy.casemodule.Case;
+import org.sleuthkit.autopsy.datamodel.ContentUtils;
+import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
+import org.sleuthkit.datamodel.BlackboardAttribute;
+import org.sleuthkit.datamodel.Content;
+import org.sleuthkit.datamodel.SleuthkitCase;
+import org.sleuthkit.datamodel.TskCoreException;
 
-public class ArtifactExtractor extends TextProvider {
+public class ArtifactExtractor extends TextExtractor {
+
+    static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
+        Content dataSource;
+        Case currentCase;
+        try {
+            currentCase = Case.getCurrentCase();
+        } catch (IllegalStateException ignore) {
+            // thrown by Case.getCurrentCase() if there is no current case
+            return null;
+        }
+
+        SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
+        if (sleuthkitCase == null) {
+            return null;
+        }
+
+        AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
+        if (abstractFile != null) {
+            dataSource = abstractFile.getDataSource();
+        } else {
+            dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
+        }
+
+        if (dataSource == null) {
+            return null;
+        }
+        return dataSource;
+    }
 
     @Override
     boolean noExtractionOptionsAreEnabled() {
@@ -27,13 +68,99 @@ public class ArtifactExtractor extends TextProvider {
     }
 
     @Override
-    InputStream getInputStream(BlackboardArtifact source) {
-        throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+    InputStream getInputStream(BlackboardArtifact artifact) {
+
+        // Concatenate the string values of all attributes into a single
+        // "content" string to be indexed.
+        StringBuilder artifactContents = new StringBuilder();
+        Content dataSource;
+        try {
+            dataSource = getDataSource(artifact);
+            if (dataSource == null) {
+                return null;
+            }
+
+            for (BlackboardAttribute attribute : artifact.getAttributes()) {
+                artifactContents.append(attribute.getAttributeType().getDisplayName());
+                artifactContents.append(" : ");
+
+                // This is ugly since it will need to be updated any time a new
+                // TSK_DATETIME_* attribute is added. A slightly less ugly
+                // alternative would be to assume that all date time attributes
+                // will have a name of the form "TSK_DATETIME*" and check
+                // attribute.getAttributeTypeName().startsWith("TSK_DATETIME").
+                // The major problem with that approach is that it would require
+                // a round trip to the database to get the type name string.
+                // We have also discussed modifying BlackboardAttribute.getDisplayString()
+                // to magically format datetime attributes, but that is complicated by
+                // the fact that BlackboardAttribute exists in the Sleuth Kit data model
+                // while the utility to determine the timezone to use is in ContentUtils
+                // in the Autopsy datamodel.
+                if (attribute.getValueType() == BlackboardAttribute.TSK_BLACKBOARD_ATTRIBUTE_VALUE_TYPE.DATETIME) {
+                    artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
+                } else {
+                    artifactContents.append(attribute.getDisplayString());
+                }
+                artifactContents.append(System.lineSeparator());
+            }
+        } catch (TskCoreException ex) {
+            Exceptions.printStackTrace(ex);
+            return null;
+        }
+        if (artifactContents.length() == 0) {
+            return null;
+        }
+
+        // To play by the rules of the existing text markup implementations,
+        // we need to (a) index the artifact contents in a "chunk" and
+        // (b) create a separate index entry for the base artifact.
+        // We distinguish artifact content from file content by applying a
+        // mask to the artifact id to make its value > 0x8000000000000000 (i.e. negative).
+        // First, create an index entry for the base artifact.
+        HashMap<String, String> solrFields = new HashMap<>();
+        String documentId = Long.toString(artifact.getArtifactID());
+
+        solrFields.put(Server.Schema.ID.toString(), documentId);
+
+        // Set the IMAGE_ID field.
+        solrFields.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
+
+        // Next create the index entry for the document content.
+        // The content gets added to a single chunk. We may need to add chunking
+        // support later.
+        long chunkId = 1;
+
+        documentId += "_" + Long.toString(chunkId);
+        solrFields.replace(Server.Schema.ID.toString(), documentId);
+
+        return IOUtils.toInputStream(artifactContents);
+    }
 
     @Override
     Reader getReader(InputStream stream, BlackboardArtifact source, Void appendix) throws Ingester.IngesterException {
-        throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+        return new InputStreamReader(stream);
+    }
+
+    @Override
+    long getID(BlackboardArtifact source) {
+        return source.getArtifactID();
+    }
+
+    @Override
+    ContentStream getContentStream(byte[] encodedBytes, int length, BlackboardArtifact source) {
+        return new ByteArtifactStream(encodedBytes, length, source);
+    }
+
+    @Override
+    ContentStream getNullStream(BlackboardArtifact source) {
+        return new Ingester.NullArtifactStream(source);
+    }
+
+    @Override
+    String getName(BlackboardArtifact source) {
+        return source.getDisplayName();
+    }
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteArtifactStream.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteArtifactStream.java
new file mode 100644
index 0000000000..7e4898d185
--- /dev/null
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteArtifactStream.java
@@ -0,0 +1,100 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2011 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import org.apache.solr.common.util.ContentStream;
+import org.openide.util.NbBundle;
+import org.sleuthkit.autopsy.coreutils.Logger;
+import static org.sleuthkit.autopsy.keywordsearch.Bundle.*;
+import org.sleuthkit.datamodel.BlackboardArtifact;
+
+/**
+ * Stream of bytes representing a string with a specified encoding, fed into
+ * Solr as a ContentStream.
+ */
+class ByteArtifactStream implements ContentStream {
+
+    //input
+    private final byte[] content; //extracted subcontent
+    private long contentSize;
+    private final BlackboardArtifact aContent; //origin
+
+    private final InputStream stream;
+
+    private static final Logger logger = Logger.getLogger(ByteArtifactStream.class.getName());
+
+    public ByteArtifactStream(byte[] content, long contentSize, BlackboardArtifact aContent) {
+        this.content = content;
+        this.contentSize = contentSize;
+        this.aContent = aContent;
+        stream = new ByteArrayInputStream(content, 0, (int) contentSize);
+    }
+
+    public byte[] getByteContent() {
+        return content;
+    }
+
+    public BlackboardArtifact getSourceContent() {
+        return aContent;
+    }
+
+    @Override
+    public String getContentType() {
+        return "text/plain;charset=" + Server.DEFAULT_INDEXED_TEXT_CHARSET.name(); //NON-NLS
+    }
+
+    @Override
+    public String getName() {
+        return aContent.getDisplayName();
+    }
+
+    @Override
+    public Reader getReader() throws IOException {
+        return new InputStreamReader(stream);
+    }
+
+    @Override
+    public Long getSize() {
+        return contentSize;
+    }
+
+    @Override
+    @NbBundle.Messages("ByteArtifactStream.getSrcInfo.text=Artifact:{0}")
+    public String getSourceInfo() {
+        return ByteArtifactStream_getSrcInfo_text(aContent.getArtifactID());
+    }
+
+    @Override
+    public InputStream getStream() throws IOException {
+        return stream;
+    }
+
+    @Override
+    protected void finalize() throws Throwable {
+        super.finalize();
+
+        stream.close();
+    }
+
+}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java
new file mode 100644
index 0000000000..e30ea764ea
--- /dev/null
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java
@@ -0,0 +1,124 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2011-2016 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.util.Arrays;
+import java.util.List;
+import org.apache.solr.common.util.ContentStream;
+import org.sleuthkit.datamodel.AbstractFile;
+
+/**
+ * Common methods for utilities that extract text and content and divide it
+ * into chunks.
+ */
+abstract class FileTextExtractor extends TextExtractor {
+
+    /**
+     * Common options that can be used by some extractors
+     */
+    enum ExtractOptions {
+
+        EXTRACT_UTF16, ///< extract UTF16 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
+        EXTRACT_UTF8, ///< extract UTF8 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
+    };
+
+    static final List<String> BLOB_MIME_TYPES
+            = Arrays.asList(
+                    //ignore binary blob data, for which string extraction will be used
+                    "application/octet-stream", //NON-NLS
+                    "application/x-msdownload"); //NON-NLS
+
+    /** generally text extractors should ignore archives and let unpacking
+     * modules take care of them */
+    static final List<String> ARCHIVE_MIME_TYPES
+            = Arrays.asList(
+                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
+                    "application/x-7z-compressed", //NON-NLS
+                    "application/x-ace-compressed", //NON-NLS
+                    "application/x-alz-compressed", //NON-NLS
+                    "application/x-arj", //NON-NLS
+                    "application/vnd.ms-cab-compressed", //NON-NLS
+                    "application/x-cfs-compressed", //NON-NLS
+                    "application/x-dgc-compressed", //NON-NLS
+                    "application/x-apple-diskimage", //NON-NLS
+                    "application/x-gca-compressed", //NON-NLS
+                    "application/x-dar", //NON-NLS
+                    "application/x-lzx", //NON-NLS
+                    "application/x-lzh", //NON-NLS
+                    "application/x-rar-compressed", //NON-NLS
+                    "application/x-stuffit", //NON-NLS
+                    "application/x-stuffitx", //NON-NLS
+                    "application/x-gtar", //NON-NLS
+                    "application/x-archive", //NON-NLS
+                    "application/x-executable", //NON-NLS
+                    "application/x-gzip", //NON-NLS
+                    "application/zip", //NON-NLS
+                    "application/x-zoo", //NON-NLS
+                    "application/x-cpio", //NON-NLS
+                    "application/x-shar", //NON-NLS
+                    "application/x-tar", //NON-NLS
+                    "application/x-bzip", //NON-NLS
+                    "application/x-bzip2", //NON-NLS
+                    "application/x-lzip", //NON-NLS
+                    "application/x-lzma", //NON-NLS
+                    "application/x-lzop", //NON-NLS
+                    "application/x-z", //NON-NLS
+                    "application/x-compress"); //NON-NLS
+
+    /**
+     * Determines whether the extractor works only for the types specified by
+     * supportedTypes(), or whether it is a generic content extractor (such as
+     * the string extractor).
+     *
+     * @return true if the extractor is content-type specific, false otherwise
+     */
+    abstract boolean isContentTypeSpecific();
+
+    /**
+     * Determines if the file content is supported by the extractor if
+     * isContentTypeSpecific() returns true.
+     *
+     * @param file           to test if its content should be supported
+     * @param detectedFormat mime-type with detected format (such as text/plain)
+     *                       or null if not detected
+     *
+     * @return true if the file content is supported, false otherwise
+     */
+    abstract boolean isSupported(AbstractFile file, String detectedFormat);
+
+    @Override
+    long getID(AbstractFile source) {
+        return source.getId();
+    }
+
+    @Override
+    ContentStream getContentStream(byte[] encodedBytes, int length, AbstractFile source) {
+        return new ByteContentStream(encodedBytes, length, source);
+    }
+
+    @Override
+    ContentStream getNullStream(AbstractFile source) {
+        return new Ingester.NullContentStream(source);
+    }
+
+    @Override
+    String getName(AbstractFile source) {
+        return source.getName();
+    }
+}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
index 07dde1c818..5387dd7619 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java
@@ -39,7 +39,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * divided into chunks and indexed with Solr. If HTML extraction succeeds,
  * chunks are indexed with Solr.
  */
-class HtmlTextExtractor extends TextExtractor {
+class HtmlTextExtractor extends FileTextExtractor {
 
     private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index 8a804569a1..d46eafa802 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -36,14 +36,16 @@ import org.sleuthkit.autopsy.datamodel.ContentUtils;
 import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.datamodel.AbstractContent;
 import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
-import org.sleuthkit.datamodel.ContentVisitor;
 import org.sleuthkit.datamodel.DerivedFile;
 import org.sleuthkit.datamodel.Directory;
 import org.sleuthkit.datamodel.File;
 import org.sleuthkit.datamodel.LayoutFile;
 import org.sleuthkit.datamodel.LocalFile;
 import org.sleuthkit.datamodel.SlackFile;
+import org.sleuthkit.datamodel.SleuthkitItemVisitor;
+import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 import org.sleuthkit.datamodel.TskCoreException;
 
 /**
@@ -99,6 +101,11 @@ class Ingester {
         indexContentStream(new NullContentStream(file), getContentFields(file), 0);
     }
 
+    void indexMetaDataOnly(BlackboardArtifact artifact) throws IngesterException {
+
+//        indexContentStream(new NullContentStream(artifact), getContentFields(file), 0);
+    }
+
     /**
      * Sends a TextExtractor to Solr to have its content extracted and added to
      * the index. commit() should be called once you're done ingesting files.
@@ -117,6 +124,12 @@ class Ingester {
         indexContentStream(new NullContentStream(file), params, 0);
     }
 
+    private void recordNumberOfChunks(BlackboardArtifact artifact, int numChunks) throws IngesterException {
+        Map<String, String> params = getContentFields(artifact);
+        params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
+        indexContentStream(new NullArtifactStream(artifact), params, 0);
+    }
+
     /**
      * Creates a field map from FsContent, that is later sent to Solr
      *
@@ -124,19 +137,14 @@
      *
      * @return the map
      */
-    Map<String, String> getContentFields(AbstractContent fsc) {
+    Map<String, String> getContentFields(SleuthkitVisitableItem fsc) {
         return fsc.accept(getContentFieldsV);
     }
 
     /**
      * Visitor used to create param list to send to SOLR index.
      */
-    static private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
-
-        @Override
-        protected Map<String, String> defaultVisit(Content cntnt) {
-            return new HashMap<>();
-        }
+    static private class GetContentFieldsV extends SleuthkitItemVisitor.Default<Map<String, String>> {
 
         @Override
         public Map<String, String> visit(File f) {
@@ -201,21 +209,46 @@ class Ingester {
             params.put(Server.Schema.FILE_NAME.toString(), af.getName());
             return params;
         }
+
+        @Override
+        public Map<String, String> visit(BlackboardArtifact artifact) {
+
+            Map<String, String> params = new HashMap<>();
+            params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
+            try {
+                Content dataSource = ArtifactExtractor.getDataSource(artifact);
+                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
+            } catch (TskCoreException ex) {
+                logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact {0}", artifact.getArtifactID()); //NON-NLS
+                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
+            }
+
+            return params;
+        }
+
+        @Override
+        protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
+            return new HashMap<>();
+        }
     }
 
     private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
     private static final int SINGLE_READ_CHARS = 1024;
     private static final int EXTRA_CHARS = 128; //for whitespace
 
-    public <T> boolean indexText(TextExtractor<T> extractor, AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
+    public <A, T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<A, T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
         int numChunks = 0; //unknown until chunking is done
 
         if (extractor.noExtractionOptionsAreEnabled()) {
             return true;
         }
-        T appendix = extractor.newAppendixProvider();
-        try (final InputStream stream = extractor.getInputStream(sourceFile);
-                Reader reader = extractor.getReader(stream, sourceFile, appendix);) {
+        final long sourceID = extractor.getID(source);
+        final String sourceName = extractor.getName(source);
+        Map<String, String> fields = getContentFields(source);
+
+        A appendix = extractor.newAppendixProvider();
+        try (final InputStream stream = extractor.getInputStream(source);
+                Reader reader = extractor.getReader(stream, source, appendix);) {
 
             //we read max 1024 chars at time, this seems to max what this Reader would return
             char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
@@ -265,10 +298,10 @@ class Ingester {
 
                 //encode to bytes as UTF-8 to index as byte stream
                 byte[] encodedBytes = chunkString.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
-                String chunkId = Server.getChunkIdString(sourceFile.getId(), numChunks + 1);
+
+                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                 try {
-                    ByteContentStream bcs = new ByteContentStream(encodedBytes, encodedBytes.length, sourceFile);
-                    Map<String, String> fields = getContentFields(sourceFile);
+                    ContentStream bcs = extractor.getContentStream(encodedBytes, encodedBytes.length, source);
                     try {
                         indexContentStream(bcs, fields, encodedBytes.length);
                     } catch (Exception ex) {
@@ -277,20 +310,21 @@ class Ingester {
                     numChunks++;
                 } catch (Ingester.IngesterException ingEx) {
                     extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
-                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);//NON-NLS
+                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
                     throw ingEx; //need to rethrow to signal error and move on
                 }
             }
         } catch (IOException ex) {
-            extractor.logWarning("Unable to read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex);//NON-NLS
+            extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
             return false;
         } catch (Exception ex) {
-            extractor.logWarning("Unexpected error, can't read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex);//NON-NLS
+            extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
             return false;
         } finally {
             //after all chunks, ingest the parent file without content itself, and store numChunks
-            recordNumberOfChunks(sourceFile, numChunks);
+            fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
+            indexContentStream(extractor.getNullStream(source), fields, 0);
         }
         return true;
     }
@@ -442,7 +476,7 @@ class Ingester {
     /**
      * ContentStream associated with FsContent, but forced with no content
      */
-    private static class NullContentStream implements ContentStream {
+    static class NullContentStream implements ContentStream {
 
         AbstractContent aContent;
 
@@ -482,6 +516,50 @@
         }
     }
 
+    /**
+     * ContentStream associated with Artifact, but forced with no content
+     */
+    static class NullArtifactStream implements ContentStream {
+
+        BlackboardArtifact aContent;
+
+        NullArtifactStream(BlackboardArtifact aContent) {
+            this.aContent = aContent;
+        }
+
+        @Override
+        public String getName() {
+            return aContent.getDisplayName();
+        }
+
+        @NbBundle.Messages("Ingester.NullArtifactStream.getSrcInfo.text=Artifact:{0}\n")
+        @Override
+        public String getSourceInfo() {
+            return Bundle.Ingester_NullArtifactStream_getSrcInfo_text(aContent.getArtifactID());
+        }
+
+        @Override
+        public String getContentType() {
+            return null;
+        }
+
+        @Override
+        public Long getSize() {
+            return 0L;
+        }
+
+        @Override
+        public InputStream getStream() throws IOException {
+            return new ByteArrayInputStream(new byte[0]);
+        }
+
+        @Override
+        public Reader getReader() throws IOException {
+            throw new UnsupportedOperationException(
+                    NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getReader"));
+        }
+    }
+
     /**
      * Indicates that there was an error with the specific ingest operation, but
      * it's still okay to continue ingesting files.
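Editorial aside (not part of the patch above): the net effect of the Ingester changes is that indexText() no longer assumes an AbstractFile source. The extractor supplies the source ID, display name, chunk ContentStream, and metadata-only "null" stream, while getContentFields() dispatches through SleuthkitItemVisitor. The sketch below is a hypothetical illustration of how calling code in this package might drive the same entry point for both files and artifacts; the IndexingSketch class and indexBoth() method are invented names, and the sketch assumes the package-private types and the public indexText() signature shown in the diff.

// Hypothetical usage sketch, assuming the classes introduced in this patch.
package org.sleuthkit.autopsy.keywordsearch;

import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;

class IndexingSketch {

    /**
     * Indexes a file and an artifact through the same Ingester entry point.
     * The extractor argument decides how the source is read and which
     * ContentStream implementation (ByteContentStream vs. ByteArtifactStream)
     * is sent to Solr, so Ingester needs no per-source-type logic.
     */
    static void indexBoth(Ingester ingester, FileTextExtractor fileExtractor, AbstractFile file,
            ArtifactExtractor artifactExtractor, BlackboardArtifact artifact,
            IngestJobContext context) throws Ingester.IngesterException {
        ingester.indexText(fileExtractor, file, context);         // chunks keyed by file object id
        ingester.indexText(artifactExtractor, artifact, context); // chunks keyed by artifact id
    }
}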
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
index 2a590e6862..c1f1e2a5a7 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchGlobalLanguageSettingsPanel.java
@@ -103,12 +103,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
 
     private void reloadScriptsCheckBoxes() {
         boolean utf16
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
 
         enableUTF16Checkbox.setSelected(utf16);
 
         boolean utf8
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
 
         enableUTF8Checkbox.setSelected(utf8);
 
         final List