mirror of https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-17 18:17:43 +00:00

build out ArtifactExtractor

This commit is contained in:
parent 1a70a4e8b2
commit 85af7c57b6
ArtifactExtractor.java

@@ -6,10 +6,51 @@
 package org.sleuthkit.autopsy.keywordsearch;
 
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.HashMap;
+import org.apache.commons.io.IOUtils;
+import org.apache.solr.common.util.ContentStream;
+import org.openide.util.Exceptions;
+import org.sleuthkit.autopsy.casemodule.Case;
+import org.sleuthkit.autopsy.datamodel.ContentUtils;
+import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
+import org.sleuthkit.datamodel.BlackboardAttribute;
+import org.sleuthkit.datamodel.Content;
+import org.sleuthkit.datamodel.SleuthkitCase;
+import org.sleuthkit.datamodel.TskCoreException;
 
-public class ArtifactExtractor extends TextProvider<Void, BlackboardArtifact> {
+public class ArtifactExtractor extends TextExtractor<Void, BlackboardArtifact> {
+
+    static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
+        Content dataSource;
+        Case currentCase;
+        try {
+            currentCase = Case.getCurrentCase();
+        } catch (IllegalStateException ignore) {
+            // thorown by Case.getCurrentCase() if currentCase is null
+            return null;
+        }
+
+        SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
+        if (sleuthkitCase == null) {
+            return null;
+        }
+
+        AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
+        if (abstractFile != null) {
+
+            dataSource = abstractFile.getDataSource();
+        } else {
+            dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
+        }
+
+        if (dataSource == null) {
+            return null;
+        }
+        return dataSource;
+    }
 
     @Override
     boolean noExtractionOptionsAreEnabled() {
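The getDataSource() helper added above is what later allows timestamp attributes to be rendered in the time zone of the artifact's data source. A minimal sketch of the intended call pattern (hypothetical call site; getDataSource() and the ContentUtils call are taken from this diff, the surrounding variables are illustrative):

    // Resolve the data source an artifact belongs to; null means no open case or
    // no resolvable parent, and callers are expected to bail out in that case.
    Content dataSource = ArtifactExtractor.getDataSource(artifact);
    if (dataSource != null) {
        // ContentUtils picks the display time zone from the data source.
        String formatted = ContentUtils.getStringTime(attribute.getValueLong(), dataSource);
    }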
@@ -27,13 +68,99 @@ public class ArtifactExtractor extends TextProvider<Void, BlackboardArtifact> {
     }
 
     @Override
-    InputStream getInputStream(BlackboardArtifact source) {
-        throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+    InputStream getInputStream(BlackboardArtifact artifact) {
+        // Concatenate the string values of all attributes into a single
+        // "content" string to be indexed.
+        StringBuilder artifactContents = new StringBuilder();
+        Content dataSource;
+        try {
+            dataSource = getDataSource(artifact);
+            if (dataSource == null) {
+                return null;
+            }
+
+            for (BlackboardAttribute attribute : artifact.getAttributes()) {
+                artifactContents.append(attribute.getAttributeType().getDisplayName());
+                artifactContents.append(" : ");
+
+                // This is ugly since it will need to updated any time a new
+                // TSK_DATETIME_* attribute is added. A slightly less ugly
+                // alternative would be to assume that all date time attributes
+                // will have a name of the form "TSK_DATETIME*" and check
+                // attribute.getAttributeTypeName().startsWith("TSK_DATETIME*".
+                // The major problem with that approach is that it would require
+                // a round trip to the database to get the type name string.
+                // We have also discussed modifying BlackboardAttribute.getDisplayString()
+                // to magically format datetime attributes but that is complicated by
+                // the fact that BlackboardAttribute exists in Sleuthkit data model
+                // while the utility to determine the timezone to use is in ContentUtils
+                // in the Autopsy datamodel.
+                if (attribute.getValueType() == BlackboardAttribute.TSK_BLACKBOARD_ATTRIBUTE_VALUE_TYPE.DATETIME) {
+
+                    artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
+                } else {
+                    artifactContents.append(attribute.getDisplayString());
+                }
+                artifactContents.append(System.lineSeparator());
+            }
+        } catch (TskCoreException ex) {
+            Exceptions.printStackTrace(ex);
+            return null;
+        }
+        if (artifactContents.length() == 0) {
+            return null;
+        }
+
+        // To play by the rules of the existing text markup implementations,
+        // we need to (a) index the artifact contents in a "chunk" and
+        // (b) create a separate index entry for the base artifact.
+        // We distinguish artifact content from file content by applying a
+        // mask to the artifact id to make its value > 0x8000000000000000 (i.e. negative).
+        // First, create an index entry for the base artifact.
+        HashMap<String, String> solrFields = new HashMap<>();
+        String documentId = Long.toString(artifact.getArtifactID());
+
+        solrFields.put(Server.Schema.ID.toString(), documentId);
+
+        // Set the IMAGE_ID field.
+        solrFields.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
+
+        // Next create the index entry for the document content.
+        // The content gets added to a single chunk. We may need to add chunking
+        // support later.
+        long chunkId = 1;
+
+        documentId += "_" + Long.toString(chunkId);
+        solrFields.replace(Server.Schema.ID.toString(), documentId);
+
+        return IOUtils.toInputStream(artifactContents);
+
     }
 
     @Override
     Reader getReader(InputStream stream, BlackboardArtifact source, Void appendix) throws Ingester.IngesterException {
-        throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+        return new InputStreamReader(stream);
+    }
+
+    @Override
+    long getID(BlackboardArtifact source) {
+        return source.getArtifactID();
+    }
+
+    @Override
+    ContentStream getContentStream(byte[] encodedBytes, int length, BlackboardArtifact source) {
+        return new ByteArtifactStream(encodedBytes, length, source);
+    }
+
+    @Override
+    ContentStream getNullStream(BlackboardArtifact source) {
+        return new Ingester.NullArtifactStream(source);
+    }
+
+    @Override
+    String getName(BlackboardArtifact source) {
+        return source.getDisplayName();
     }
 
 }
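The comments in getInputStream() above describe the Solr layout the extractor has to follow: one index entry for the base artifact and a separate entry for its text, stored as a single chunk whose id is the base id plus a "_<chunkId>" suffix. A tiny worked example of the id strings built above (the artifact id value is invented):

    long artifactId = -9223372036854775807L;            // example value only
    String documentId = Long.toString(artifactId);      // "-9223372036854775807"   -> base artifact entry
    long chunkId = 1;
    documentId += "_" + Long.toString(chunkId);         // "-9223372036854775807_1" -> single content chunk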
ByteArtifactStream.java (new file)

@@ -0,0 +1,100 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2011 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import org.apache.solr.common.util.ContentStream;
+import org.openide.util.NbBundle;
+import org.sleuthkit.autopsy.coreutils.Logger;
+import static org.sleuthkit.autopsy.keywordsearch.Bundle.*;
+import org.sleuthkit.datamodel.BlackboardArtifact;
+
+/**
+ * Stream of bytes representing string with specified encoding to feed into Solr
+ * as ContentStream
+ */
+class ByteArtifactStream implements ContentStream {
+
+    //input
+    private final byte[] content; //extracted subcontent
+    private long contentSize;
+    private final BlackboardArtifact aContent; //origin
+
+    private final InputStream stream;
+
+    private static final Logger logger = Logger.getLogger(ByteContentStream.class.getName());
+
+    public ByteArtifactStream(byte[] content, long contentSize, BlackboardArtifact aContent) {
+        this.content = content;
+        this.aContent = aContent;
+        stream = new ByteArrayInputStream(content, 0, (int) contentSize);
+    }
+
+    public byte[] getByteContent() {
+        return content;
+    }
+
+    public BlackboardArtifact getSourceContent() {
+        return aContent;
+    }
+
+    @Override
+    public String getContentType() {
+        return "text/plain;charset=" + Server.DEFAULT_INDEXED_TEXT_CHARSET.name(); //NON-NLS
+    }
+
+    @Override
+    public String getName() {
+        return aContent.getDisplayName();
+    }
+
+    @Override
+    public Reader getReader() throws IOException {
+        return new InputStreamReader(stream);
+
+    }
+
+    @Override
+    public Long getSize() {
+        return contentSize;
+    }
+
+    @Override
+    @NbBundle.Messages("ByteArtifactStream.getSrcInfo.text=Artifact:{0}")
+    public String getSourceInfo() {
+        return ByteArtifactStream_getSrcInfo_text(aContent.getArtifactID());
+    }
+
+    @Override
+    public InputStream getStream() throws IOException {
+        return stream;
+    }
+
+    @Override
+    protected void finalize() throws Throwable {
+        super.finalize();
+
+        stream.close();
+    }
+
+}
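ByteArtifactStream is the artifact-side counterpart of ByteContentStream: it wraps an already-encoded text chunk so Solr can read it back with the charset declared in getContentType(). A short usage sketch (hypothetical; it mirrors ArtifactExtractor.getContentStream() above, and the artifact variable is assumed):

    // Encode one chunk of extracted text and hand it to Solr as a ContentStream.
    String chunkString = "TSK_DOMAIN : example.com";   // illustrative chunk text
    byte[] encodedBytes = chunkString.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
    ContentStream cs = new ByteArtifactStream(encodedBytes, encodedBytes.length, artifact);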
FileTextExtractor.java (new file)

@@ -0,0 +1,124 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2011-2016 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.util.Arrays;
+import java.util.List;
+import org.apache.solr.common.util.ContentStream;
+import org.sleuthkit.datamodel.AbstractFile;
+
+/**
+ * Common methods for utilities that extract text and content and divide into
+ * chunks
+ */
+abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {
+
+    /**
+     * Common options that can be used by some extractors
+     */
+    enum ExtractOptions {
+
+        EXTRACT_UTF16, ///< extract UTF16 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
+        EXTRACT_UTF8, ///< extract UTF8 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
+    };
+
+    static final List<String> BLOB_MIME_TYPES
+            = Arrays.asList(
+                    //ignore binary blob data, for which string extraction will be used
+                    "application/octet-stream", //NON-NLS
+                    "application/x-msdownload"); //NON-NLS
+
+    /** generally text extractors should ignore archives and let unpacking
+     * modules take care of them */
+    static final List<String> ARCHIVE_MIME_TYPES
+            = Arrays.asList(
+                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
+                    "application/x-7z-compressed", //NON-NLS
+                    "application/x-ace-compressed", //NON-NLS
+                    "application/x-alz-compressed", //NON-NLS
+                    "application/x-arj", //NON-NLS
+                    "application/vnd.ms-cab-compressed", //NON-NLS
+                    "application/x-cfs-compressed", //NON-NLS
+                    "application/x-dgc-compressed", //NON-NLS
+                    "application/x-apple-diskimage", //NON-NLS
+                    "application/x-gca-compressed", //NON-NLS
+                    "application/x-dar", //NON-NLS
+                    "application/x-lzx", //NON-NLS
+                    "application/x-lzh", //NON-NLS
+                    "application/x-rar-compressed", //NON-NLS
+                    "application/x-stuffit", //NON-NLS
+                    "application/x-stuffitx", //NON-NLS
+                    "application/x-gtar", //NON-NLS
+                    "application/x-archive", //NON-NLS
+                    "application/x-executable", //NON-NLS
+                    "application/x-gzip", //NON-NLS
+                    "application/zip", //NON-NLS
+                    "application/x-zoo", //NON-NLS
+                    "application/x-cpio", //NON-NLS
+                    "application/x-shar", //NON-NLS
+                    "application/x-tar", //NON-NLS
+                    "application/x-bzip", //NON-NLS
+                    "application/x-bzip2", //NON-NLS
+                    "application/x-lzip", //NON-NLS
+                    "application/x-lzma", //NON-NLS
+                    "application/x-lzop", //NON-NLS
+                    "application/x-z", //NON-NLS
+                    "application/x-compress"); //NON-NLS
+
+    /**
+     * Determines if the extractor works only for specified types is
+     * supportedTypes() or whether is a generic content extractor (such as
+     * string extractor)
+     *
+     * @return
+     */
+    abstract boolean isContentTypeSpecific();
+
+    /**
+     * Determines if the file content is supported by the extractor if
+     * isContentTypeSpecific() returns true.
+     *
+     * @param file to test if its content should be supported
+     * @param detectedFormat mime-type with detected format (such as text/plain)
+     * or null if not detected
+     *
+     * @return true if the file content is supported, false otherwise
+     */
+    abstract boolean isSupported(AbstractFile file, String detectedFormat);
+
+    @Override
+    long getID(AbstractFile source) {
+        return source.getId();
+    }
+
+    @Override
+    ContentStream getContentStream(byte[] encodedBytes, int length, AbstractFile source) {
+        return new ByteContentStream(encodedBytes, length, source);
+    }
+
+    @Override
+    ContentStream getNullStream(AbstractFile source) {
+        return new Ingester.NullContentStream(source);
+    }
+
+    @Override
+    String getName(AbstractFile source) {
+        return source.getName();
+    }
+}
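FileTextExtractor now owns the shared MIME-type lists, so callers reference them through the new class name. The check below is the pattern used elsewhere in this commit (the same test appears verbatim in the KeywordSearchIngestModule hunk further down; fileType is assumed to hold the detected MIME type):

    // Archives are skipped here and left to the archive/unpacking module.
    if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
        return;
    }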
HtmlTextExtractor.java

@@ -39,7 +39,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * divided into chunks and indexed with Solr. If HTML extraction succeeds,
  * chunks are indexed with Solr.
  */
-class HtmlTextExtractor extends TextExtractor<Void> {
+class HtmlTextExtractor extends FileTextExtractor<Void> {
 
     private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
 
Ingester.java

@@ -36,14 +36,16 @@ import org.sleuthkit.autopsy.datamodel.ContentUtils;
 import org.sleuthkit.autopsy.ingest.IngestJobContext;
 import org.sleuthkit.datamodel.AbstractContent;
 import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.BlackboardArtifact;
 import org.sleuthkit.datamodel.Content;
-import org.sleuthkit.datamodel.ContentVisitor;
 import org.sleuthkit.datamodel.DerivedFile;
 import org.sleuthkit.datamodel.Directory;
 import org.sleuthkit.datamodel.File;
 import org.sleuthkit.datamodel.LayoutFile;
 import org.sleuthkit.datamodel.LocalFile;
 import org.sleuthkit.datamodel.SlackFile;
+import org.sleuthkit.datamodel.SleuthkitItemVisitor;
+import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 import org.sleuthkit.datamodel.TskCoreException;
 
 /**

@@ -99,6 +101,11 @@ class Ingester {
         indexContentStream(new NullContentStream(file), getContentFields(file), 0);
     }
 
+    void indexMetaDataOnly(BlackboardArtifact artifact) throws IngesterException {
+
+        // indexContentStream(new NullContentStream(artifact), getContentFields(file), 0);
+    }
+
     /**
      * Sends a TextExtractor to Solr to have its content extracted and added to
      * the index. commit() should be called once you're done ingesting files.

@@ -117,6 +124,12 @@ class Ingester {
         indexContentStream(new NullContentStream(file), params, 0);
     }
 
+    private void recordNumberOfChunks(BlackboardArtifact artifact, int numChunks) throws IngesterException {
+        Map<String, String> params = getContentFields(artifact);
+        params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
+        indexContentStream(new NullArtifactStream(artifact), params, 0);
+    }
+
     /**
      * Creates a field map from FsContent, that is later sent to Solr
      *

@@ -124,19 +137,14 @@ class Ingester {
      *
      * @return the map
      */
-    Map<String, String> getContentFields(AbstractContent fsc) {
+    Map<String, String> getContentFields(SleuthkitVisitableItem fsc) {
         return fsc.accept(getContentFieldsV);
     }
 
     /**
      * Visitor used to create param list to send to SOLR index.
      */
-    static private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
+    static private class GetContentFieldsV extends SleuthkitItemVisitor.Default<Map<String, String>> {
 
-        @Override
-        protected Map<String, String> defaultVisit(Content cntnt) {
-            return new HashMap<>();
-        }
-
         @Override
         public Map<String, String> visit(File f) {
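Switching GetContentFieldsV to SleuthkitItemVisitor is what lets a single getContentFields() call serve both files and artifacts: fsc.accept(getContentFieldsV) dispatches to the overload that matches the item's concrete type. A hedged illustration (hypothetical call sites; the visit overloads are the ones in this file):

    Map<String, String> fileFields     = getContentFields(abstractFile); // lands in visit(File), visit(LayoutFile), ...
    Map<String, String> artifactFields = getContentFields(artifact);     // lands in visit(BlackboardArtifact), added below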
@@ -201,21 +209,46 @@ class Ingester {
             params.put(Server.Schema.FILE_NAME.toString(), af.getName());
             return params;
         }
 
+        @Override
+        public Map<String, String> visit(BlackboardArtifact artifact) {
+
+            Map<String, String> params = new HashMap<>();
+            params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
+            try {
+                Content dataSource = ArtifactExtractor.getDataSource(artifact);
+                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
+            } catch (TskCoreException ex) {
+                logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact {0}", artifact.getArtifactID()); //NON-NLS
+                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
+            }
+
+            return params;
+        }
+
+        @Override
+        protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
+            return new HashMap<>();
+        }
     }
 
     private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
     private static final int SINGLE_READ_CHARS = 1024;
     private static final int EXTRA_CHARS = 128; //for whitespace
 
-    public <T> boolean indexText(TextExtractor<T> extractor, AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
+    public <A, T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<A, T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
         int numChunks = 0; //unknown until chunking is done
 
         if (extractor.noExtractionOptionsAreEnabled()) {
             return true;
         }
-        T appendix = extractor.newAppendixProvider();
-        try (final InputStream stream = extractor.getInputStream(sourceFile);
-                Reader reader = extractor.getReader(stream, sourceFile, appendix);) {
+        final long sourceID = extractor.getID(source);
+        final String sourceName = extractor.getName(source);
+        Map<String, String> fields = getContentFields(source);
+
+        A appendix = extractor.newAppendixProvider();
+        try (final InputStream stream = extractor.getInputStream(source);
+                Reader reader = extractor.getReader(stream, source, appendix);) {
 
             //we read max 1024 chars at time, this seems to max what this Reader would return
             char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];

@@ -265,10 +298,10 @@ class Ingester {
 
                 //encode to bytes as UTF-8 to index as byte stream
                 byte[] encodedBytes = chunkString.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
-                String chunkId = Server.getChunkIdString(sourceFile.getId(), numChunks + 1);
+
+                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                 try {
-                    ByteContentStream bcs = new ByteContentStream(encodedBytes, encodedBytes.length, sourceFile);
-                    Map<String, String> fields = getContentFields(sourceFile);
+                    ContentStream bcs = extractor.getContentStream(encodedBytes, encodedBytes.length, source);
                     try {
                         indexContentStream(bcs, fields, encodedBytes.length);
                     } catch (Exception ex) {

@@ -277,20 +310,21 @@ class Ingester {
                     numChunks++;
                 } catch (Ingester.IngesterException ingEx) {
                     extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
-                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);//NON-NLS
+                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
 
                     throw ingEx; //need to rethrow to signal error and move on
                 }
             }
         } catch (IOException ex) {
-            extractor.logWarning("Unable to read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex);//NON-NLS
+            extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
             return false;
         } catch (Exception ex) {
-            extractor.logWarning("Unexpected error, can't read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex);//NON-NLS
+            extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
             return false;
         } finally {
             //after all chunks, ingest the parent file without content itself, and store numChunks
-            recordNumberOfChunks(sourceFile, numChunks);
+            fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
+            indexContentStream(extractor.getNullStream(source), fields, 0);
         }
         return true;
     }
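With indexText() generified over TextExtractor<A, T>, the same chunking loop now serves files and artifacts; the extractor supplies the id, name, input stream, and ContentStream flavor for its source type. A sketch of the artifact-side call (hypothetical; note that SolrSearchService later in this diff calls a two-argument form, so an overload without the IngestJobContext parameter is presumably added outside these hunks):

    // Index all chunks of an artifact's attribute text, then a final entry with the chunk count.
    Ingester.getDefault().indexText(new ArtifactExtractor(), artifact, context);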
@@ -442,7 +476,7 @@ class Ingester {
     /**
      * ContentStream associated with FsContent, but forced with no content
      */
-    private static class NullContentStream implements ContentStream {
+    static class NullContentStream implements ContentStream {
 
         AbstractContent aContent;
 

@@ -482,6 +516,50 @@ class Ingester {
         }
     }
 
+    /**
+     * ContentStream associated with Artifact, but forced with no content
+     */
+    static class NullArtifactStream implements ContentStream {
+
+        BlackboardArtifact aContent;
+
+        NullArtifactStream(BlackboardArtifact aContent) {
+            this.aContent = aContent;
+        }
+
+        @Override
+        public String getName() {
+            return aContent.getDisplayName();
+        }
+
+        @NbBundle.Messages("Ingester.NullArtifactStream.getSrcInfo.text=File:{0})\n")
+        @Override
+        public String getSourceInfo() {
+            return Bundle.Ingester_NullArtifactStream_getSrcInfo_text(aContent.getArtifactID());
+        }
+
+        @Override
+        public String getContentType() {
+            return null;
+        }
+
+        @Override
+        public Long getSize() {
+            return 0L;
+        }
+
+        @Override
+        public InputStream getStream() throws IOException {
+            return new ByteArrayInputStream(new byte[0]);
+        }
+
+        @Override
+        public Reader getReader() throws IOException {
+            throw new UnsupportedOperationException(
+                    NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getReader"));
+        }
+    }
+
     /**
      * Indicates that there was an error with the specific ingest operation, but
      * it's still okay to continue ingesting files.
KeywordSearchGlobalLanguageSettingsPanel.java

@@ -103,12 +103,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
 
     private void reloadScriptsCheckBoxes() {
         boolean utf16
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
 
         enableUTF16Checkbox.setSelected(utf16);
 
         boolean utf8
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
         enableUTF8Checkbox.setSelected(utf8);
 
         final List<SCRIPT> serviceScripts = KeywordSearchSettings.getStringExtractScripts();

@@ -127,12 +127,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
         reloadScriptsCheckBoxes();
 
         boolean utf16
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
 
         enableUTF16Checkbox.setSelected(utf16);
 
         boolean utf8
-                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+                = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
         enableUTF8Checkbox.setSelected(utf8);
         final boolean extractEnabled = utf16 || utf8;
 

@@ -257,9 +257,9 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
 
     @Override
     public void store() {
-        KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
+        KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
                 Boolean.toString(enableUTF8Checkbox.isSelected()));
-        KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
+        KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
                 Boolean.toString(enableUTF16Checkbox.isSelected()));
 
         if (toUpdate != null) {
KeywordSearchIngestModule.java

@@ -89,7 +89,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     //accessed read-only by searcher thread
 
     private boolean startedSearching = false;
-    private List<TextExtractor<?>> textExtractors;
+    private List<FileTextExtractor<?>> textExtractors;
     private StringsTextExtractor stringExtractor;
     private final KeywordSearchJobSettings settings;
     private boolean initialized = false;

@@ -415,10 +415,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * @throws IngesterException exception thrown if indexing failed
      */
     private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-        TextExtractor extractor = null;
+        FileTextExtractor extractor = null;
 
         //go over available text extractors in order, and pick the first one (most specific one)
-        for (TextExtractor fe : textExtractors) {
+        for (FileTextExtractor fe : textExtractors) {
             if (fe.isSupported(aFile, detectedFormat)) {
                 extractor = fe;
                 break;

@@ -514,7 +514,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
 
         // we skip archive formats that are opened by the archive module.
         // @@@ We could have a check here to see if the archive module was enabled though...
-        if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
+        if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
             try {
                 if (context.fileIngestIsCancelled()) {
                     return;
KeywordSearchJobSettingsPanel.java

@@ -101,8 +101,8 @@ public final class KeywordSearchJobSettingsPanel extends IngestModuleIngestJobSe
     }
 
     private void displayEncodings() {
-        String utf8 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
-        String utf16 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
+        String utf8 = KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
+        String utf16 = KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
         ArrayList<String> encodingsList = new ArrayList<>();
         if (utf8 == null || Boolean.parseBoolean(utf8)) {
             encodingsList.add("UTF8");
KeywordSearchSettings.java

@@ -211,14 +211,14 @@ class KeywordSearchSettings {
             KeywordSearchSettings.setUpdateFrequency(UpdateFrequency.DEFAULT);
         }
         //setting default Extract UTF8
-        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
+        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
             logger.log(Level.INFO, "No configuration for UTF8 found, generating default..."); //NON-NLS
-            KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
+            KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
         }
         //setting default Extract UTF16
-        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
+        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
             logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS
-            KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
+            KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
         }
         //setting default Latin-1 Script
         if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_SCRIPTS, SCRIPT.LATIN_1.name())) {
SolrSearchService.java

@@ -20,22 +20,14 @@ package org.sleuthkit.autopsy.keywordsearch;
 
 import java.io.IOException;
 import java.net.InetAddress;
-import java.util.HashMap;
 import java.util.MissingResourceException;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.HttpSolrServer;
-import org.apache.solr.common.util.ContentStreamBase.StringStream;
 import org.openide.util.NbBundle;
 import org.openide.util.lookup.ServiceProvider;
-import org.sleuthkit.autopsy.casemodule.Case;
-import org.sleuthkit.autopsy.datamodel.ContentUtils;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
 import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
-import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.BlackboardArtifact;
-import org.sleuthkit.datamodel.BlackboardAttribute;
-import org.sleuthkit.datamodel.Content;
-import org.sleuthkit.datamodel.SleuthkitCase;
 import org.sleuthkit.datamodel.TskCoreException;
 
 /**

@@ -49,6 +41,8 @@ public class SolrSearchService implements KeywordSearchService {
     private static final String SERVER_REFUSED_CONNECTION = "server refused connection"; //NON-NLS
     private static final int IS_REACHABLE_TIMEOUT_MS = 1000;
 
+    ArtifactExtractor extractor = new ArtifactExtractor();
+
     @Override
     public void indexArtifact(BlackboardArtifact artifact) throws TskCoreException {
         if (artifact == null) {
@@ -57,109 +51,18 @@ public class SolrSearchService implements KeywordSearchService {
 
         // We only support artifact indexing for Autopsy versions that use
         // the negative range for artifact ids.
-        long artifactId = artifact.getArtifactID();
-
-        if (artifactId > 0) {
+        if (artifact.getArtifactID() > 0) {
             return;
         }
 
-        Case currentCase;
-        try {
-            currentCase = Case.getCurrentCase();
-        } catch (IllegalStateException ignore) {
-            // thorown by Case.getCurrentCase() if currentCase is null
-            return;
-        }
-
-        SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
-        if (sleuthkitCase == null) {
-            return;
-        }
-
-        Content dataSource;
-        AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
-        if (abstractFile != null) {
-            dataSource = abstractFile.getDataSource();
-        } else {
-            dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
-        }
-
-        if (dataSource == null) {
-            return;
-        }
-
-        // Concatenate the string values of all attributes into a single
-        // "content" string to be indexed.
-        StringBuilder artifactContents = new StringBuilder();
-
-        for (BlackboardAttribute attribute : artifact.getAttributes()) {
-            artifactContents.append(attribute.getAttributeType().getDisplayName());
-            artifactContents.append(" : ");
-
-            // This is ugly since it will need to updated any time a new
-            // TSK_DATETIME_* attribute is added. A slightly less ugly
-            // alternative would be to assume that all date time attributes
-            // will have a name of the form "TSK_DATETIME*" and check
-            // attribute.getAttributeTypeName().startsWith("TSK_DATETIME*".
-            // The major problem with that approach is that it would require
-            // a round trip to the database to get the type name string.
-            // We have also discussed modifying BlackboardAttribute.getDisplayString()
-            // to magically format datetime attributes but that is complicated by
-            // the fact that BlackboardAttribute exists in Sleuthkit data model
-            // while the utility to determine the timezone to use is in ContentUtils
-            // in the Autopsy datamodel.
-            if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME.getTypeID()
-                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()
-                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED.getTypeID()
-                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED.getTypeID()
-                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_RCVD.getTypeID()
-                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_SENT.getTypeID()
-                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_START.getTypeID()
-                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_END.getTypeID()) {
-
-                artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
-            } else {
-                artifactContents.append(attribute.getDisplayString());
-            }
-            artifactContents.append(System.lineSeparator());
-        }
-
-        if (artifactContents.length() == 0) {
-            return;
-        }
-
-        // To play by the rules of the existing text markup implementations,
-        // we need to (a) index the artifact contents in a "chunk" and
-        // (b) create a separate index entry for the base artifact.
-        // We distinguish artifact content from file content by applying a
-        // mask to the artifact id to make its value > 0x8000000000000000 (i.e. negative).
-        // First, create an index entry for the base artifact.
-        HashMap<String, String> solrFields = new HashMap<>();
-        String documentId = Long.toString(artifactId);
-
-        solrFields.put(Server.Schema.ID.toString(), documentId);
-
-        // Set the IMAGE_ID field.
-        solrFields.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
-
         try {
-            Ingester.getDefault().indexContentStream(new StringStream(""), solrFields, 0);
+            Ingester.getDefault().indexMetaDataOnly(artifact);
         } catch (Ingester.IngesterException ex) {
             throw new TskCoreException(ex.getCause().getMessage(), ex);
         }
 
-        // Next create the index entry for the document content.
-        // The content gets added to a single chunk. We may need to add chunking
-        // support later.
-        long chunkId = 1;
-
-        documentId += "_" + Long.toString(chunkId);
-        solrFields.replace(Server.Schema.ID.toString(), documentId);
-
-        StringStream contentStream = new StringStream(artifactContents.toString());
-
         try {
-            Ingester.getDefault().indexContentStream(contentStream, solrFields, contentStream.getSize());
+            Ingester.getDefault().indexText(extractor, artifact);
         } catch (Ingester.IngesterException ex) {
             throw new TskCoreException(ex.getCause().getMessage(), ex);
         }
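The rewritten indexArtifact() keeps only the guard on the artifact id and delegates the rest to the Ingester. A small worked illustration of that guard (values invented; the "> 0" test and the 0x8000000000000000 masking rule come from this diff):

    // Ids from the masked, "negative range" scheme fail the > 0 test and get indexed;
    // legacy positive ids are skipped by the early return.
    long maskedId = 0x8000000000000001L;   // example masked artifact id, negative as a long
    long legacyId = 42L;                   // example legacy artifact id
    assert maskedId < 0;                   // indexed
    assert legacyId > 0;                   // skipped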
StringsTextExtractor.java

@@ -38,7 +38,7 @@ import org.sleuthkit.datamodel.TskException;
  * with the original source file) up to 1MB then and indexes chunks as text with
  * Solr.
  */
-class StringsTextExtractor extends TextExtractor<Void> {
+class StringsTextExtractor extends FileTextExtractor<Void> {
 
     private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
     private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;

@@ -94,8 +94,8 @@ class StringsTextExtractor extends TextExtractor<Void> {
 
     @Override
     boolean noExtractionOptionsAreEnabled() {
-        boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
-        boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+        boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+        boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
 
         return extractUTF8 == false && extractUTF16 == false;
     }

@@ -120,8 +120,8 @@ class StringsTextExtractor extends TextExtractor<Void> {
      */
     @Override
     InputStream getInputStream(AbstractFile sourceFile) {
-        boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
-        boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+        boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+        boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
 
         //check which extract stream to use
         InputStream stringStream = extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)
TextExtractor.java

@@ -1,7 +1,7 @@
 /*
  * Autopsy Forensic Browser
  *
- * Copyright 2011-2016 Basis Technology Corp.
+ * Copyright 2011-16 Basis Technology Corp.
  * Contact: carrier <at> sleuthkit <dot> org
  *
  * Licensed under the Apache License, Version 2.0 (the "License");

@@ -18,89 +18,30 @@
  */
 package org.sleuthkit.autopsy.keywordsearch;
 
-import java.util.Arrays;
-import java.util.List;
-import org.sleuthkit.datamodel.AbstractFile;
+import java.io.InputStream;
+import java.io.Reader;
+import org.apache.solr.common.util.ContentStream;
+import org.sleuthkit.datamodel.SleuthkitVisitableItem;
 
-/**
- * Common methods for utilities that extract text and content and divide into
- * chunks
- */
-abstract class TextExtractor<AppendixProvider> extends TextProvider<AppendixProvider, AbstractFile> {
-
-    /**
-     * Common options that can be used by some extractors
-     */
-    enum ExtractOptions {
-
-        EXTRACT_UTF16, ///< extract UTF16 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
-        EXTRACT_UTF8, ///< extract UTF8 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
-    };
-
-    static final List<String> BLOB_MIME_TYPES
-            = Arrays.asList(
-                    //ignore binary blob data, for which string extraction will be used
-                    "application/octet-stream", //NON-NLS
-                    "application/x-msdownload"); //NON-NLS
-
-    /** generally text extractors should ignore archives and let unpacking
-     * modules take care of them */
-    static final List<String> ARCHIVE_MIME_TYPES
-            = Arrays.asList(
-                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
-                    "application/x-7z-compressed", //NON-NLS
-                    "application/x-ace-compressed", //NON-NLS
-                    "application/x-alz-compressed", //NON-NLS
-                    "application/x-arj", //NON-NLS
-                    "application/vnd.ms-cab-compressed", //NON-NLS
-                    "application/x-cfs-compressed", //NON-NLS
-                    "application/x-dgc-compressed", //NON-NLS
-                    "application/x-apple-diskimage", //NON-NLS
-                    "application/x-gca-compressed", //NON-NLS
-                    "application/x-dar", //NON-NLS
-                    "application/x-lzx", //NON-NLS
-                    "application/x-lzh", //NON-NLS
-                    "application/x-rar-compressed", //NON-NLS
-                    "application/x-stuffit", //NON-NLS
-                    "application/x-stuffitx", //NON-NLS
-                    "application/x-gtar", //NON-NLS
-                    "application/x-archive", //NON-NLS
-                    "application/x-executable", //NON-NLS
-                    "application/x-gzip", //NON-NLS
-                    "application/zip", //NON-NLS
-                    "application/x-zoo", //NON-NLS
-                    "application/x-cpio", //NON-NLS
-                    "application/x-shar", //NON-NLS
-                    "application/x-tar", //NON-NLS
-                    "application/x-bzip", //NON-NLS
-                    "application/x-bzip2", //NON-NLS
-                    "application/x-lzip", //NON-NLS
-                    "application/x-lzma", //NON-NLS
-                    "application/x-lzop", //NON-NLS
-                    "application/x-z", //NON-NLS
-                    "application/x-compress"); //NON-NLS
-
-    /**
-     * Determines if the extractor works only for specified types is
-     * supportedTypes() or whether is a generic content extractor (such as
-     * string extractor)
-     *
-     * @return
-     */
-    abstract boolean isContentTypeSpecific();
-
-    /**
-     * Determines if the file content is supported by the extractor if
-     * isContentTypeSpecific() returns true.
-     *
-     * @param file to test if its content should be supported
-     * @param detectedFormat mime-type with detected format (such as text/plain)
-     * or null if not detected
-     *
-     * @return true if the file content is supported, false otherwise
-     */
-    abstract boolean isSupported(AbstractFile file, String detectedFormat);
+abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisitableItem> {
+
+    abstract boolean noExtractionOptionsAreEnabled();
+
+    abstract void logWarning(final String msg, Exception ex);
+
+    void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider) {
+        //no-op
+    }
+
+    abstract AppendixProvider newAppendixProvider();
+
+    abstract InputStream getInputStream(TextSource source);
+
+    abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
+
+    abstract long getID(TextSource source);
+
+    abstract ContentStream getContentStream(byte[] encodedBytes, int length, TextSource source);
+
+    abstract String getName(TextSource source);
+
+    abstract ContentStream getNullStream(TextSource source);
 }
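After this rewrite TextExtractor is a pure per-source strategy driven by Ingester.indexText(); the AppendixProvider type parameter exists so extractors such as TikaTextExtractor (whose provider is Tika's Metadata, per its declaration below) can accumulate extra data that appendDataToFinalChunk() tacks onto the last chunk. A hedged recap of how the pieces are meant to compose (the chunking loop itself sits outside the visible hunks, and exception handling is omitted):

    Metadata appendix = extractor.newAppendixProvider();          // extractor assumed to be a TikaTextExtractor
    try (InputStream stream = extractor.getInputStream(file);
            Reader reader = extractor.getReader(stream, file, appendix)) {
        // ... read characters from reader and index them in chunks ...
        StringBuilder finalChunk = new StringBuilder();
        extractor.appendDataToFinalChunk(finalChunk, appendix);   // no-op by default, overridable per extractor
    }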
TextProvider.java (deleted)

@@ -1,39 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2011-16 Basis Technology Corp.
- * Contact: carrier <at> sleuthkit <dot> org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.sleuthkit.autopsy.keywordsearch;
-
-import java.io.InputStream;
-import java.io.Reader;
-
-abstract class TextProvider<AppendixProvider, TextSource> {
-
-    abstract boolean noExtractionOptionsAreEnabled();
-
-    abstract void logWarning(final String msg, Exception ex);
-
-    void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider) {
-        //no-op
-    }
-
-    abstract AppendixProvider newAppendixProvider();
-
-    abstract InputStream getInputStream(TextSource source);
-
-    abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
-}
TikaTextExtractor.java

@@ -49,7 +49,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
  * parsers-supported content type.
  *
  */
-class TikaTextExtractor extends TextExtractor<Metadata> {
+class TikaTextExtractor extends FileTextExtractor<Metadata> {
 
     private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
     private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;

@@ -110,8 +110,8 @@ class TikaTextExtractor extends TextExtractor<Metadata> {
     @Override
     public boolean isSupported(AbstractFile file, String detectedFormat) {
         if (detectedFormat == null
-                || TextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
-                || TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
+                || FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
+                || FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
                 || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
                 || detectedFormat.equals("application/x-font-ttf")) { // Tika currently has a bug in the ttf parser in fontbox; It will throw an out of memory exception//NON-NLS
 

@@ -123,6 +123,7 @@ class TikaTextExtractor extends TextExtractor<Metadata> {
         return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
     }
 
+    @Override
     InputStream getInputStream(AbstractFile sourceFile1) {
         return new ReadContentInputStream(sourceFile1);
     }

@@ -131,4 +132,5 @@ class TikaTextExtractor extends TextExtractor<Metadata> {
     boolean noExtractionOptionsAreEnabled() {
         return false;
     }
+
 }