Mirror of https://github.com/overcuriousity/autopsy-flatpak.git (synced 2025-07-17 18:17:43 +00:00)

Commit 85af7c57b6 (parent 1a70a4e8b2)

build out ArtifactExtractor
@@ -6,10 +6,51 @@
package org.sleuthkit.autopsy.keywordsearch;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.util.ContentStream;
import org.openide.util.Exceptions;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;

-public class ArtifactExtractor extends TextProvider<Void, BlackboardArtifact> {
+public class ArtifactExtractor extends TextExtractor<Void, BlackboardArtifact> {

    static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
        Content dataSource;
        Case currentCase;
        try {
            currentCase = Case.getCurrentCase();
        } catch (IllegalStateException ignore) {
            // thrown by Case.getCurrentCase() if currentCase is null
            return null;
        }

        SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
        if (sleuthkitCase == null) {
            return null;
        }

        AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
        if (abstractFile != null) {
            dataSource = abstractFile.getDataSource();
        } else {
            dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
        }

        if (dataSource == null) {
            return null;
        }
        return dataSource;
    }

    @Override
    boolean noExtractionOptionsAreEnabled() {
@@ -27,13 +68,99 @@ public class ArtifactExtractor extends TextProvider<Void, BlackboardArtifact> {
    }

    @Override
-   InputStream getInputStream(BlackboardArtifact source) {
-       throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+   InputStream getInputStream(BlackboardArtifact artifact) {

        // Concatenate the string values of all attributes into a single
        // "content" string to be indexed.
        StringBuilder artifactContents = new StringBuilder();
        Content dataSource;
        try {
            dataSource = getDataSource(artifact);
            if (dataSource == null) {
                return null;
            }

            for (BlackboardAttribute attribute : artifact.getAttributes()) {
                artifactContents.append(attribute.getAttributeType().getDisplayName());
                artifactContents.append(" : ");

                // This is ugly since it will need to be updated any time a new
                // TSK_DATETIME_* attribute is added. A slightly less ugly
                // alternative would be to assume that all date time attributes
                // will have a name of the form "TSK_DATETIME*" and check
                // attribute.getAttributeTypeName().startsWith("TSK_DATETIME").
                // The major problem with that approach is that it would require
                // a round trip to the database to get the type name string.
                // We have also discussed modifying BlackboardAttribute.getDisplayString()
                // to magically format datetime attributes but that is complicated by
                // the fact that BlackboardAttribute exists in the Sleuthkit data model
                // while the utility to determine the timezone to use is in ContentUtils
                // in the Autopsy datamodel.
                if (attribute.getValueType() == BlackboardAttribute.TSK_BLACKBOARD_ATTRIBUTE_VALUE_TYPE.DATETIME) {
                    artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
                } else {
                    artifactContents.append(attribute.getDisplayString());
                }
                artifactContents.append(System.lineSeparator());
            }
        } catch (TskCoreException ex) {
            Exceptions.printStackTrace(ex);
            return null;
        }
        if (artifactContents.length() == 0) {
            return null;
        }

        // To play by the rules of the existing text markup implementations,
        // we need to (a) index the artifact contents in a "chunk" and
        // (b) create a separate index entry for the base artifact.
        // We distinguish artifact content from file content by applying a
        // mask to the artifact id to make its value > 0x8000000000000000 (i.e. negative).
        // First, create an index entry for the base artifact.
        HashMap<String, String> solrFields = new HashMap<>();
        String documentId = Long.toString(artifact.getArtifactID());

        solrFields.put(Server.Schema.ID.toString(), documentId);

        // Set the IMAGE_ID field.
        solrFields.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));

        // Next create the index entry for the document content.
        // The content gets added to a single chunk. We may need to add chunking
        // support later.
        long chunkId = 1;

        documentId += "_" + Long.toString(chunkId);
        solrFields.replace(Server.Schema.ID.toString(), documentId);

        return IOUtils.toInputStream(artifactContents);
    }

    @Override
    Reader getReader(InputStream stream, BlackboardArtifact source, Void appendix) throws Ingester.IngesterException {
-       throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+       return new InputStreamReader(stream);
    }

    @Override
    long getID(BlackboardArtifact source) {
        return source.getArtifactID();
    }

    @Override
    ContentStream getContentStream(byte[] encodedBytes, int length, BlackboardArtifact source) {
        return new ByteArtifactStream(encodedBytes, length, source);
    }

    @Override
    ContentStream getNullStream(BlackboardArtifact source) {
        return new Ingester.NullArtifactStream(source);
    }

    @Override
    String getName(BlackboardArtifact source) {
        return source.getDisplayName();
    }
}
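Aside (not part of this commit): the 0x8000000000000000 comment above refers to the keyword-search convention that files are indexed under their positive object ids while artifacts are indexed under negative ids, so the two id ranges cannot collide. A minimal sketch of how the Solr document ids for one artifact line up under that scheme, mirroring the id handling in getInputStream() above (the artifact variable is assumed to be in scope):

    // Sketch only: artifact ids indexed here are expected to be negative.
    long artifactId = artifact.getArtifactID();
    String baseDocumentId = Long.toString(artifactId);    // index entry for the artifact itself
    String chunkDocumentId = baseDocumentId + "_" + 1;    // its single content chunk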
@@ -0,0 +1,100 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import org.apache.solr.common.util.ContentStream;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import static org.sleuthkit.autopsy.keywordsearch.Bundle.*;
import org.sleuthkit.datamodel.BlackboardArtifact;

/**
 * Stream of bytes representing a string with a specified encoding, fed into
 * Solr as a ContentStream.
 */
class ByteArtifactStream implements ContentStream {

    //input
    private final byte[] content; //extracted subcontent
    private final long contentSize;
    private final BlackboardArtifact aContent; //origin

    private final InputStream stream;

    private static final Logger logger = Logger.getLogger(ByteArtifactStream.class.getName());

    public ByteArtifactStream(byte[] content, long contentSize, BlackboardArtifact aContent) {
        this.content = content;
        this.contentSize = contentSize;
        this.aContent = aContent;
        stream = new ByteArrayInputStream(content, 0, (int) contentSize);
    }

    public byte[] getByteContent() {
        return content;
    }

    public BlackboardArtifact getSourceContent() {
        return aContent;
    }

    @Override
    public String getContentType() {
        return "text/plain;charset=" + Server.DEFAULT_INDEXED_TEXT_CHARSET.name(); //NON-NLS
    }

    @Override
    public String getName() {
        return aContent.getDisplayName();
    }

    @Override
    public Reader getReader() throws IOException {
        return new InputStreamReader(stream);
    }

    @Override
    public Long getSize() {
        return contentSize;
    }

    @Override
    @NbBundle.Messages("ByteArtifactStream.getSrcInfo.text=Artifact:{0}")
    public String getSourceInfo() {
        return ByteArtifactStream_getSrcInfo_text(aContent.getArtifactID());
    }

    @Override
    public InputStream getStream() throws IOException {
        return stream;
    }

    @Override
    protected void finalize() throws Throwable {
        super.finalize();

        stream.close();
    }

}
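A minimal usage sketch for the new stream class (illustrative only; the artifact variable and the sample text are assumptions, while the constructor signature and Server.DEFAULT_INDEXED_TEXT_CHARSET come from this commit):

    // Sketch: wrap extracted artifact text as a Solr ContentStream.
    String text = "TSK_URL : http://www.example.com";                    // hypothetical extracted attribute text
    byte[] encoded = text.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET); // same charset the index expects
    ByteArtifactStream bas = new ByteArtifactStream(encoded, encoded.length, artifact);
    // bas.getName() reports the artifact's display name; bas.getSize() the byte count.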
@@ -0,0 +1,124 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.util.Arrays;
import java.util.List;
import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.datamodel.AbstractFile;

/**
 * Common methods for utilities that extract text and content and divide into
 * chunks
 */
abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {

    /**
     * Common options that can be used by some extractors
     */
    enum ExtractOptions {

        EXTRACT_UTF16, ///< extract UTF16 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
        EXTRACT_UTF8, ///< extract UTF8 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
    };

    static final List<String> BLOB_MIME_TYPES
            = Arrays.asList(
                    //ignore binary blob data, for which string extraction will be used
                    "application/octet-stream", //NON-NLS
                    "application/x-msdownload"); //NON-NLS

    /** generally text extractors should ignore archives and let unpacking
     * modules take care of them */
    static final List<String> ARCHIVE_MIME_TYPES
            = Arrays.asList(
                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                    "application/x-7z-compressed", //NON-NLS
                    "application/x-ace-compressed", //NON-NLS
                    "application/x-alz-compressed", //NON-NLS
                    "application/x-arj", //NON-NLS
                    "application/vnd.ms-cab-compressed", //NON-NLS
                    "application/x-cfs-compressed", //NON-NLS
                    "application/x-dgc-compressed", //NON-NLS
                    "application/x-apple-diskimage", //NON-NLS
                    "application/x-gca-compressed", //NON-NLS
                    "application/x-dar", //NON-NLS
                    "application/x-lzx", //NON-NLS
                    "application/x-lzh", //NON-NLS
                    "application/x-rar-compressed", //NON-NLS
                    "application/x-stuffit", //NON-NLS
                    "application/x-stuffitx", //NON-NLS
                    "application/x-gtar", //NON-NLS
                    "application/x-archive", //NON-NLS
                    "application/x-executable", //NON-NLS
                    "application/x-gzip", //NON-NLS
                    "application/zip", //NON-NLS
                    "application/x-zoo", //NON-NLS
                    "application/x-cpio", //NON-NLS
                    "application/x-shar", //NON-NLS
                    "application/x-tar", //NON-NLS
                    "application/x-bzip", //NON-NLS
                    "application/x-bzip2", //NON-NLS
                    "application/x-lzip", //NON-NLS
                    "application/x-lzma", //NON-NLS
                    "application/x-lzop", //NON-NLS
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS

    /**
     * Determines whether the extractor works only for the types specified in
     * supportedTypes() or whether it is a generic content extractor (such as
     * the string extractor).
     *
     * @return true if the extractor is content-type specific, false otherwise
     */
    abstract boolean isContentTypeSpecific();

    /**
     * Determines if the file content is supported by the extractor if
     * isContentTypeSpecific() returns true.
     *
     * @param file           the file to test if its content should be supported
     * @param detectedFormat the MIME type with the detected format (such as
     *                       text/plain) or null if not detected
     *
     * @return true if the file content is supported, false otherwise
     */
    abstract boolean isSupported(AbstractFile file, String detectedFormat);

    @Override
    long getID(AbstractFile source) {
        return source.getId();
    }

    @Override
    ContentStream getContentStream(byte[] encodedBytes, int length, AbstractFile source) {
        return new ByteContentStream(encodedBytes, length, source);
    }

    @Override
    ContentStream getNullStream(AbstractFile source) {
        return new Ingester.NullContentStream(source);
    }

    @Override
    String getName(AbstractFile source) {
        return source.getName();
    }
}
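To make the division of labor concrete, here is a rough sketch of what a minimal concrete file extractor would have to supply (illustrative only, not part of this commit; it assumes the same package and the usual imports as the extractors in this change, including java.util.logging.Level, and the class name is hypothetical):

    // Sketch: minimal FileTextExtractor subclass shape.
    class PlainTextExtractor extends FileTextExtractor<Void> {

        private static final Logger logger = Logger.getLogger(PlainTextExtractor.class.getName());

        @Override
        boolean isContentTypeSpecific() {
            return true; // only handles the types declared in isSupported()
        }

        @Override
        boolean isSupported(AbstractFile file, String detectedFormat) {
            return "text/plain".equals(detectedFormat);
        }

        @Override
        boolean noExtractionOptionsAreEnabled() {
            return false; // nothing configurable to disable
        }

        @Override
        void logWarning(String msg, Exception ex) {
            logger.log(Level.WARNING, msg, ex);
        }

        @Override
        Void newAppendixProvider() {
            return null; // no appendix needed
        }

        @Override
        InputStream getInputStream(AbstractFile source) {
            return new ReadContentInputStream(source);
        }

        @Override
        Reader getReader(InputStream stream, AbstractFile source, Void appendix) {
            return new InputStreamReader(stream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
        }
    }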
@@ -39,7 +39,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 * divided into chunks and indexed with Solr. If HTML extraction succeeds,
 * chunks are indexed with Solr.
 */
-class HtmlTextExtractor extends TextExtractor<Void> {
+class HtmlTextExtractor extends FileTextExtractor<Void> {

    private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());

@@ -36,14 +36,16 @@ import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.datamodel.AbstractContent;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ContentVisitor;
import org.sleuthkit.datamodel.DerivedFile;
import org.sleuthkit.datamodel.Directory;
import org.sleuthkit.datamodel.File;
import org.sleuthkit.datamodel.LayoutFile;
import org.sleuthkit.datamodel.LocalFile;
import org.sleuthkit.datamodel.SlackFile;
import org.sleuthkit.datamodel.SleuthkitItemVisitor;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
import org.sleuthkit.datamodel.TskCoreException;

/**
@@ -99,6 +101,11 @@ class Ingester {
        indexContentStream(new NullContentStream(file), getContentFields(file), 0);
    }

    void indexMetaDataOnly(BlackboardArtifact artifact) throws IngesterException {
//        indexContentStream(new NullContentStream(artifact), getContentFields(file), 0);
    }

    /**
     * Sends a TextExtractor to Solr to have its content extracted and added to
     * the index. commit() should be called once you're done ingesting files.
@@ -117,6 +124,12 @@ class Ingester {
        indexContentStream(new NullContentStream(file), params, 0);
    }

    private void recordNumberOfChunks(BlackboardArtifact artifact, int numChunks) throws IngesterException {
        Map<String, String> params = getContentFields(artifact);
        params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
        indexContentStream(new NullArtifactStream(artifact), params, 0);
    }

    /**
     * Creates a field map from FsContent that is later sent to Solr
     *
@@ -124,19 +137,14 @@ class Ingester {
     *
     * @return the map
     */
-   Map<String, String> getContentFields(AbstractContent fsc) {
+   Map<String, String> getContentFields(SleuthkitVisitableItem fsc) {
        return fsc.accept(getContentFieldsV);
    }

    /**
     * Visitor used to create param list to send to SOLR index.
     */
-   static private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
-
-       @Override
-       protected Map<String, String> defaultVisit(Content cntnt) {
-           return new HashMap<>();
-       }
+   static private class GetContentFieldsV extends SleuthkitItemVisitor.Default<Map<String, String>> {

        @Override
        public Map<String, String> visit(File f) {
@@ -201,21 +209,46 @@ class Ingester {
            params.put(Server.Schema.FILE_NAME.toString(), af.getName());
            return params;
        }

        @Override
        public Map<String, String> visit(BlackboardArtifact artifact) {

            Map<String, String> params = new HashMap<>();
            params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
            try {
                Content dataSource = ArtifactExtractor.getDataSource(artifact);
                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
            } catch (TskCoreException ex) {
                logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact {0}", artifact.getArtifactID()); //NON-NLS
                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
            }

            return params;
        }

        @Override
        protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
            return new HashMap<>();
        }
    }

    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
    private static final int SINGLE_READ_CHARS = 1024;
    private static final int EXTRA_CHARS = 128; //for whitespace

-   public <T> boolean indexText(TextExtractor<T> extractor, AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
+   public <A, T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<A, T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
        int numChunks = 0; //unknown until chunking is done

        if (extractor.noExtractionOptionsAreEnabled()) {
            return true;
        }
-       T appendix = extractor.newAppendixProvider();
-       try (final InputStream stream = extractor.getInputStream(sourceFile);
-               Reader reader = extractor.getReader(stream, sourceFile, appendix);) {
+       final long sourceID = extractor.getID(source);
+       final String sourceName = extractor.getName(source);
+       Map<String, String> fields = getContentFields(source);
+
+       A appendix = extractor.newAppendixProvider();
+       try (final InputStream stream = extractor.getInputStream(source);
+               Reader reader = extractor.getReader(stream, source, appendix);) {

            //we read max 1024 chars at a time, this seems to max what this Reader would return
            char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
|
||||
|
||||
//encode to bytes as UTF-8 to index as byte stream
|
||||
byte[] encodedBytes = chunkString.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
|
||||
String chunkId = Server.getChunkIdString(sourceFile.getId(), numChunks + 1);
|
||||
|
||||
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
|
||||
try {
|
||||
ByteContentStream bcs = new ByteContentStream(encodedBytes, encodedBytes.length, sourceFile);
|
||||
Map<String, String> fields = getContentFields(sourceFile);
|
||||
ContentStream bcs = extractor.getContentStream(encodedBytes, encodedBytes.length, source);
|
||||
try {
|
||||
indexContentStream(bcs, fields, encodedBytes.length);
|
||||
} catch (Exception ex) {
|
||||
@ -277,20 +310,21 @@ class Ingester {
|
||||
numChunks++;
|
||||
} catch (Ingester.IngesterException ingEx) {
|
||||
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
|
||||
+ sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);//NON-NLS
|
||||
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
|
||||
|
||||
throw ingEx; //need to rethrow to signal error and move on
|
||||
}
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
extractor.logWarning("Unable to read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex);//NON-NLS
|
||||
extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
||||
return false;
|
||||
} catch (Exception ex) {
|
||||
extractor.logWarning("Unexpected error, can't read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex);//NON-NLS
|
||||
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
||||
return false;
|
||||
} finally {
|
||||
//after all chunks, ingest the parent file without content itself, and store numChunks
|
||||
recordNumberOfChunks(sourceFile, numChunks);
|
||||
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
|
||||
indexContentStream(extractor.getNullStream(source), fields, 0);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -442,7 +476,7 @@ class Ingester {
|
||||
/**
|
||||
* ContentStream associated with FsContent, but forced with no content
|
||||
*/
|
||||
private static class NullContentStream implements ContentStream {
|
||||
static class NullContentStream implements ContentStream {
|
||||
|
||||
AbstractContent aContent;
|
||||
|
||||
@ -482,6 +516,50 @@ class Ingester {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* ContentStream associated with Artifact, but forced with no content
|
||||
*/
|
||||
static class NullArtifactStream implements ContentStream {
|
||||
|
||||
BlackboardArtifact aContent;
|
||||
|
||||
NullArtifactStream(BlackboardArtifact aContent) {
|
||||
this.aContent = aContent;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return aContent.getDisplayName();
|
||||
}
|
||||
|
||||
@NbBundle.Messages("Ingester.NullArtifactStream.getSrcInfo.text=File:{0})\n")
|
||||
@Override
|
||||
public String getSourceInfo() {
|
||||
return Bundle.Ingester_NullArtifactStream_getSrcInfo_text(aContent.getArtifactID());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getContentType() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long getSize() {
|
||||
return 0L;
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputStream getStream() throws IOException {
|
||||
return new ByteArrayInputStream(new byte[0]);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader getReader() throws IOException {
|
||||
throw new UnsupportedOperationException(
|
||||
NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getReader"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates that there was an error with the specific ingest operation, but
|
||||
* it's still okay to continue ingesting files.
|
||||
|
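From the caller's side, a sketch of the generified indexText(...) path (illustrative only; it follows the three-argument signature introduced above, assumes abstractFile, artifact, and context are in scope, relies on the extractors' existing no-argument constructors, and omits IngesterException handling):

    // Sketch: one indexing path now serves both files and artifacts.
    Ingester ingester = Ingester.getDefault();
    // A file goes through a FileTextExtractor subclass chosen for its MIME type...
    ingester.indexText(new StringsTextExtractor(), abstractFile, context);
    // ...while an artifact goes through the new ArtifactExtractor.
    ingester.indexText(new ArtifactExtractor(), artifact, context);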
@@ -103,12 +103,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem

    private void reloadScriptsCheckBoxes() {
        boolean utf16
-               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));

        enableUTF16Checkbox.setSelected(utf16);

        boolean utf8
-               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
        enableUTF8Checkbox.setSelected(utf8);

        final List<SCRIPT> serviceScripts = KeywordSearchSettings.getStringExtractScripts();
@@ -127,12 +127,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
        reloadScriptsCheckBoxes();

        boolean utf16
-               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));

        enableUTF16Checkbox.setSelected(utf16);

        boolean utf8
-               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+               = Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
        enableUTF8Checkbox.setSelected(utf8);
        final boolean extractEnabled = utf16 || utf8;

@@ -257,9 +257,9 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem

    @Override
    public void store() {
-       KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
+       KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
                Boolean.toString(enableUTF8Checkbox.isSelected()));
-       KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
+       KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
                Boolean.toString(enableUTF16Checkbox.isSelected()));

        if (toUpdate != null) {
@@ -89,7 +89,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
    //accessed read-only by searcher thread

    private boolean startedSearching = false;
-   private List<TextExtractor<?>> textExtractors;
+   private List<FileTextExtractor<?>> textExtractors;
    private StringsTextExtractor stringExtractor;
    private final KeywordSearchJobSettings settings;
    private boolean initialized = false;
@@ -415,10 +415,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     * @throws IngesterException exception thrown if indexing failed
     */
    private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-       TextExtractor extractor = null;
+       FileTextExtractor extractor = null;

        //go over available text extractors in order, and pick the first one (most specific one)
-       for (TextExtractor fe : textExtractors) {
+       for (FileTextExtractor fe : textExtractors) {
            if (fe.isSupported(aFile, detectedFormat)) {
                extractor = fe;
                break;
@@ -514,7 +514,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {

        // we skip archive formats that are opened by the archive module.
        // @@@ We could have a check here to see if the archive module was enabled though...
-       if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
+       if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
            try {
                if (context.fileIngestIsCancelled()) {
                    return;
@@ -101,8 +101,8 @@ public final class KeywordSearchJobSettingsPanel extends IngestModuleIngestJobSe
    }

    private void displayEncodings() {
-       String utf8 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
-       String utf16 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
+       String utf8 = KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
+       String utf16 = KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
        ArrayList<String> encodingsList = new ArrayList<>();
        if (utf8 == null || Boolean.parseBoolean(utf8)) {
            encodingsList.add("UTF8");
@@ -211,14 +211,14 @@ class KeywordSearchSettings {
            KeywordSearchSettings.setUpdateFrequency(UpdateFrequency.DEFAULT);
        }
        //setting default Extract UTF8
-       if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
+       if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
            logger.log(Level.INFO, "No configuration for UTF8 found, generating default..."); //NON-NLS
-           KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
+           KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
        }
        //setting default Extract UTF16
-       if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
+       if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
            logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS
-           KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
+           KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
        }
        //setting default Latin-1 Script
        if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_SCRIPTS, SCRIPT.LATIN_1.name())) {
@@ -20,22 +20,14 @@ package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.net.InetAddress;
import java.util.HashMap;
import java.util.MissingResourceException;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.util.ContentStreamBase.StringStream;
import org.openide.util.NbBundle;
import org.openide.util.lookup.ServiceProvider;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;

/**
@@ -49,6 +41,8 @@ public class SolrSearchService implements KeywordSearchService {
    private static final String SERVER_REFUSED_CONNECTION = "server refused connection"; //NON-NLS
    private static final int IS_REACHABLE_TIMEOUT_MS = 1000;

+   ArtifactExtractor extractor = new ArtifactExtractor();

    @Override
    public void indexArtifact(BlackboardArtifact artifact) throws TskCoreException {
        if (artifact == null) {
@@ -57,109 +51,18 @@ public class SolrSearchService implements KeywordSearchService {

        // We only support artifact indexing for Autopsy versions that use
        // the negative range for artifact ids.
        long artifactId = artifact.getArtifactID();

-       if (artifactId > 0) {
+       if (artifact.getArtifactID() > 0) {
            return;
        }

        Case currentCase;
        try {
            currentCase = Case.getCurrentCase();
        } catch (IllegalStateException ignore) {
            // thrown by Case.getCurrentCase() if currentCase is null
            return;
        }

        SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
        if (sleuthkitCase == null) {
            return;
        }

        Content dataSource;
        AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
        if (abstractFile != null) {
            dataSource = abstractFile.getDataSource();
        } else {
            dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
        }

        if (dataSource == null) {
            return;
        }

        // Concatenate the string values of all attributes into a single
        // "content" string to be indexed.
        StringBuilder artifactContents = new StringBuilder();

        for (BlackboardAttribute attribute : artifact.getAttributes()) {
            artifactContents.append(attribute.getAttributeType().getDisplayName());
            artifactContents.append(" : ");

            // This is ugly since it will need to be updated any time a new
            // TSK_DATETIME_* attribute is added. A slightly less ugly
            // alternative would be to assume that all date time attributes
            // will have a name of the form "TSK_DATETIME*" and check
            // attribute.getAttributeTypeName().startsWith("TSK_DATETIME").
            // The major problem with that approach is that it would require
            // a round trip to the database to get the type name string.
            // We have also discussed modifying BlackboardAttribute.getDisplayString()
            // to magically format datetime attributes but that is complicated by
            // the fact that BlackboardAttribute exists in the Sleuthkit data model
            // while the utility to determine the timezone to use is in ContentUtils
            // in the Autopsy datamodel.
            if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_RCVD.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_SENT.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_START.getTypeID()
                    || attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_END.getTypeID()) {

                artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
            } else {
                artifactContents.append(attribute.getDisplayString());
            }
            artifactContents.append(System.lineSeparator());
        }

        if (artifactContents.length() == 0) {
            return;
        }

        // To play by the rules of the existing text markup implementations,
        // we need to (a) index the artifact contents in a "chunk" and
        // (b) create a separate index entry for the base artifact.
        // We distinguish artifact content from file content by applying a
        // mask to the artifact id to make its value > 0x8000000000000000 (i.e. negative).
        // First, create an index entry for the base artifact.
        HashMap<String, String> solrFields = new HashMap<>();
        String documentId = Long.toString(artifactId);

        solrFields.put(Server.Schema.ID.toString(), documentId);

        // Set the IMAGE_ID field.
        solrFields.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));

        try {
-           Ingester.getDefault().indexContentStream(new StringStream(""), solrFields, 0);
+           Ingester.getDefault().indexMetaDataOnly(artifact);
        } catch (Ingester.IngesterException ex) {
            throw new TskCoreException(ex.getCause().getMessage(), ex);
        }

        // Next create the index entry for the document content.
        // The content gets added to a single chunk. We may need to add chunking
        // support later.
        long chunkId = 1;

        documentId += "_" + Long.toString(chunkId);
        solrFields.replace(Server.Schema.ID.toString(), documentId);

        StringStream contentStream = new StringStream(artifactContents.toString());

        try {
-           Ingester.getDefault().indexContentStream(contentStream, solrFields, contentStream.getSize());
+           Ingester.getDefault().indexText(extractor, artifact);
        } catch (Ingester.IngesterException ex) {
            throw new TskCoreException(ex.getCause().getMessage(), ex);
        }
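For context, callers reach this code through the KeywordSearchService lookup rather than by instantiating SolrSearchService directly; the ServiceProvider import above suggests it is registered that way. A rough sketch of the unchanged entry point, with TskCoreException handling trimmed and the artifact variable assumed to be in scope:

    // Sketch: the public API stays the same; only the indexing internals changed.
    KeywordSearchService searchService = Lookup.getDefault().lookup(KeywordSearchService.class);
    searchService.indexArtifact(artifact);   // now delegates to ArtifactExtractor + Ingester.indexText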
@@ -38,7 +38,7 @@ import org.sleuthkit.datamodel.TskException;
 * with the original source file) up to 1MB and then indexes chunks as text with
 * Solr.
 */
-class StringsTextExtractor extends TextExtractor<Void> {
+class StringsTextExtractor extends FileTextExtractor<Void> {

    private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
    private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
@@ -94,8 +94,8 @@ class StringsTextExtractor extends TextExtractor<Void> {

    @Override
    boolean noExtractionOptionsAreEnabled() {
-       boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
-       boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+       boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+       boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));

        return extractUTF8 == false && extractUTF16 == false;
    }
@@ -120,8 +120,8 @@ class StringsTextExtractor extends TextExtractor<Void> {
     */
    @Override
    InputStream getInputStream(AbstractFile sourceFile) {
-       boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
-       boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+       boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+       boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));

        //check which extract stream to use
        InputStream stringStream = extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)
@@ -1,7 +1,7 @@
/*
 * Autopsy Forensic Browser
 *
- * Copyright 2011-2016 Basis Technology Corp.
+ * Copyright 2011-16 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,89 +18,30 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

-import java.util.Arrays;
-import java.util.List;
-import org.sleuthkit.datamodel.AbstractFile;
+import java.io.InputStream;
+import java.io.Reader;
+import org.apache.solr.common.util.ContentStream;
+import org.sleuthkit.datamodel.SleuthkitVisitableItem;

/**
 * Common methods for utilities that extract text and content and divide into
 * chunks
 */
-abstract class TextExtractor<AppendixProvider> extends TextProvider<AppendixProvider, AbstractFile> {
-
-    /**
-     * Common options that can be used by some extractors
-     */
-    enum ExtractOptions {
-
-        EXTRACT_UTF16, ///< extract UTF16 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
-        EXTRACT_UTF8, ///< extract UTF8 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
-    };
-
-    static final List<String> BLOB_MIME_TYPES
-            = Arrays.asList(
-                    //ignore binary blob data, for which string extraction will be used
-                    "application/octet-stream", //NON-NLS
-                    "application/x-msdownload"); //NON-NLS
-
-    /** generally text extractors should ignore archives and let unpacking
-     * modules take care of them */
-    static final List<String> ARCHIVE_MIME_TYPES
-            = Arrays.asList(
-                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
-                    "application/x-7z-compressed", //NON-NLS
-                    "application/x-ace-compressed", //NON-NLS
-                    "application/x-alz-compressed", //NON-NLS
-                    "application/x-arj", //NON-NLS
-                    "application/vnd.ms-cab-compressed", //NON-NLS
-                    "application/x-cfs-compressed", //NON-NLS
-                    "application/x-dgc-compressed", //NON-NLS
-                    "application/x-apple-diskimage", //NON-NLS
-                    "application/x-gca-compressed", //NON-NLS
-                    "application/x-dar", //NON-NLS
-                    "application/x-lzx", //NON-NLS
-                    "application/x-lzh", //NON-NLS
-                    "application/x-rar-compressed", //NON-NLS
-                    "application/x-stuffit", //NON-NLS
-                    "application/x-stuffitx", //NON-NLS
-                    "application/x-gtar", //NON-NLS
-                    "application/x-archive", //NON-NLS
-                    "application/x-executable", //NON-NLS
-                    "application/x-gzip", //NON-NLS
-                    "application/zip", //NON-NLS
-                    "application/x-zoo", //NON-NLS
-                    "application/x-cpio", //NON-NLS
-                    "application/x-shar", //NON-NLS
-                    "application/x-tar", //NON-NLS
-                    "application/x-bzip", //NON-NLS
-                    "application/x-bzip2", //NON-NLS
-                    "application/x-lzip", //NON-NLS
-                    "application/x-lzma", //NON-NLS
-                    "application/x-lzop", //NON-NLS
-                    "application/x-z", //NON-NLS
-                    "application/x-compress"); //NON-NLS
-
-    /**
-     * Determines whether the extractor works only for the types specified in
-     * supportedTypes() or whether it is a generic content extractor (such as
-     * the string extractor).
-     *
-     * @return true if the extractor is content-type specific, false otherwise
-     */
-    abstract boolean isContentTypeSpecific();
-
-    /**
-     * Determines if the file content is supported by the extractor if
-     * isContentTypeSpecific() returns true.
-     *
-     * @param file           the file to test if its content should be supported
-     * @param detectedFormat the MIME type with the detected format (such as
-     *                       text/plain) or null if not detected
-     *
-     * @return true if the file content is supported, false otherwise
-     */
-    abstract boolean isSupported(AbstractFile file, String detectedFormat);
+abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisitableItem> {
+
+    abstract boolean noExtractionOptionsAreEnabled();
+
+    abstract void logWarning(final String msg, Exception ex);
+
+    void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider) {
+        //no-op
+    }
+
+    abstract AppendixProvider newAppendixProvider();
+
+    abstract InputStream getInputStream(TextSource source);
+
+    abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
+
+    abstract long getID(TextSource source);
+
+    abstract ContentStream getContentStream(byte[] encodedBytes, int length, TextSource source);
+    abstract String getName(TextSource source);
+    abstract ContentStream getNullStream(TextSource source);
+}
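A brief sketch of how the generic parameters line up across the refactored hierarchy (summary only; the declaration at the end is illustrative and assumes TikaTextExtractor's existing no-argument construction):

    // TextExtractor<AppendixProvider, TextSource> is the new root:
    //   ArtifactExtractor        extends TextExtractor<Void, BlackboardArtifact>
    //   FileTextExtractor<A>     extends TextExtractor<A, AbstractFile>
    //     HtmlTextExtractor      extends FileTextExtractor<Void>
    //     StringsTextExtractor   extends FileTextExtractor<Void>
    //     TikaTextExtractor      extends FileTextExtractor<Metadata>   // Tika Metadata as the appendix
    TextExtractor<Metadata, AbstractFile> tikaExtractor = new TikaTextExtractor();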
@@ -1,39 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2011-16 Basis Technology Corp.
- * Contact: carrier <at> sleuthkit <dot> org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.sleuthkit.autopsy.keywordsearch;
-
-import java.io.InputStream;
-import java.io.Reader;
-
-abstract class TextProvider<AppendixProvider, TextSource> {
-
-    abstract boolean noExtractionOptionsAreEnabled();
-
-    abstract void logWarning(final String msg, Exception ex);
-
-    void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider) {
-        //no-op
-    }
-
-    abstract AppendixProvider newAppendixProvider();
-
-    abstract InputStream getInputStream(TextSource source);
-
-    abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
-}
@@ -49,7 +49,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
 * parsers-supported content type.
 *
 */
-class TikaTextExtractor extends TextExtractor<Metadata> {
+class TikaTextExtractor extends FileTextExtractor<Metadata> {

    private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
@@ -110,8 +110,8 @@ class TikaTextExtractor extends TextExtractor<Metadata> {
    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        if (detectedFormat == null
-               || TextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
-               || TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
+               || FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
+               || FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
                || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
                || detectedFormat.equals("application/x-font-ttf")) { // Tika currently has a bug in the ttf parser in fontbox; It will throw an out of memory exception//NON-NLS

@@ -123,6 +123,7 @@ class TikaTextExtractor extends TextExtractor<Metadata> {
        return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
    }

+   @Override
    InputStream getInputStream(AbstractFile sourceFile1) {
        return new ReadContentInputStream(sourceFile1);
    }
@@ -131,4 +132,5 @@ class TikaTextExtractor extends TextExtractor<Metadata> {
    boolean noExtractionOptionsAreEnabled() {
        return false;
    }

}