build out ArtifactExtractor

millmanorama 2016-12-13 00:02:03 +01:00
parent 1a70a4e8b2
commit 85af7c57b6
14 changed files with 503 additions and 267 deletions

ArtifactExtractor.java

@@ -6,10 +6,51 @@
package org.sleuthkit.autopsy.keywordsearch;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.util.ContentStream;
import org.openide.util.Exceptions;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;
-public class ArtifactExtractor extends TextProvider<Void, BlackboardArtifact> {
+public class ArtifactExtractor extends TextExtractor<Void, BlackboardArtifact> {
static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
Content dataSource;
Case currentCase;
try {
currentCase = Case.getCurrentCase();
} catch (IllegalStateException ignore) {
// thrown by Case.getCurrentCase() if currentCase is null
return null;
}
SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
if (sleuthkitCase == null) {
return null;
}
AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
if (abstractFile != null) {
dataSource = abstractFile.getDataSource();
} else {
dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
}
if (dataSource == null) {
return null;
}
return dataSource;
}
@Override
boolean noExtractionOptionsAreEnabled() {
@@ -27,13 +68,99 @@ public class ArtifactExtractor extends TextProvider<Void, BlackboardArtifact> {
}
@Override
-InputStream getInputStream(BlackboardArtifact source) {
-throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+InputStream getInputStream(BlackboardArtifact artifact) {
// Concatenate the string values of all attributes into a single
// "content" string to be indexed.
StringBuilder artifactContents = new StringBuilder();
Content dataSource;
try {
dataSource = getDataSource(artifact);
if (dataSource == null) {
return null;
}
for (BlackboardAttribute attribute : artifact.getAttributes()) {
artifactContents.append(attribute.getAttributeType().getDisplayName());
artifactContents.append(" : ");
// This is ugly since it will need to updated any time a new
// TSK_DATETIME_* attribute is added. A slightly less ugly
// alternative would be to assume that all date time attributes
// will have a name of the form "TSK_DATETIME*" and check
// attribute.getAttributeTypeName().startsWith("TSK_DATETIME").
// The major problem with that approach is that it would require
// a round trip to the database to get the type name string.
// We have also discussed modifying BlackboardAttribute.getDisplayString()
// to magically format datetime attributes but that is complicated by
// the fact that BlackboardAttribute exists in Sleuthkit data model
// while the utility to determine the timezone to use is in ContentUtils
// in the Autopsy datamodel.
if (attribute.getValueType() == BlackboardAttribute.TSK_BLACKBOARD_ATTRIBUTE_VALUE_TYPE.DATETIME) {
artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
} else {
artifactContents.append(attribute.getDisplayString());
}
artifactContents.append(System.lineSeparator());
}
} catch (TskCoreException ex) {
Exceptions.printStackTrace(ex);
return null;
}
if (artifactContents.length() == 0) {
return null;
}
// To play by the rules of the existing text markup implementations,
// we need to (a) index the artifact contents in a "chunk" and
// (b) create a separate index entry for the base artifact.
// We distinguish artifact content from file content by applying a
// mask to the artifact id to make its value > 0x8000000000000000 (i.e. negative).
// First, create an index entry for the base artifact.
HashMap<String, String> solrFields = new HashMap<>();
String documentId = Long.toString(artifact.getArtifactID());
solrFields.put(Server.Schema.ID.toString(), documentId);
// Set the IMAGE_ID field.
solrFields.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
// Next create the index entry for the document content.
// The content gets added to a single chunk. We may need to add chunking
// support later.
long chunkId = 1;
documentId += "_" + Long.toString(chunkId);
solrFields.replace(Server.Schema.ID.toString(), documentId);
return IOUtils.toInputStream(artifactContents);
}
@Override
Reader getReader(InputStream stream, BlackboardArtifact source, Void appendix) throws Ingester.IngesterException {
-throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+return new InputStreamReader(stream);
}
@Override
long getID(BlackboardArtifact source) {
return source.getArtifactID();
}
@Override
ContentStream getContentStream(byte[] encodedBytes, int length, BlackboardArtifact source) {
return new ByteArtifactStream(encodedBytes, length, source);
}
@Override
ContentStream getNullStream(BlackboardArtifact source) {
return new Ingester.NullArtifactStream(source);
}
@Override
String getName(BlackboardArtifact source) {
return source.getDisplayName();
}
}

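The long comment in getInputStream() above weighs two ways of spotting TSK_DATETIME_* attributes. A minimal standalone sketch of the prefix-check alternative it describes, using a hypothetical stand-in type rather than the real BlackboardAttribute:

import java.time.Instant;
import java.time.ZoneId;

// AttributeStub is a stand-in for BlackboardAttribute. In the real data
// model the type-name lookup can cost a database round trip, which is why
// the commit tests the DATETIME value-type enum instead of the name prefix.
class AttributeStub {
    final String typeName; // e.g. "TSK_DATETIME_CREATED"
    final long valueLong;  // epoch seconds for datetime attributes

    AttributeStub(String typeName, long valueLong) {
        this.typeName = typeName;
        this.valueLong = valueLong;
    }

    String displayString(ZoneId tz) {
        // the prefix test the comment proposes as the less verbose option
        if (typeName.startsWith("TSK_DATETIME")) {
            return Instant.ofEpochSecond(valueLong).atZone(tz).toString();
        }
        return Long.toString(valueLong);
    }
}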
ByteArtifactStream.java

@@ -0,0 +1,100 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import org.apache.solr.common.util.ContentStream;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import static org.sleuthkit.autopsy.keywordsearch.Bundle.*;
import org.sleuthkit.datamodel.BlackboardArtifact;
/**
* Stream of bytes representing a string with a specified encoding, fed to Solr
* as a ContentStream.
*/
class ByteArtifactStream implements ContentStream {
//input
private final byte[] content; //extracted subcontent
private long contentSize;
private final BlackboardArtifact aContent; //origin
private final InputStream stream;
private static final Logger logger = Logger.getLogger(ByteArtifactStream.class.getName());
public ByteArtifactStream(byte[] content, long contentSize, BlackboardArtifact aContent) {
this.content = content;
this.contentSize = contentSize;
this.aContent = aContent;
stream = new ByteArrayInputStream(content, 0, (int) contentSize);
}
public byte[] getByteContent() {
return content;
}
public BlackboardArtifact getSourceContent() {
return aContent;
}
@Override
public String getContentType() {
return "text/plain;charset=" + Server.DEFAULT_INDEXED_TEXT_CHARSET.name(); //NON-NLS
}
@Override
public String getName() {
return aContent.getDisplayName();
}
@Override
public Reader getReader() throws IOException {
return new InputStreamReader(stream);
}
@Override
public Long getSize() {
return contentSize;
}
@Override
@NbBundle.Messages("ByteArtifactStream.getSrcInfo.text=Artifact:{0}")
public String getSourceInfo() {
return ByteArtifactStream_getSrcInfo_text(aContent.getArtifactID());
}
@Override
public InputStream getStream() throws IOException {
return stream;
}
@Override
protected void finalize() throws Throwable {
super.finalize();
stream.close();
}
}

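A standalone model of what ByteArtifactStream wraps: already-encoded bytes plus an explicit logical length. It illustrates why getSize() must report the passed-in length rather than the backing array's size (the array may be a larger reused buffer); names here are hypothetical, not the commit's code:

import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

class BoundedBytes {
    private final byte[] bytes; // possibly larger than the content
    private final int length;   // logical content length

    BoundedBytes(byte[] bytes, int length) {
        this.bytes = bytes;
        this.length = length;
    }

    long size() {
        return length; // report the logical size, not bytes.length
    }

    Reader reader() {
        // bound the view to the first `length` bytes
        return new InputStreamReader(
                new ByteArrayInputStream(bytes, 0, length),
                StandardCharsets.UTF_8);
    }
}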
FileTextExtractor.java

@@ -0,0 +1,124 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2016 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.util.Arrays;
import java.util.List;
import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.datamodel.AbstractFile;
/**
* Common methods for utilities that extract text and content and divide into
* chunks
*/
abstract class FileTextExtractor<AppendixProvider> extends TextExtractor<AppendixProvider, AbstractFile> {
/**
* Common options that can be used by some extractors
*/
enum ExtractOptions {
EXTRACT_UTF16, ///< extract UTF16 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
EXTRACT_UTF8, ///< extract UTF8 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
};
static final List<String> BLOB_MIME_TYPES
= Arrays.asList(
//ignore binary blob data, for which string extraction will be used
"application/octet-stream", //NON-NLS
"application/x-msdownload"); //NON-NLS
/** generally text extractors should ignore archives and let unpacking
* modules take care of them */
static final List<String> ARCHIVE_MIME_TYPES
= Arrays.asList(
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
"application/x-7z-compressed", //NON-NLS
"application/x-ace-compressed", //NON-NLS
"application/x-alz-compressed", //NON-NLS
"application/x-arj", //NON-NLS
"application/vnd.ms-cab-compressed", //NON-NLS
"application/x-cfs-compressed", //NON-NLS
"application/x-dgc-compressed", //NON-NLS
"application/x-apple-diskimage", //NON-NLS
"application/x-gca-compressed", //NON-NLS
"application/x-dar", //NON-NLS
"application/x-lzx", //NON-NLS
"application/x-lzh", //NON-NLS
"application/x-rar-compressed", //NON-NLS
"application/x-stuffit", //NON-NLS
"application/x-stuffitx", //NON-NLS
"application/x-gtar", //NON-NLS
"application/x-archive", //NON-NLS
"application/x-executable", //NON-NLS
"application/x-gzip", //NON-NLS
"application/zip", //NON-NLS
"application/x-zoo", //NON-NLS
"application/x-cpio", //NON-NLS
"application/x-shar", //NON-NLS
"application/x-tar", //NON-NLS
"application/x-bzip", //NON-NLS
"application/x-bzip2", //NON-NLS
"application/x-lzip", //NON-NLS
"application/x-lzma", //NON-NLS
"application/x-lzop", //NON-NLS
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
/**
* Determines whether the extractor works only for the types specified by
* supportedTypes() or whether it is a generic content extractor (such as
* the string extractor).
*
* @return true if the extractor is content-type specific
*/
abstract boolean isContentTypeSpecific();
/**
* Determines if the file content is supported by the extractor if
* isContentTypeSpecific() returns true.
*
* @param file to test if its content should be supported
* @param detectedFormat mime-type with detected format (such as text/plain)
* or null if not detected
*
* @return true if the file content is supported, false otherwise
*/
abstract boolean isSupported(AbstractFile file, String detectedFormat);
@Override
long getID(AbstractFile source) {
return source.getId();
}
@Override
ContentStream getContentStream(byte[] encodedBytes, int length, AbstractFile source) {
return new ByteContentStream(encodedBytes, length, source);
}
@Override
ContentStream getNullStream(AbstractFile source) {
return new Ingester.NullContentStream(source);
}
@Override
String getName(AbstractFile source) {
return source.getName();
}
}

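The deny lists above feed the isSupported() checks in the concrete extractors. A compact sketch of that gating pattern with abbreviated lists (standalone illustration, assuming nothing beyond the JDK):

import java.util.Arrays;
import java.util.List;

class MimeGate {
    // abbreviated stand-in for ARCHIVE_MIME_TYPES / BLOB_MIME_TYPES
    static final List<String> DENY = Arrays.asList(
            "application/zip", "application/x-gzip", "application/octet-stream");

    static boolean isSupported(String detectedFormat, List<String> parserTypes) {
        if (detectedFormat == null || DENY.contains(detectedFormat)) {
            return false; // leave these to string extraction / unpacking modules
        }
        return parserTypes.contains(detectedFormat);
    }
}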
HtmlTextExtractor.java

@@ -39,7 +39,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
* divided into chunks and indexed with Solr. If HTML extraction succeeds,
* chunks are indexed with Solr.
*/
-class HtmlTextExtractor extends TextExtractor<Void> {
+class HtmlTextExtractor extends FileTextExtractor<Void> {
private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());

Ingester.java

@@ -36,14 +36,16 @@ import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.datamodel.AbstractContent;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content;
-import org.sleuthkit.datamodel.ContentVisitor;
import org.sleuthkit.datamodel.DerivedFile;
import org.sleuthkit.datamodel.Directory;
import org.sleuthkit.datamodel.File;
import org.sleuthkit.datamodel.LayoutFile;
import org.sleuthkit.datamodel.LocalFile;
import org.sleuthkit.datamodel.SlackFile;
import org.sleuthkit.datamodel.SleuthkitItemVisitor;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
import org.sleuthkit.datamodel.TskCoreException;
/**
@@ -99,6 +101,11 @@ class Ingester {
indexContentStream(new NullContentStream(file), getContentFields(file), 0);
}
void indexMetaDataOnly(BlackboardArtifact artifact) throws IngesterException {
// indexContentStream(new NullArtifactStream(artifact), getContentFields(artifact), 0);
}
/**
* Sends a TextExtractor to Solr to have its content extracted and added to
* the index. commit() should be called once you're done ingesting files.
@@ -117,6 +124,12 @@ class Ingester {
indexContentStream(new NullContentStream(file), params, 0);
}
private void recordNumberOfChunks(BlackboardArtifact artifact, int numChunks) throws IngesterException {
Map<String, String> params = getContentFields(artifact);
params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
indexContentStream(new NullArtifactStream(artifact), params, 0);
}
/**
* Creates a field map from FsContent, that is later sent to Solr
*
@@ -124,19 +137,14 @@ class Ingester {
*
* @return the map
*/
-Map<String, String> getContentFields(AbstractContent fsc) {
+Map<String, String> getContentFields(SleuthkitVisitableItem fsc) {
return fsc.accept(getContentFieldsV);
}
/**
* Visitor used to create param list to send to SOLR index.
*/
-static private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
-@Override
-protected Map<String, String> defaultVisit(Content cntnt) {
-return new HashMap<>();
-}
+static private class GetContentFieldsV extends SleuthkitItemVisitor.Default<Map<String, String>> {
@Override
public Map<String, String> visit(File f) {
@@ -201,21 +209,46 @@ class Ingester {
params.put(Server.Schema.FILE_NAME.toString(), af.getName());
return params;
}
@Override
public Map<String, String> visit(BlackboardArtifact artifact) {
Map<String, String> params = new HashMap<>();
params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
try {
Content dataSource = ArtifactExtractor.getDataSource(artifact);
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
} catch (TskCoreException ex) {
logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact {0}", artifact.getArtifactID()); //NON-NLS
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
}
return params;
}
@Override
protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
return new HashMap<>();
}
}
private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
private static final int SINGLE_READ_CHARS = 1024;
private static final int EXTRA_CHARS = 128; //for whitespace
-public <T> boolean indexText(TextExtractor<T> extractor, AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
+public <A, T extends SleuthkitVisitableItem> boolean indexText(TextExtractor<A, T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
int numChunks = 0; //unknown until chunking is done
if (extractor.noExtractionOptionsAreEnabled()) {
return true;
}
-T appendix = extractor.newAppendixProvider();
-try (final InputStream stream = extractor.getInputStream(sourceFile);
-Reader reader = extractor.getReader(stream, sourceFile, appendix);) {
+final long sourceID = extractor.getID(source);
+final String sourceName = extractor.getName(source);
+Map<String, String> fields = getContentFields(source);
+A appendix = extractor.newAppendixProvider();
+try (final InputStream stream = extractor.getInputStream(source);
+Reader reader = extractor.getReader(stream, source, appendix);) {
//we read max 1024 chars at time, this seems to max what this Reader would return
char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
@@ -265,10 +298,10 @@ class Ingester {
//encode to bytes as UTF-8 to index as byte stream
byte[] encodedBytes = chunkString.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
-String chunkId = Server.getChunkIdString(sourceFile.getId(), numChunks + 1);
+String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
try {
-ByteContentStream bcs = new ByteContentStream(encodedBytes, encodedBytes.length, sourceFile);
-Map<String, String> fields = getContentFields(sourceFile);
+ContentStream bcs = extractor.getContentStream(encodedBytes, encodedBytes.length, source);
try {
indexContentStream(bcs, fields, encodedBytes.length);
} catch (Exception ex) {
@@ -277,20 +310,21 @@ class Ingester {
numChunks++;
} catch (Ingester.IngesterException ingEx) {
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
-+ sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);//NON-NLS
++ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
throw ingEx; //need to rethrow to signal error and move on
}
}
} catch (IOException ex) {
-extractor.logWarning("Unable to read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex);//NON-NLS
+extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false;
} catch (Exception ex) {
-extractor.logWarning("Unexpected error, can't read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex);//NON-NLS
+extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false;
} finally {
//after all chunks, ingest the parent file without content itself, and store numChunks
-recordNumberOfChunks(sourceFile, numChunks);
+fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
+indexContentStream(extractor.getNullStream(source), fields, 0);
}
return true;
}
@@ -442,7 +476,7 @@ class Ingester {
/**
* ContentStream associated with FsContent, but forced with no content
*/
-private static class NullContentStream implements ContentStream {
+static class NullContentStream implements ContentStream {
AbstractContent aContent;
@@ -482,6 +516,50 @@ class Ingester {
}
}
/**
* ContentStream associated with Artifact, but forced with no content
*/
static class NullArtifactStream implements ContentStream {
BlackboardArtifact aContent;
NullArtifactStream(BlackboardArtifact aContent) {
this.aContent = aContent;
}
@Override
public String getName() {
return aContent.getDisplayName();
}
@NbBundle.Messages("Ingester.NullArtifactStream.getSrcInfo.text=File:{0})\n")
@Override
public String getSourceInfo() {
return Bundle.Ingester_NullArtifactStream_getSrcInfo_text(aContent.getArtifactID());
}
@Override
public String getContentType() {
return null;
}
@Override
public Long getSize() {
return 0L;
}
@Override
public InputStream getStream() throws IOException {
return new ByteArrayInputStream(new byte[0]);
}
@Override
public Reader getReader() throws IOException {
throw new UnsupportedOperationException(
NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getReader"));
}
}
/**
* Indicates that there was an error with the specific ingest operation, but
* it's still okay to continue ingesting files.

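The generic indexText() above fills a fixed-size chunk buffer with bounded reads, indexes one Solr document per full buffer, then indexes a contentless parent document carrying NUM_CHUNKS. A standalone model of just the chunking loop (simplified: no appendix handling and no EXTRA_CHARS slack):

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

class Chunker {
    static final int MAX_CHUNK_CHARS = 512 * 1024; // mirrors MAX_EXTR_TEXT_CHARS
    static final int SINGLE_READ_CHARS = 1024;

    static List<String> chunks(Reader reader) throws IOException {
        List<String> out = new ArrayList<>();
        char[] buf = new char[MAX_CHUNK_CHARS];
        int filled = 0;
        int read;
        // read at most SINGLE_READ_CHARS per call, never past the buffer end
        while ((read = reader.read(buf, filled,
                Math.min(SINGLE_READ_CHARS, buf.length - filled))) != -1) {
            filled += read;
            if (filled == buf.length) { // buffer full: emit one chunk
                out.add(new String(buf, 0, filled));
                filled = 0;
            }
        }
        if (filled > 0) {
            out.add(new String(buf, 0, filled)); // trailing partial chunk
        }
        return out;
    }
}

In the method above, each emitted chunk gets the id Server.getChunkIdString(sourceID, n + 1) and the shared field map.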
KeywordSearchGlobalLanguageSettingsPanel.java

@@ -103,12 +103,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
private void reloadScriptsCheckBoxes() {
boolean utf16
-= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
enableUTF16Checkbox.setSelected(utf16);
boolean utf8
-= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
enableUTF8Checkbox.setSelected(utf8);
final List<SCRIPT> serviceScripts = KeywordSearchSettings.getStringExtractScripts();
@@ -127,12 +127,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
reloadScriptsCheckBoxes();
boolean utf16
-= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
enableUTF16Checkbox.setSelected(utf16);
boolean utf8
-= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
enableUTF8Checkbox.setSelected(utf8);
final boolean extractEnabled = utf16 || utf8;
@@ -257,9 +257,9 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
@Override
public void store() {
-KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
+KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
Boolean.toString(enableUTF8Checkbox.isSelected()));
-KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
+KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
Boolean.toString(enableUTF16Checkbox.isSelected()));
if (toUpdate != null) {

KeywordSearchIngestModule.java

@@ -89,7 +89,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
//accessed read-only by searcher thread
private boolean startedSearching = false;
-private List<TextExtractor<?>> textExtractors;
+private List<FileTextExtractor<?>> textExtractors;
private StringsTextExtractor stringExtractor;
private final KeywordSearchJobSettings settings;
private boolean initialized = false;
@@ -415,10 +415,10 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
* @throws IngesterException exception thrown if indexing failed
*/
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
-TextExtractor extractor = null;
+FileTextExtractor extractor = null;
//go over available text extractors in order, and pick the first one (most specific one)
-for (TextExtractor fe : textExtractors) {
+for (FileTextExtractor fe : textExtractors) {
if (fe.isSupported(aFile, detectedFormat)) {
extractor = fe;
break;
@@ -514,7 +514,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
// we skip archive formats that are opened by the archive module.
// @@@ We could have a check here to see if the archive module was enabled though...
-if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
+if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
try {
if (context.fileIngestIsCancelled()) {
return;

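extractTextAndIndex() above walks the extractor list in registration order and takes the first one whose isSupported() accepts the file, so list ordering encodes specificity. A generic sketch of that selection rule (standalone, hypothetical names):

import java.util.List;
import java.util.Optional;
import java.util.function.Predicate;

class FirstMatch {
    // candidates are ordered most specific first; the first accepting one wins
    static <T> Optional<Predicate<T>> pick(List<Predicate<T>> ordered, T input) {
        for (Predicate<T> candidate : ordered) {
            if (candidate.test(input)) {
                return Optional.of(candidate);
            }
        }
        return Optional.empty();
    }
}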
KeywordSearchJobSettingsPanel.java

@@ -101,8 +101,8 @@ public final class KeywordSearchJobSettingsPanel extends IngestModuleIngestJobSe
}
private void displayEncodings() {
-String utf8 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
+String utf8 = KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
-String utf16 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
+String utf16 = KeywordSearchSettings.getStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
ArrayList<String> encodingsList = new ArrayList<>();
if (utf8 == null || Boolean.parseBoolean(utf8)) {
encodingsList.add("UTF8");

KeywordSearchSettings.java

@@ -211,14 +211,14 @@ class KeywordSearchSettings {
KeywordSearchSettings.setUpdateFrequency(UpdateFrequency.DEFAULT);
}
//setting default Extract UTF8
-if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
+if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
logger.log(Level.INFO, "No configuration for UTF8 found, generating default..."); //NON-NLS
-KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
+KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
}
//setting default Extract UTF16
-if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
+if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS
-KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
+KeywordSearchSettings.setStringExtractOption(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
}
//setting default Latin-1 Script
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_SCRIPTS, SCRIPT.LATIN_1.name())) {

SolrSearchService.java

@@ -20,22 +20,14 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.net.InetAddress;
-import java.util.HashMap;
import java.util.MissingResourceException;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
-import org.apache.solr.common.util.ContentStreamBase.StringStream;
import org.openide.util.NbBundle;
import org.openide.util.lookup.ServiceProvider;
-import org.sleuthkit.autopsy.casemodule.Case;
-import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
-import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
-import org.sleuthkit.datamodel.BlackboardAttribute;
-import org.sleuthkit.datamodel.Content;
-import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;
/**
@@ -49,6 +41,8 @@ public class SolrSearchService implements KeywordSearchService {
private static final String SERVER_REFUSED_CONNECTION = "server refused connection"; //NON-NLS
private static final int IS_REACHABLE_TIMEOUT_MS = 1000;
ArtifactExtractor extractor = new ArtifactExtractor();
@Override
public void indexArtifact(BlackboardArtifact artifact) throws TskCoreException {
if (artifact == null) {
@@ -57,109 +51,18 @@ public class SolrSearchService implements KeywordSearchService {
// We only support artifact indexing for Autopsy versions that use
// the negative range for artifact ids.
-long artifactId = artifact.getArtifactID();
-if (artifactId > 0) {
+if (artifact.getArtifactID() > 0) {
return;
}
-Case currentCase;
-try {
-currentCase = Case.getCurrentCase();
-} catch (IllegalStateException ignore) {
-// thorown by Case.getCurrentCase() if currentCase is null
-return;
-}
-SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
-if (sleuthkitCase == null) {
-return;
-}
-Content dataSource;
-AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
-if (abstractFile != null) {
-dataSource = abstractFile.getDataSource();
-} else {
-dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
-}
-if (dataSource == null) {
-return;
-}
-// Concatenate the string values of all attributes into a single
-// "content" string to be indexed.
-StringBuilder artifactContents = new StringBuilder();
-for (BlackboardAttribute attribute : artifact.getAttributes()) {
-artifactContents.append(attribute.getAttributeType().getDisplayName());
-artifactContents.append(" : ");
-// This is ugly since it will need to updated any time a new
-// TSK_DATETIME_* attribute is added. A slightly less ugly
-// alternative would be to assume that all date time attributes
-// will have a name of the form "TSK_DATETIME*" and check
-// attribute.getAttributeTypeName().startsWith("TSK_DATETIME*".
-// The major problem with that approach is that it would require
-// a round trip to the database to get the type name string.
-// We have also discussed modifying BlackboardAttribute.getDisplayString()
-// to magically format datetime attributes but that is complicated by
-// the fact that BlackboardAttribute exists in Sleuthkit data model
-// while the utility to determine the timezone to use is in ContentUtils
-// in the Autopsy datamodel.
-if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME.getTypeID()
-|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()
-|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED.getTypeID()
-|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED.getTypeID()
-|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_RCVD.getTypeID()
-|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_SENT.getTypeID()
-|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_START.getTypeID()
-|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_END.getTypeID()) {
-artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
-} else {
-artifactContents.append(attribute.getDisplayString());
-}
-artifactContents.append(System.lineSeparator());
-}
-if (artifactContents.length() == 0) {
-return;
-}
-// To play by the rules of the existing text markup implementations,
-// we need to (a) index the artifact contents in a "chunk" and
-// (b) create a separate index entry for the base artifact.
-// We distinguish artifact content from file content by applying a
-// mask to the artifact id to make its value > 0x8000000000000000 (i.e. negative).
-// First, create an index entry for the base artifact.
-HashMap<String, String> solrFields = new HashMap<>();
-String documentId = Long.toString(artifactId);
-solrFields.put(Server.Schema.ID.toString(), documentId);
-// Set the IMAGE_ID field.
-solrFields.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
try {
-Ingester.getDefault().indexContentStream(new StringStream(""), solrFields, 0);
+Ingester.getDefault().indexMetaDataOnly(artifact);
} catch (Ingester.IngesterException ex) {
throw new TskCoreException(ex.getCause().getMessage(), ex);
}
-// Next create the index entry for the document content.
-// The content gets added to a single chunk. We may need to add chunking
-// support later.
-long chunkId = 1;
-documentId += "_" + Long.toString(chunkId);
-solrFields.replace(Server.Schema.ID.toString(), documentId);
-StringStream contentStream = new StringStream(artifactContents.toString());
try {
-Ingester.getDefault().indexContentStream(contentStream, solrFields, contentStream.getSize());
+Ingester.getDefault().indexText(extractor, artifact);
} catch (Ingester.IngesterException ex) {
throw new TskCoreException(ex.getCause().getMessage(), ex);
}

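The migration above keeps the established Solr document-id convention: the base entry is keyed by the (masked, negative) artifact id and each content chunk appends "_<n>". A hypothetical helper showing the shape of those ids; Server.getChunkIdString() is the real counterpart:

class SolrDocIds {
    static String baseId(long sourceId) {
        return Long.toString(sourceId); // artifact ids sit in the negative range
    }

    static String chunkId(long sourceId, int chunkIndex) {
        return baseId(sourceId) + "_" + chunkIndex; // chunks are numbered from 1
    }
}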
StringsTextExtractor.java

@@ -38,7 +38,7 @@ import org.sleuthkit.datamodel.TskException;
* with the original source file) up to 1MB then and indexes chunks as text with
* Solr.
*/
-class StringsTextExtractor extends TextExtractor<Void> {
+class StringsTextExtractor extends FileTextExtractor<Void> {
private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
@@ -94,8 +94,8 @@ class StringsTextExtractor extends TextExtractor<Void> {
@Override
boolean noExtractionOptionsAreEnabled() {
-boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
-boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
return extractUTF8 == false && extractUTF16 == false;
}
@@ -120,8 +120,8 @@ class StringsTextExtractor extends TextExtractor<Void> {
*/
@Override
InputStream getInputStream(AbstractFile sourceFile) {
-boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
+boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
-boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
+boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(FileTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
//check which extract stream to use
InputStream stringStream = extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)

TextExtractor.java

@@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
-* Copyright 2011-2016 Basis Technology Corp.
+* Copyright 2011-16 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,89 +18,30 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
-import java.util.Arrays;
-import java.util.List;
-import org.sleuthkit.datamodel.AbstractFile;
-/**
- * Common methods for utilities that extract text and content and divide into
- * chunks
- */
-abstract class TextExtractor<AppendixProvider> extends TextProvider<AppendixProvider, AbstractFile> {
-/**
- * Common options that can be used by some extractors
- */
-enum ExtractOptions {
-EXTRACT_UTF16, ///< extract UTF16 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
-EXTRACT_UTF8, ///< extract UTF8 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
-};
-static final List<String> BLOB_MIME_TYPES
-= Arrays.asList(
-//ignore binary blob data, for which string extraction will be used
-"application/octet-stream", //NON-NLS
-"application/x-msdownload"); //NON-NLS
-/** generally text extractors should ignore archives and let unpacking
- * modules take care of them */
-static final List<String> ARCHIVE_MIME_TYPES
-= Arrays.asList(
-//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
-"application/x-7z-compressed", //NON-NLS
-"application/x-ace-compressed", //NON-NLS
-"application/x-alz-compressed", //NON-NLS
-"application/x-arj", //NON-NLS
-"application/vnd.ms-cab-compressed", //NON-NLS
-"application/x-cfs-compressed", //NON-NLS
-"application/x-dgc-compressed", //NON-NLS
-"application/x-apple-diskimage", //NON-NLS
-"application/x-gca-compressed", //NON-NLS
-"application/x-dar", //NON-NLS
-"application/x-lzx", //NON-NLS
-"application/x-lzh", //NON-NLS
-"application/x-rar-compressed", //NON-NLS
-"application/x-stuffit", //NON-NLS
-"application/x-stuffitx", //NON-NLS
-"application/x-gtar", //NON-NLS
-"application/x-archive", //NON-NLS
-"application/x-executable", //NON-NLS
-"application/x-gzip", //NON-NLS
-"application/zip", //NON-NLS
-"application/x-zoo", //NON-NLS
-"application/x-cpio", //NON-NLS
-"application/x-shar", //NON-NLS
-"application/x-tar", //NON-NLS
-"application/x-bzip", //NON-NLS
-"application/x-bzip2", //NON-NLS
-"application/x-lzip", //NON-NLS
-"application/x-lzma", //NON-NLS
-"application/x-lzop", //NON-NLS
-"application/x-z", //NON-NLS
-"application/x-compress"); //NON-NLS
-/**
- * Determines if the extractor works only for specified types is
- * supportedTypes() or whether is a generic content extractor (such as
- * string extractor)
- *
- * @return
- */
-abstract boolean isContentTypeSpecific();
-/**
- * Determines if the file content is supported by the extractor if
- * isContentTypeSpecific() returns true.
- *
- * @param file to test if its content should be supported
- * @param detectedFormat mime-type with detected format (such as text/plain)
- * or null if not detected
- *
- * @return true if the file content is supported, false otherwise
- */
-abstract boolean isSupported(AbstractFile file, String detectedFormat);
+import java.io.InputStream;
+import java.io.Reader;
+import org.apache.solr.common.util.ContentStream;
+import org.sleuthkit.datamodel.SleuthkitVisitableItem;
+abstract class TextExtractor<AppendixProvider, TextSource extends SleuthkitVisitableItem> {
+abstract boolean noExtractionOptionsAreEnabled();
+abstract void logWarning(final String msg, Exception ex);
+void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider) {
+//no-op
+}
+abstract AppendixProvider newAppendixProvider();
+abstract InputStream getInputStream(TextSource source);
+abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
+abstract long getID(TextSource source);
+abstract ContentStream getContentStream(byte[] encodedBytes, int length, TextSource source);
+abstract String getName(TextSource source);
+abstract ContentStream getNullStream(TextSource source);
}

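This rewritten TextExtractor is the pivot of the commit: the contract is now parameterized on the source type, so file and artifact extraction can share one indexing pipeline. A standalone model of that shape, with a toy source type standing in for SleuthkitVisitableItem (assumed names, not the commit's code):

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

// The pipeline only needs an id, a name, and a character stream, so any
// source type can participate by implementing these hooks.
abstract class Extractor<S> {
    abstract long getID(S source);
    abstract String getName(S source);
    abstract InputStream getInputStream(S source);

    Reader getReader(InputStream stream, S source) {
        return new InputStreamReader(stream, StandardCharsets.UTF_8);
    }
}

class StringSource {
    final long id;
    final String name;
    final String text;

    StringSource(long id, String name, String text) {
        this.id = id;
        this.name = name;
        this.text = text;
    }
}

class StringSourceExtractor extends Extractor<StringSource> {
    @Override
    long getID(StringSource s) {
        return s.id;
    }

    @Override
    String getName(StringSource s) {
        return s.name;
    }

    @Override
    InputStream getInputStream(StringSource s) {
        return new ByteArrayInputStream(s.text.getBytes(StandardCharsets.UTF_8));
    }
}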
TextProvider.java (deleted)

@ -1,39 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-16 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.InputStream;
import java.io.Reader;
abstract class TextProvider<AppendixProvider, TextSource> {
abstract boolean noExtractionOptionsAreEnabled();
abstract void logWarning(final String msg, Exception ex);
void appendDataToFinalChunk(StringBuilder sb, AppendixProvider dataProvider) {
//no-op
}
abstract AppendixProvider newAppendixProvider();
abstract InputStream getInputStream(TextSource source);
abstract Reader getReader(InputStream stream, TextSource source, AppendixProvider appendix) throws Ingester.IngesterException;
}

TikaTextExtractor.java

@@ -49,7 +49,7 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
* parsers-supported content type.
*
*/
-class TikaTextExtractor extends TextExtractor<Metadata> {
+class TikaTextExtractor extends FileTextExtractor<Metadata> {
private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
@@ -110,8 +110,8 @@ class TikaTextExtractor extends TextExtractor<Metadata> {
@Override
public boolean isSupported(AbstractFile file, String detectedFormat) {
if (detectedFormat == null
-|| TextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
+|| FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
-|| TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
+|| FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
|| (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|| detectedFormat.equals("application/x-font-ttf")) { // Tika currently has a bug in the ttf parser in fontbox; It will throw an out of memory exception//NON-NLS
@@ -123,6 +123,7 @@ class TikaTextExtractor extends TextExtractor<Metadata> {
return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
}
@Override
InputStream getInputStream(AbstractFile sourceFile1) {
return new ReadContentInputStream(sourceFile1);
}
@@ -131,4 +132,5 @@ class TikaTextExtractor extends TextExtractor<Metadata> {
boolean noExtractionOptionsAreEnabled() {
return false;
}
}