Renamed TextExtractor public API

This commit is contained in:
U-BASIS\dsmyda 2018-12-19 14:06:26 -05:00
parent 55f7003246
commit 1e1e166f55
15 changed files with 251 additions and 244 deletions

View File

@ -338,8 +338,8 @@
<package>org.sleuthkit.autopsy.modules.vmextractor</package> <package>org.sleuthkit.autopsy.modules.vmextractor</package>
<package>org.sleuthkit.autopsy.progress</package> <package>org.sleuthkit.autopsy.progress</package>
<package>org.sleuthkit.autopsy.report</package> <package>org.sleuthkit.autopsy.report</package>
<package>org.sleuthkit.autopsy.textreaders</package> <package>org.sleuthkit.autopsy.textextractors</package>
<package>org.sleuthkit.autopsy.textreaders.textreaderconfigs</package> <package>org.sleuthkit.autopsy.textextractors.textextractorconfigs</package>
<package>org.sleuthkit.autopsy.texttranslation</package> <package>org.sleuthkit.autopsy.texttranslation</package>
<package>org.sleuthkit.datamodel</package> <package>org.sleuthkit.datamodel</package>
</public-packages> </public-packages>

View File

@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.textreaders; package org.sleuthkit.autopsy.textextractors;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
@ -83,7 +83,7 @@ class ArtifactTextExtractor extends TextExtractor {
} }
@Override @Override
public boolean isSupported(Content file, String detectedFormat) { public boolean isSupported() {
return true; return true;
} }
} }

View File

@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.textreaders; package org.sleuthkit.autopsy.textextractors;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
@ -32,6 +32,7 @@ import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag; import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType; import net.htmlparser.jericho.StartTagType;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ReadContentInputStream; import org.sleuthkit.datamodel.ReadContentInputStream;
@ -42,7 +43,7 @@ final class HtmlTextExtractor extends TextExtractor {
static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName()); static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
private final int MAX_SIZE; private final int MAX_SIZE;
private final Content file; private final AbstractFile file;
static final List<String> WEB_MIME_TYPES = Arrays.asList( static final List<String> WEB_MIME_TYPES = Arrays.asList(
"application/javascript", //NON-NLS "application/javascript", //NON-NLS
@ -62,7 +63,7 @@ final class HtmlTextExtractor extends TextExtractor {
* Creates a default instance of the HtmlTextExtractor. Supported file size * Creates a default instance of the HtmlTextExtractor. Supported file size
* is 50MB. * is 50MB.
*/ */
public HtmlTextExtractor(Content file) { public HtmlTextExtractor(AbstractFile file) {
//Set default to be 50 MB. //Set default to be 50 MB.
MAX_SIZE = 50_000_000; MAX_SIZE = 50_000_000;
this.file = file; this.file = file;
@ -77,10 +78,10 @@ final class HtmlTextExtractor extends TextExtractor {
* @return flag indicating support * @return flag indicating support
*/ */
@Override @Override
public boolean isSupported(Content content, String detectedFormat) { public boolean isSupported() {
return detectedFormat != null return file.getMIMEType() != null
&& WEB_MIME_TYPES.contains(detectedFormat) && WEB_MIME_TYPES.contains(file.getMIMEType())
&& content.getSize() <= MAX_SIZE; && file.getSize() <= MAX_SIZE;
} }
/** /**

View File

@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.textreaders; package org.sleuthkit.autopsy.textextractors;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
@ -28,7 +28,6 @@ import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.SQLiteTableReader; import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
/** /**
* Extracts text from SQLite database files. * Extracts text from SQLite database files.
@ -45,8 +44,8 @@ final class SqliteTextExtractor extends TextExtractor {
private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName()); private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
private final AbstractFile file; private final AbstractFile file;
public SqliteTextExtractor(Content file) { public SqliteTextExtractor(AbstractFile file) {
this.file = (AbstractFile) file; this.file = file;
} }
/** /**
* Supports only the sqlite mimetypes * Supports only the sqlite mimetypes
@ -57,8 +56,8 @@ final class SqliteTextExtractor extends TextExtractor {
* @return true if x-sqlite3 * @return true if x-sqlite3
*/ */
@Override @Override
public boolean isSupported(Content file, String detectedFormat) { public boolean isSupported() {
return SQLITE_MIMETYPE.equals(detectedFormat); return SQLITE_MIMETYPE.equals(file.getMIMEType());
} }
/** /**

View File

@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.textreaders; package org.sleuthkit.autopsy.textextractors;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
@ -28,7 +28,7 @@ import java.util.Objects;
import org.openide.util.Lookup; import org.openide.util.Lookup;
import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.textreaders.textreaderconfigs.StringsConfig; import org.sleuthkit.autopsy.textextractors.textextractorconfigs.StringsConfig;
import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException; import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskException; import org.sleuthkit.datamodel.TskException;
@ -36,7 +36,7 @@ import org.sleuthkit.datamodel.TskException;
/** /**
* Extracts raw strings from content. * Extracts raw strings from content.
*/ */
final class StringsTextExtractor { final class StringsTextExtractor extends TextExtractor {
private boolean extractUTF8; private boolean extractUTF8;
private boolean extractUTF16; private boolean extractUTF16;
@ -81,6 +81,7 @@ final class StringsTextExtractor {
* @throws * @throws
* org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException * org.sleuthkit.autopsy.textextractors.TextExtractor.TextExtractorException
*/ */
@Override
public InputStreamReader getReader() { public InputStreamReader getReader() {
InputStream stringStream = getInputStream(content); InputStream stringStream = getInputStream(content);
return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET)); return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
@ -104,6 +105,7 @@ final class StringsTextExtractor {
* *
* @param context Lookup instance containing config classes * @param context Lookup instance containing config classes
*/ */
@Override
public void setExtractionSettings(Lookup context) { public void setExtractionSettings(Lookup context) {
if (context != null) { if (context != null) {
StringsConfig configInstance = context.lookup(StringsConfig.class); StringsConfig configInstance = context.lookup(StringsConfig.class);
@ -126,14 +128,11 @@ final class StringsTextExtractor {
* *
* @return * @return
*/ */
public boolean isEnabled() { @Override
public boolean isSupported() {
return extractUTF8 || extractUTF16; return extractUTF8 || extractUTF16;
} }
boolean isSupported(Content file, String detectedFormat) {
throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
}
/** /**
* Content input string stream reader/converter - given Content, extract * Content input string stream reader/converter - given Content, extract
* strings from it and return encoded bytes via read() * strings from it and return encoded bytes via read()

View File

@ -16,38 +16,26 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.textreaders; package org.sleuthkit.autopsy.textextractors;
import java.io.Reader; import java.io.Reader;
import org.openide.util.Lookup; import org.openide.util.Lookup;
import org.sleuthkit.datamodel.Content;
/** /**
* Extracts the text out of Content instances and exposes them as a Reader. * Extracts the text out of Content instances and exposes them as a Reader.
* Concrete implementations can be obtained from * Concrete implementations can be obtained from
* {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory} * {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory}
*/ */
abstract class TextExtractor { public abstract class TextExtractor {
/** /**
* Determines if the file content is supported by the extractor. * Determines if this extractor supports the given Content and
* configurations passed into it in
* {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory}.
* *
* @param file to test if its content should be supported * @return true if content is supported, false otherwise
* @param detectedFormat mime-type with detected format (such as text/plain)
* or null if not detected
*
* @return true if the file content is supported, false otherwise
*/ */
abstract boolean isSupported(Content file, String detectedFormat); abstract boolean isSupported();
/**
* Determines if the TextExtractor instance is enabled to read content.
*
* @return
*/
boolean isEnabled() {
return true;
}
/** /**
* Get a Reader that will iterate over the text extracted from the Content * Get a Reader that will iterate over the text extracted from the Content
@ -75,8 +63,7 @@ abstract class TextExtractor {
} }
/** /**
* Exception encountered during * Exception encountered during TextExtractor.getReader().
* {@link org.sleuthkit.autopsy.textextractors.TextExtractor#getReader()}.
* This indicates that there was an internal parsing error that occurred * This indicates that there was an internal parsing error that occurred
* during the reading of Content text. * during the reading of Content text.
*/ */

View File

@ -0,0 +1,160 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2018-2018 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.textextractors;
import java.util.Arrays;
import java.util.List;
import org.openide.util.Lookup;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.Report;
/**
 * Factory for creating TextExtractors given a Content instance.
 *
 * See {@link org.sleuthkit.autopsy.textextractors.textextractorconfigs} for
 * available extractor configuration options.
 *
 * @see org.openide.util.Lookup
 */
public class TextExtractorFactory {

    /**
     * Returns a TextExtractor for the given Content. Configuration objects
     * can be added to the Lookup.
     *
     * For AbstractFile content the HTML, SQLite and Tika extractors are tried
     * in that order and the first one reporting support is returned.
     * BlackboardArtifact content always gets the artifact extractor and Report
     * content always gets the Tika extractor.
     *
     * See {@link org.sleuthkit.autopsy.textextractors.textextractorconfigs} for
     * available extractor configuration options.
     *
     * @param content Content source that will be read from, must not be null
     * @param context Contains extraction configurations for certain file
     *                types, may be null
     *
     * @return TextExtractor for the content
     *
     * @throws NoTextExtractorFound Encountered when no extractor supports the
     *                              given content.
     * @throws NullPointerException If content is null.
     *
     * @see org.openide.util.Lookup
     */
    public static TextExtractor getExtractor(Content content, Lookup context) throws NoTextExtractorFound {
        // Fail fast with a descriptive message; without this check a null
        // content would only blow up while formatting the error text below.
        if (content == null) {
            throw new NullPointerException("content must not be null");
        }

        if (content instanceof AbstractFile) {
            for (TextExtractor extractor : getFileExtractors((AbstractFile) content, context)) {
                if (extractor.isSupported()) {
                    return extractor;
                }
            }
        } else if (content instanceof BlackboardArtifact) {
            TextExtractor artifactExtractor = new ArtifactTextExtractor((BlackboardArtifact) content);
            artifactExtractor.setExtractionSettings(context);
            return artifactExtractor;
        } else if (content instanceof Report) {
            TextExtractor reportExtractor = new TikaTextExtractor(content);
            reportExtractor.setExtractionSettings(context);
            return reportExtractor;
        }

        // Reached for unsupported files and for any other Content subtype.
        throw new NoTextExtractorFound(
                String.format("Could not find a suitable reader for "
                        + "content with name [%s] and id=[%d]. Try using "
                        + "the strings extractor instead.",
                        content.getName(), content.getId())
        );
    }

    /**
     * Initializes, orders, and returns all file extractors that can read
     * AbstractFile instances.
     *
     * @param content AbstractFile content
     * @param context Lookup containing extractor configurations
     *
     * @return The configured extractors, in the order they should be tried
     *         (HTML, SQLite, Tika).
     */
    private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
        List<TextExtractor> fileExtractors = Arrays.asList(
                new HtmlTextExtractor(content),
                new SqliteTextExtractor(content),
                new TikaTextExtractor(content));

        for (TextExtractor fileExtractor : fileExtractors) {
            fileExtractor.setExtractionSettings(context);
        }

        return fileExtractors;
    }

    /**
     * Returns a TextExtractor for the given Content, using no extraction
     * configuration (a null Lookup is passed on).
     *
     * @param content Content instance that will be read from
     *
     * @return TextExtractor for the content
     *
     * @throws NoTextExtractorFound Encountered when no extractor was found
     *                              for the given content type. Use the
     *                              getStringsExtractor(Content,Lookup) method
     *                              instead.
     */
    public static TextExtractor getExtractor(Content content) throws NoTextExtractorFound {
        return TextExtractorFactory.getExtractor(content, null);
    }

    /**
     * Returns a TextExtractor that extracts the strings of the Content. This
     * method should be used as a backup in the event that no extractor was
     * found using getExtractor(Content) or getExtractor(Content, Lookup).
     *
     * Configure this extractor with the StringsConfig in
     * {@link org.sleuthkit.autopsy.textextractors.textextractorconfigs}
     *
     * @param content Content source to read from
     * @param context Contains extraction configurations for certain file types
     *
     * @return TextExtractor for the content strings
     *
     * @see org.openide.util.Lookup
     */
    public static TextExtractor getStringsExtractor(Content content, Lookup context) {
        StringsTextExtractor stringsInstance = new StringsTextExtractor(content);
        stringsInstance.setExtractionSettings(context);
        return stringsInstance;
    }

    /**
     * System level exception for handling content types that have no specific
     * strategy defined for extracting their text.
     */
    public static class NoTextExtractorFound extends Exception {

        // Exception is Serializable; pin the version id explicitly.
        private static final long serialVersionUID = 1L;

        /**
         * Creates the exception with a descriptive message.
         *
         * @param msg Description of why no extractor was found
         */
        public NoTextExtractorFound(String msg) {
            super(msg);
        }

        /**
         * Creates the exception from an underlying cause.
         *
         * @param ex The underlying cause
         */
        public NoTextExtractorFound(Throwable ex) {
            super(ex);
        }

        /**
         * Creates the exception with a descriptive message and a cause.
         *
         * @param msg Description of why no extractor was found
         * @param ex  The underlying cause
         */
        private NoTextExtractorFound(String msg, Throwable ex) {
            super(msg, ex);
        }
    }
}

View File

@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.textreaders; package org.sleuthkit.autopsy.textextractors;
import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableList;
import com.google.common.io.CharSource; import com.google.common.io.CharSource;
@ -61,7 +61,7 @@ import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.ExecUtil; import org.sleuthkit.autopsy.coreutils.ExecUtil;
import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator; import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
import org.sleuthkit.autopsy.coreutils.PlatformUtil; import org.sleuthkit.autopsy.coreutils.PlatformUtil;
import org.sleuthkit.autopsy.textreaders.textreaderconfigs.ImageConfig; import org.sleuthkit.autopsy.textextractors.textextractorconfigs.ImageConfig;
import org.sleuthkit.autopsy.datamodel.ContentUtils; import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.Content;
@ -318,7 +318,7 @@ final class TikaTextExtractor extends TextExtractor {
} }
} }
} }
/** /**
* Wraps the creation of a TikaReader into a Future so that it can be * Wraps the creation of a TikaReader into a Future so that it can be
* cancelled. * cancelled.
@ -422,24 +422,27 @@ final class TikaTextExtractor extends TextExtractor {
} }
/** /**
* Determines if Tika is supported for this content type and mimetype. * Determines if Tika is enabled for this content
*
* @param content Source content to read
* @param detectedFormat Mimetype of content
* *
* @return Flag indicating support for reading content type * @return Flag indicating support for reading content type
*/ */
@Override @Override
public boolean isSupported(Content content, String detectedFormat) { public boolean isSupported() {
if (detectedFormat == null if(!(content instanceof AbstractFile)) {
|| BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used) return false;
|| ARCHIVE_MIME_TYPES.contains(detectedFormat) }
|| (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|| detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS String detectedType = ((AbstractFile)content).getMIMEType();
if (detectedType == null
|| BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
|| ARCHIVE_MIME_TYPES.contains(detectedType)
|| (detectedType.startsWith("video/") && !detectedType.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|| detectedType.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
) { ) {
return false; return false;
} }
return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
return TIKA_SUPPORTED_TYPES.contains(detectedType);
} }
/** /**

View File

@ -16,11 +16,11 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.textreaders.textreaderconfigs; package org.sleuthkit.autopsy.textextractors.textextractorconfigs;
/** /**
* Allows for configuration of OCR on image files. Readers that use ImageConfig * Allows for configuration of OCR on image files. Extractors that use ImageConfig
* can be obtained through {@link org.sleuthkit.autopsy.textreaders.TextReaders} * can be obtained through TextExtractorFactory.getExtractor().
* *
* @see org.openide.util.Lookup * @see org.openide.util.Lookup
*/ */

View File

@ -16,20 +16,20 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.sleuthkit.autopsy.textreaders.textreaderconfigs; package org.sleuthkit.autopsy.textextractors.textextractorconfigs;
import java.util.List; import java.util.List;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
/** /**
* Allows for configuration of the Reader obtained from * Allows for configuration of the TextExtractor obtained from
{@link org.sleuthkit.autopsy.textreaders.TextReader#getStringsReader(Content, Lookup)}. TextExtractorFactory.getStringsExtractor().
* *
* The strings reader will read strings from the Content instance. This class * The strings extractor will extract strings from the Content instance. This class
* allows for the configuration of the encoding and language scripts used during * allows for the configuration of the encoding and language scripts used during
* reading. * reading.
* *
* @see org.sleuthkit.autopsy.textreaders.TextReaders * @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
* @see * @see
* org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT * org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT
* @see org.openide.util.Lookup * @see org.openide.util.Lookup
@ -77,8 +77,8 @@ public class StringsConfig {
} }
/** /**
* Sets the type of language scripts that will be used during this * Sets the type of language scripts that will be used during this reading.
* reading. See * See
* {@link org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT} * {@link org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT}
* for more information about available scripts. * for more information about available scripts.
* *

View File

@ -1,152 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2018-2018 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.textreaders;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;
import org.openide.util.Lookup;
import org.sleuthkit.autopsy.textreaders.TextExtractor.ExtractionException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.Report;
/**
* Factory for creating Readers given a Content instance. All functionality is
* exposed through static methods.
*
* See {@link org.sleuthkit.autopsy.textreaders.textreaderconfigs} for available
* Reader configuration options.
*
* @see org.openide.util.Lookup
*/
public class TextReaders {
/**
* Returns a reader containing the Content text. Configuration files can be
* added to the Lookup.
*
* For AbstractFile content the HTML, SQLite and Tika extractors are tried in
* that order; the first one that is both enabled and supports the file's
* MIME type supplies the Reader. BlackboardArtifact content is read with the
* artifact extractor and Report content with the Tika extractor.
*
* See {@link org.sleuthkit.autopsy.textreaders.textreaderconfigs} for
* available Reader configuration options.
*
* @param content Content source that will be read from
* @param context Contains extraction configurations for certain file types
*
* @return Reader containing file text
*
* @throws NoTextReaderFound Encountered when there is no Reader found for
* the given content type or there was an error
* while creating the reader.
*
* @see org.openide.util.Lookup
*/
public static Reader getReader(Content content,
Lookup context) throws NoTextReaderFound {
try {
if (content instanceof AbstractFile) {
String mimeType = ((AbstractFile) content).getMIMEType();
// Order matters: the first enabled extractor that supports the file wins.
List<TextExtractor> extractors = Arrays.asList(
new HtmlTextExtractor(content),
new SqliteTextExtractor(content),
new TikaTextExtractor(content));
for (TextExtractor extractor : extractors) {
// Settings are applied before the enabled/supported checks are made.
extractor.setExtractionSettings(context);
if (extractor.isEnabled() && extractor.isSupported(content, mimeType)) {
return extractor.getReader();
}
}
} else if (content instanceof BlackboardArtifact) {
TextExtractor artifactExtractor = new ArtifactTextExtractor((BlackboardArtifact) content);
artifactExtractor.setExtractionSettings(context);
return artifactExtractor.getReader();
} else if (content instanceof Report) {
TextExtractor reportExtractor = new TikaTextExtractor(content);
reportExtractor.setExtractionSettings(context);
return reportExtractor.getReader();
}
} catch (ExtractionException ex) {
// Extraction failures are reported to callers as "no reader found".
throw new NoTextReaderFound("Error while getting reader", ex);
}
// Reached for unsupported files and for any other Content subtype.
throw new NoTextReaderFound(
String.format("Could not find a suitable reader for "
+ "content with name [%s] and id=[%d]. Try using "
+ "the default reader instead.",
content.getName(), content.getId())
);
}
/**
* Returns a reader containing the Content text, using no extraction
* configuration (a null Lookup is passed on).
*
* @param content Content instance that will be read from
*
* @return Reader containing file text
*
* @throws NoTextReaderFound Encountered when no Reader was found for the
* given content type. Use the
* getStringsReader(Content,Lookup) method
* instead.
*/
public static Reader getReader(Content content)
throws NoTextReaderFound {
return TextReaders.getReader(content, null);
}
/**
* Returns a Reader containing the Content strings. This method supports all
* content types. This method should be used as a backup in the event that
* no reader was found using getReader(Content) or getReader(Content,
* Lookup).
*
* Configure this reader with the StringsConfig in
* {@link org.sleuthkit.autopsy.textreaders.textreaderconfigs}
*
* @param content Content source to read from
* @param context Contains extraction configurations for certain file types
*
* @return Reader containing file text
*
* @see org.openide.util.Lookup
*/
public static Reader getStringsReader(Content content, Lookup context) {
StringsTextExtractor stringsInstance = new StringsTextExtractor(content);
stringsInstance.setExtractionSettings(context);
return stringsInstance.getReader();
}
/**
* System level exception for handling content types that have no specific
* strategy defined for extracting their text.
*/
public static class NoTextReaderFound extends Exception {
/**
* Creates the exception with a descriptive message.
*
* @param msg Description of why no reader was found
*/
public NoTextReaderFound(String msg) {
super(msg);
}
/**
* Creates the exception from an underlying cause.
*
* @param ex The underlying cause
*/
public NoTextReaderFound(Throwable ex) {
super(ex);
}
/**
* Creates the exception with a descriptive message and a cause. Used by
* getReader(Content, Lookup) to wrap an ExtractionException.
*
* @param msg Description of why no reader was found
* @param ex The underlying cause
*/
private NoTextReaderFound(String msg, Throwable ex) {
super(msg, ex);
}
}
}

View File

@ -25,6 +25,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level; import java.util.logging.Level;
import org.openide.util.Exceptions;
import org.openide.util.Lookup; import org.openide.util.Lookup;
import org.openide.util.NbBundle; import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages; import org.openide.util.NbBundle.Messages;
@ -44,9 +45,10 @@ import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorEx
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector; import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textreaders.TextReaders; import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textreaders.textreaderconfigs.ImageConfig; import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.autopsy.textreaders.textreaderconfigs.StringsConfig; import org.sleuthkit.autopsy.textextractors.textextractorconfigs.ImageConfig;
import org.sleuthkit.autopsy.textextractors.textextractorconfigs.StringsConfig;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskData; import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown; import org.sleuthkit.datamodel.TskData.FileKnown;
@ -480,10 +482,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
Lookup extractionContext = Lookups.fixed(imageConfig); Lookup extractionContext = Lookups.fixed(imageConfig);
try { try {
Reader specializedReader = TextReaders.getReader(aFile,extractionContext); TextExtractor extractor = TextExtractorFactory.getExtractor(aFile,extractionContext);
Reader extractedTextReader = extractor.getReader();
//divide into chunks and index //divide into chunks and index
return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context); return Ingester.getDefault().indexText(extractedTextReader,aFile.getId(),aFile.getName(), aFile, context);
} catch (TextReaders.NoTextReaderFound ex) { } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.ExtractionException ex) {
//No text extractor found... run the default instead //No text extractor found... run the default instead
return false; return false;
} }
@ -502,8 +505,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if (context.fileIngestIsCancelled()) { if (context.fileIngestIsCancelled()) {
return true; return true;
} }
Reader stringsReader = TextReaders.getStringsReader(aFile, stringsExtractionContext); TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(aFile, stringsExtractionContext);
if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) { Reader extractedTextReader = stringsExtractor.getReader();
if (Ingester.getDefault().indexText(extractedTextReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED); putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
return true; return true;
} else { } else {
@ -511,7 +515,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT); putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
return false; return false;
} }
} catch (IngesterException ex) { } catch (IngesterException | TextExtractor.ExtractionException ex) {
logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING); putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
return false; return false;

View File

@ -46,7 +46,8 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
import org.sleuthkit.autopsy.progress.ProgressIndicator; import org.sleuthkit.autopsy.progress.ProgressIndicator;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException; import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.textreaders.TextReaders; import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException; import org.sleuthkit.datamodel.TskCoreException;
@ -114,22 +115,26 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
return; return;
} }
try { try {
Reader blackboardReader = TextReaders.getReader(content, null); TextExtractor blackboardExtractor = TextExtractorFactory.getExtractor(content, null);
Reader blackboardExtractedTextReader = blackboardExtractor.getReader();
String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID(); String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
ingester.indexMetaDataOnly(artifact, sourceName); ingester.indexMetaDataOnly(artifact, sourceName);
ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null); ingester.indexText(blackboardExtractedTextReader, artifact.getArtifactID(), sourceName, content, null);
} catch (Ingester.IngesterException | TextReaders.NoTextReaderFound ex) { } catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | TextExtractor.ExtractionException ex) {
throw new TskCoreException(ex.getCause().getMessage(), ex); throw new TskCoreException(ex.getCause().getMessage(), ex);
} }
} else { } else {
try { try {
Reader contentReader = TextReaders.getReader(content, null); TextExtractor contentExtractor = TextExtractorFactory.getExtractor(content, null);
ingester.indexText(contentReader, content.getId(), content.getName(), content, null); Reader contentExtractedTextReader = contentExtractor.getReader();
} catch (TextReaders.NoTextReaderFound | Ingester.IngesterException ex) { ingester.indexText(contentExtractedTextReader, content.getId(), content.getName(), content, null);
} catch (TextExtractorFactory.NoTextExtractorFound | Ingester.IngesterException | TextExtractor.ExtractionException ex) {
try { try {
// Try the StringsTextExtractor if Tika extractions fails. // Try the StringsTextExtractor if Tika extractions fails.
ingester.indexText(TextReaders.getStringsReader(content, null),content.getId(),content.getName(), content, null); TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(content, null);
} catch (Ingester.IngesterException ex1) { Reader stringsExtractedTextReader = stringsExtractor.getReader();
ingester.indexText(stringsExtractedTextReader,content.getId(),content.getName(), content, null);
} catch (Ingester.IngesterException | TextExtractor.ExtractionException ex1) {
throw new TskCoreException(ex.getCause().getMessage(), ex1); throw new TskCoreException(ex.getCause().getMessage(), ex1);
} }
} }
@ -443,10 +448,11 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
try { try {
String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID(); String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
Reader contentSpecificReader = TextReaders.getReader((Content) artifact, null); TextExtractor blackboardExtractor = TextExtractorFactory.getExtractor((Content) artifact, null);
Reader blackboardExtractedTextReader = blackboardExtractor.getReader();
ingester.indexMetaDataOnly(artifact, sourceName); ingester.indexMetaDataOnly(artifact, sourceName);
ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null); ingester.indexText(blackboardExtractedTextReader, artifact.getId(), sourceName, artifact, null);
} catch (Ingester.IngesterException | TextReaders.NoTextReaderFound ex) { } catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | TextExtractor.ExtractionException ex) {
throw new TskCoreException(ex.getCause().getMessage(), ex); throw new TskCoreException(ex.getCause().getMessage(), ex);
} }
} }