mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-17 18:17:43 +00:00
Renamed TextExtractor public API
This commit is contained in:
parent
55f7003246
commit
1e1e166f55
@ -338,8 +338,8 @@
|
||||
<package>org.sleuthkit.autopsy.modules.vmextractor</package>
|
||||
<package>org.sleuthkit.autopsy.progress</package>
|
||||
<package>org.sleuthkit.autopsy.report</package>
|
||||
<package>org.sleuthkit.autopsy.textreaders</package>
|
||||
<package>org.sleuthkit.autopsy.textreaders.textreaderconfigs</package>
|
||||
<package>org.sleuthkit.autopsy.textextractors</package>
|
||||
<package>org.sleuthkit.autopsy.textextractors.textextractorconfigs</package>
|
||||
<package>org.sleuthkit.autopsy.texttranslation</package>
|
||||
<package>org.sleuthkit.datamodel</package>
|
||||
</public-packages>
|
||||
|
@ -16,7 +16,7 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.textreaders;
|
||||
package org.sleuthkit.autopsy.textextractors;
|
||||
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
@ -83,7 +83,7 @@ class ArtifactTextExtractor extends TextExtractor {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isSupported(Content file, String detectedFormat) {
|
||||
public boolean isSupported() {
|
||||
return true;
|
||||
}
|
||||
}
|
@ -16,7 +16,7 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.textreaders;
|
||||
package org.sleuthkit.autopsy.textextractors;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
@ -32,6 +32,7 @@ import net.htmlparser.jericho.Source;
|
||||
import net.htmlparser.jericho.StartTag;
|
||||
import net.htmlparser.jericho.StartTagType;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.Content;
|
||||
import org.sleuthkit.datamodel.ReadContentInputStream;
|
||||
|
||||
@ -42,7 +43,7 @@ final class HtmlTextExtractor extends TextExtractor {
|
||||
|
||||
static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
|
||||
private final int MAX_SIZE;
|
||||
private final Content file;
|
||||
private final AbstractFile file;
|
||||
|
||||
static final List<String> WEB_MIME_TYPES = Arrays.asList(
|
||||
"application/javascript", //NON-NLS
|
||||
@ -62,7 +63,7 @@ final class HtmlTextExtractor extends TextExtractor {
|
||||
* Creates a default instance of the HtmlTextExtractor. Supported file size
|
||||
* is 50MB.
|
||||
*/
|
||||
public HtmlTextExtractor(Content file) {
|
||||
public HtmlTextExtractor(AbstractFile file) {
|
||||
//Set default to be 50 MB.
|
||||
MAX_SIZE = 50_000_000;
|
||||
this.file = file;
|
||||
@ -77,10 +78,10 @@ final class HtmlTextExtractor extends TextExtractor {
|
||||
* @return flag indicating support
|
||||
*/
|
||||
@Override
|
||||
public boolean isSupported(Content content, String detectedFormat) {
|
||||
return detectedFormat != null
|
||||
&& WEB_MIME_TYPES.contains(detectedFormat)
|
||||
&& content.getSize() <= MAX_SIZE;
|
||||
public boolean isSupported() {
|
||||
return file.getMIMEType() != null
|
||||
&& WEB_MIME_TYPES.contains(file.getMIMEType())
|
||||
&& file.getSize() <= MAX_SIZE;
|
||||
}
|
||||
|
||||
/**
|
@ -16,7 +16,7 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.textreaders;
|
||||
package org.sleuthkit.autopsy.textextractors;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
@ -28,7 +28,6 @@ import org.sleuthkit.autopsy.coreutils.SQLiteTableReaderException;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
import org.sleuthkit.autopsy.coreutils.SQLiteTableReader;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.Content;
|
||||
|
||||
/**
|
||||
* Extracts text from SQLite database files.
|
||||
@ -45,8 +44,8 @@ final class SqliteTextExtractor extends TextExtractor {
|
||||
private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
|
||||
private final AbstractFile file;
|
||||
|
||||
public SqliteTextExtractor(Content file) {
|
||||
this.file = (AbstractFile) file;
|
||||
public SqliteTextExtractor(AbstractFile file) {
|
||||
this.file = file;
|
||||
}
|
||||
/**
|
||||
* Supports only the sqlite mimetypes
|
||||
@ -57,8 +56,8 @@ final class SqliteTextExtractor extends TextExtractor {
|
||||
* @return true if x-sqlite3
|
||||
*/
|
||||
@Override
|
||||
public boolean isSupported(Content file, String detectedFormat) {
|
||||
return SQLITE_MIMETYPE.equals(detectedFormat);
|
||||
public boolean isSupported() {
|
||||
return SQLITE_MIMETYPE.equals(file.getMIMEType());
|
||||
}
|
||||
|
||||
/**
|
@ -16,7 +16,7 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.textreaders;
|
||||
package org.sleuthkit.autopsy.textextractors;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
@ -28,7 +28,7 @@ import java.util.Objects;
|
||||
import org.openide.util.Lookup;
|
||||
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
||||
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
||||
import org.sleuthkit.autopsy.textreaders.textreaderconfigs.StringsConfig;
|
||||
import org.sleuthkit.autopsy.textextractors.textextractorconfigs.StringsConfig;
|
||||
import org.sleuthkit.datamodel.Content;
|
||||
import org.sleuthkit.datamodel.TskCoreException;
|
||||
import org.sleuthkit.datamodel.TskException;
|
||||
@ -36,7 +36,7 @@ import org.sleuthkit.datamodel.TskException;
|
||||
/**
|
||||
* Extracts raw strings from content.
|
||||
*/
|
||||
final class StringsTextExtractor {
|
||||
final class StringsTextExtractor extends TextExtractor {
|
||||
|
||||
private boolean extractUTF8;
|
||||
private boolean extractUTF16;
|
||||
@ -81,6 +81,7 @@ final class StringsTextExtractor {
|
||||
* @throws
|
||||
* org.sleuthkit.autopsy.textextractors.TextExtractor.ExtractionException
|
||||
*/
|
||||
@Override
|
||||
public InputStreamReader getReader() {
|
||||
InputStream stringStream = getInputStream(content);
|
||||
return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
|
||||
@ -104,6 +105,7 @@ final class StringsTextExtractor {
|
||||
*
|
||||
* @param context Lookup instance containing config classes
|
||||
*/
|
||||
@Override
|
||||
public void setExtractionSettings(Lookup context) {
|
||||
if (context != null) {
|
||||
StringsConfig configInstance = context.lookup(StringsConfig.class);
|
||||
@ -126,14 +128,11 @@ final class StringsTextExtractor {
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public boolean isEnabled() {
|
||||
@Override
|
||||
public boolean isSupported() {
|
||||
return extractUTF8 || extractUTF16;
|
||||
}
|
||||
|
||||
boolean isSupported(Content file, String detectedFormat) {
|
||||
throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
|
||||
}
|
||||
|
||||
/**
|
||||
* Content input string stream reader/converter - given Content, extract
|
||||
* strings from it and return encoded bytes via read()
|
@ -16,38 +16,26 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.textreaders;
|
||||
package org.sleuthkit.autopsy.textextractors;
|
||||
|
||||
import java.io.Reader;
|
||||
import org.openide.util.Lookup;
|
||||
import org.sleuthkit.datamodel.Content;
|
||||
|
||||
/**
|
||||
* Extracts the text out of Content instances and exposes them as a Reader.
|
||||
* Concrete implementations can be obtained from
|
||||
* {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory}
|
||||
*/
|
||||
abstract class TextExtractor {
|
||||
public abstract class TextExtractor {
|
||||
|
||||
/**
|
||||
* Determines if the file content is supported by the extractor.
|
||||
* Determines if this extractor supports the given Content and
|
||||
* configurations passed into it in
|
||||
* {@link org.sleuthkit.autopsy.textextractors.TextExtractorFactory}.
|
||||
*
|
||||
* @param file to test if its content should be supported
|
||||
* @param detectedFormat mime-type with detected format (such as text/plain)
|
||||
* or null if not detected
|
||||
*
|
||||
* @return true if the file content is supported, false otherwise
|
||||
* @return true if content is supported, false otherwise
|
||||
*/
|
||||
abstract boolean isSupported(Content file, String detectedFormat);
|
||||
|
||||
/**
|
||||
* Determines if the TextExtractor instance is enabled to read content.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
boolean isEnabled() {
|
||||
return true;
|
||||
}
|
||||
abstract boolean isSupported();
|
||||
|
||||
/**
|
||||
* Get a Reader that will iterate over the text extracted from the Content
|
||||
@ -75,8 +63,7 @@ abstract class TextExtractor {
|
||||
}
|
||||
|
||||
/**
|
||||
* Exception encountered during
|
||||
* {@link org.sleuthkit.autopsy.textextractors.TextExtractor#getReader()}.
|
||||
* Exception encountered during TextExtractor.getReader().
|
||||
* This indicates that there was an internal parsing error that occurred
|
||||
* during the reading of Content text.
|
||||
*/
|
160
Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
Executable file
160
Core/src/org/sleuthkit/autopsy/textextractors/TextExtractorFactory.java
Executable file
@ -0,0 +1,160 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2018-2018 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.textextractors;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import org.openide.util.Lookup;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||
import org.sleuthkit.datamodel.Content;
|
||||
import org.sleuthkit.datamodel.Report;
|
||||
|
||||
/**
|
||||
* Factory for creating TextExtractors given a Content instance
|
||||
*
|
||||
* See {@link org.sleuthkit.autopsy.textextractors.textextractorconfigs} for
|
||||
* available extractor configuration options.
|
||||
*
|
||||
* @see org.openide.util.Lookup
|
||||
*/
|
||||
public class TextExtractorFactory {
|
||||
|
||||
/**
|
||||
* Returns a TextExtractor containing the Content text. Configuration files
|
||||
* can be added to the Lookup.
|
||||
*
|
||||
* See {@link org.sleuthkit.autopsy.textextractors.textextractorconfigs} for
|
||||
* available extractor configuration options.
|
||||
*
|
||||
* @param content Content source that will be read from
|
||||
* @param context Contains extraction configurations for certain file types
|
||||
*
|
||||
* @return TextExtractor containing file text
|
||||
*
|
||||
* @throws NoTextExtractorFound Encountered when there is no Reader found
|
||||
* for the given content type or there was an
|
||||
* error while creating the reader.
|
||||
*
|
||||
* @see org.openide.util.Lookup
|
||||
*/
|
||||
public static TextExtractor getExtractor(Content content, Lookup context) throws NoTextExtractorFound {
|
||||
if (content instanceof AbstractFile) {
|
||||
for (TextExtractor extractor : getFileExtractors((AbstractFile) content, context)) {
|
||||
if (extractor.isSupported()) {
|
||||
return extractor;
|
||||
}
|
||||
}
|
||||
} else if (content instanceof BlackboardArtifact) {
|
||||
TextExtractor artifactExtractor = new ArtifactTextExtractor((BlackboardArtifact) content);
|
||||
artifactExtractor.setExtractionSettings(context);
|
||||
return artifactExtractor;
|
||||
} else if (content instanceof Report) {
|
||||
TextExtractor reportExtractor = new TikaTextExtractor(content);
|
||||
reportExtractor.setExtractionSettings(context);
|
||||
return reportExtractor;
|
||||
}
|
||||
|
||||
throw new NoTextExtractorFound(
|
||||
String.format("Could not find a suitable reader for "
|
||||
+ "content with name [%s] and id=[%d]. Try using "
|
||||
+ "the strings extractor instead.",
|
||||
content.getName(), content.getId())
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes, orders, and returns all file extractors that can read
|
||||
* AbstractFile instances.
|
||||
*
|
||||
* @param content AbstractFile content
|
||||
* @param context Lookup containing extractor configurations
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
|
||||
List<TextExtractor> fileExtractors = Arrays.asList(
|
||||
new HtmlTextExtractor(content),
|
||||
new SqliteTextExtractor(content),
|
||||
new TikaTextExtractor(content));
|
||||
|
||||
fileExtractors.forEach((fileExtractor) -> {
|
||||
fileExtractor.setExtractionSettings(context);
|
||||
});
|
||||
|
||||
return fileExtractors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a TextExtractor containing the Content text.
|
||||
*
|
||||
* @param content Content instance that will be read from
|
||||
*
|
||||
* @return TextExtractor containing file text
|
||||
*
|
||||
* @throws NoTextExtractorFound Encountered when there is no Reader was
|
||||
* found for the given content type. Use
|
||||
* getStringsExtractor(Content,Lookup) method
|
||||
* instead.
|
||||
*/
|
||||
public static TextExtractor getExtractor(Content content) throws NoTextExtractorFound {
|
||||
return TextExtractorFactory.getExtractor(content, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a TextExtractor containing the Content strings. This method
|
||||
* supports all content types. This method should be used as a backup in the
|
||||
* event that no reader was found using getExtractor(Content) or
|
||||
* getExtractor(Content, Lookup).
|
||||
*
|
||||
* Configure this extractor with the StringsConfig in
|
||||
* {@link org.sleuthkit.autopsy.textextractors.textextractorconfigs}
|
||||
*
|
||||
* @param content Content source to read from
|
||||
* @param context Contains extraction configurations for certain file types
|
||||
*
|
||||
* @return TextExtractor containing file text
|
||||
*
|
||||
* @see org.openide.util.Lookup
|
||||
*/
|
||||
public static TextExtractor getStringsExtractor(Content content, Lookup context) {
|
||||
StringsTextExtractor stringsInstance = new StringsTextExtractor(content);
|
||||
stringsInstance.setExtractionSettings(context);
|
||||
return stringsInstance;
|
||||
}
|
||||
|
||||
/**
|
||||
* System level exception for handling content types that have no specific
|
||||
* strategy defined for extracting their text.
|
||||
*/
|
||||
public static class NoTextExtractorFound extends Exception {
|
||||
|
||||
public NoTextExtractorFound(String msg) {
|
||||
super(msg);
|
||||
}
|
||||
|
||||
public NoTextExtractorFound(Throwable ex) {
|
||||
super(ex);
|
||||
}
|
||||
|
||||
private NoTextExtractorFound(String msg, Throwable ex) {
|
||||
super(msg, ex);
|
||||
}
|
||||
}
|
||||
}
|
@ -16,7 +16,7 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.textreaders;
|
||||
package org.sleuthkit.autopsy.textextractors;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.io.CharSource;
|
||||
@ -61,7 +61,7 @@ import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
|
||||
import org.sleuthkit.autopsy.coreutils.ExecUtil;
|
||||
import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
|
||||
import org.sleuthkit.autopsy.coreutils.PlatformUtil;
|
||||
import org.sleuthkit.autopsy.textreaders.textreaderconfigs.ImageConfig;
|
||||
import org.sleuthkit.autopsy.textextractors.textextractorconfigs.ImageConfig;
|
||||
import org.sleuthkit.autopsy.datamodel.ContentUtils;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.Content;
|
||||
@ -422,24 +422,27 @@ final class TikaTextExtractor extends TextExtractor {
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if Tika is supported for this content type and mimetype.
|
||||
*
|
||||
* @param content Source content to read
|
||||
* @param detectedFormat Mimetype of content
|
||||
* Determines if Tika is enabled for this content
|
||||
*
|
||||
* @return Flag indicating support for reading content type
|
||||
*/
|
||||
@Override
|
||||
public boolean isSupported(Content content, String detectedFormat) {
|
||||
if (detectedFormat == null
|
||||
|| BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
|
||||
|| ARCHIVE_MIME_TYPES.contains(detectedFormat)
|
||||
|| (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|
||||
|| detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
|
||||
public boolean isSupported() {
|
||||
if(!(content instanceof AbstractFile)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
String detectedType = ((AbstractFile)content).getMIMEType();
|
||||
if (detectedType == null
|
||||
|| BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
|
||||
|| ARCHIVE_MIME_TYPES.contains(detectedType)
|
||||
|| (detectedType.startsWith("video/") && !detectedType.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|
||||
|| detectedType.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
|
||||
|
||||
return TIKA_SUPPORTED_TYPES.contains(detectedType);
|
||||
}
|
||||
|
||||
/**
|
@ -16,11 +16,11 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.textreaders.textreaderconfigs;
|
||||
package org.sleuthkit.autopsy.textextractors.textextractorconfigs;
|
||||
|
||||
/**
|
||||
* Allows for configuration of OCR on image files. Readers that use ImageConfig
|
||||
* can be obtained through {@link org.sleuthkit.autopsy.textreaders.TextReaders}
|
||||
* Allows for configuration of OCR on image files. Extractors that use ImageConfig
|
||||
* can be obtained through TextExtractorFactory.getExtractor().
|
||||
*
|
||||
* @see org.openide.util.Lookup
|
||||
*/
|
@ -16,20 +16,20 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.textreaders.textreaderconfigs;
|
||||
package org.sleuthkit.autopsy.textextractors.textextractorconfigs;
|
||||
|
||||
import java.util.List;
|
||||
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
||||
|
||||
/**
|
||||
* Allows for configuration of the Reader obtained from
|
||||
* {@link org.sleuthkit.autopsy.textreaders.TextReader#getStringsReader(Content, Lookup)}.
|
||||
* Allows for configuration of the TextExtractor obtained from
|
||||
* TextExtractorFactory.getStringsExtractor(Content, Lookup).
|
||||
*
|
||||
* The strings reader will read strings from the Content instance. This class
|
||||
* The strings extractor will extract strings from the Content instance. This class
|
||||
* allows for the configuration of the encoding and language scripts used during
|
||||
* reading.
|
||||
*
|
||||
* @see org.sleuthkit.autopsy.textreaders.TextReaders
|
||||
* @see org.sleuthkit.autopsy.textextractors.TextExtractorFactory
|
||||
* @see
|
||||
* org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT
|
||||
* @see org.openide.util.Lookup
|
||||
@ -77,8 +77,8 @@ public class StringsConfig {
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the type of language scripts that will be used during this
|
||||
* reading. See
|
||||
* Sets the type of language scripts that will be used during this reading.
|
||||
* See
|
||||
* {@link org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT}
|
||||
* for more information about available scripts.
|
||||
*
|
@ -1,152 +0,0 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2018-2018 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.textreaders;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import org.openide.util.Lookup;
|
||||
import org.sleuthkit.autopsy.textreaders.TextExtractor.ExtractionException;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||
import org.sleuthkit.datamodel.Content;
|
||||
import org.sleuthkit.datamodel.Report;
|
||||
|
||||
/**
|
||||
* Factory for creating Readers given a Content instance
|
||||
*
|
||||
* See {@link org.sleuthkit.autopsy.textreaders.textreaderconfigs} for available
|
||||
* Reader configuration options.
|
||||
*
|
||||
* @see org.openide.util.Lookup
|
||||
*/
|
||||
public class TextReaders {
|
||||
|
||||
/**
|
||||
* Returns a reader containing the Content text. Configuration files can be
|
||||
* added to the Lookup.
|
||||
*
|
||||
* See {@link org.sleuthkit.autopsy.textreaders.textreaderconfigs} for
|
||||
* available Reader configuration options.
|
||||
*
|
||||
* @param content Content source that will be read from
|
||||
* @param context Contains extraction configurations for certain file types
|
||||
*
|
||||
* @return Reader containing file text
|
||||
*
|
||||
* @throws NoTextReaderFound Encountered when there is no Reader found for
|
||||
* the given content type or there was an error
|
||||
* while creating the reader.
|
||||
*
|
||||
* @see org.openide.util.Lookup
|
||||
*/
|
||||
public static Reader getReader(Content content,
|
||||
Lookup context) throws NoTextReaderFound {
|
||||
try {
|
||||
if (content instanceof AbstractFile) {
|
||||
String mimeType = ((AbstractFile) content).getMIMEType();
|
||||
List<TextExtractor> extractors = Arrays.asList(
|
||||
new HtmlTextExtractor(content),
|
||||
new SqliteTextExtractor(content),
|
||||
new TikaTextExtractor(content));
|
||||
for (TextExtractor extractor : extractors) {
|
||||
extractor.setExtractionSettings(context);
|
||||
if (extractor.isEnabled() && extractor.isSupported(content, mimeType)) {
|
||||
return extractor.getReader();
|
||||
}
|
||||
}
|
||||
} else if (content instanceof BlackboardArtifact) {
|
||||
TextExtractor artifactExtractor = new ArtifactTextExtractor((BlackboardArtifact) content);
|
||||
artifactExtractor.setExtractionSettings(context);
|
||||
return artifactExtractor.getReader();
|
||||
} else if (content instanceof Report) {
|
||||
TextExtractor reportExtractor = new TikaTextExtractor(content);
|
||||
reportExtractor.setExtractionSettings(context);
|
||||
return reportExtractor.getReader();
|
||||
}
|
||||
} catch (ExtractionException ex) {
|
||||
throw new NoTextReaderFound("Error while getting reader", ex);
|
||||
}
|
||||
|
||||
throw new NoTextReaderFound(
|
||||
String.format("Could not find a suitable reader for "
|
||||
+ "content with name [%s] and id=[%d]. Try using "
|
||||
+ "the default reader instead.",
|
||||
content.getName(), content.getId())
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a reader containing the Content text.
|
||||
*
|
||||
* @param content Content instance that will be read from
|
||||
*
|
||||
* @return Reader containing file text
|
||||
*
|
||||
* @throws NoTextReaderFound Encountered when there is no Reader was found
|
||||
* for the given content type. Use
|
||||
* getStringsReader(Content,Lookup) method
|
||||
* instead.
|
||||
*/
|
||||
public static Reader getReader(Content content)
|
||||
throws NoTextReaderFound {
|
||||
return TextReaders.getReader(content, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a Reader containing the Content strings. This method supports all
|
||||
* content types. This method should be used as a backup in the event that
|
||||
* no reader was found using getReader(Content) or getReader(Content,
|
||||
* Lookup).
|
||||
*
|
||||
* Configure this reader with the StringsConfig in
|
||||
* {@link org.sleuthkit.autopsy.textreaders.textreaderconfigs}
|
||||
*
|
||||
* @param content Content source to read from
|
||||
* @param context Contains extraction configurations for certain file types
|
||||
*
|
||||
* @return Reader containing file text
|
||||
*
|
||||
* @see org.openide.util.Lookup
|
||||
*/
|
||||
public static Reader getStringsReader(Content content, Lookup context) {
|
||||
StringsTextExtractor stringsInstance = new StringsTextExtractor(content);
|
||||
stringsInstance.setExtractionSettings(context);
|
||||
return stringsInstance.getReader();
|
||||
}
|
||||
|
||||
/**
|
||||
* System level exception for handling content types that have no specific
|
||||
* strategy defined for extracting their text.
|
||||
*/
|
||||
public static class NoTextReaderFound extends Exception {
|
||||
|
||||
public NoTextReaderFound(String msg) {
|
||||
super(msg);
|
||||
}
|
||||
|
||||
public NoTextReaderFound(Throwable ex) {
|
||||
super(ex);
|
||||
}
|
||||
|
||||
private NoTextReaderFound(String msg, Throwable ex) {
|
||||
super(msg, ex);
|
||||
}
|
||||
}
|
||||
}
|
@ -25,6 +25,7 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.logging.Level;
|
||||
import org.openide.util.Exceptions;
|
||||
import org.openide.util.Lookup;
|
||||
import org.openide.util.NbBundle;
|
||||
import org.openide.util.NbBundle.Messages;
|
||||
@ -44,9 +45,10 @@ import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorEx
|
||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
||||
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
|
||||
import org.sleuthkit.autopsy.textreaders.TextReaders;
|
||||
import org.sleuthkit.autopsy.textreaders.textreaderconfigs.ImageConfig;
|
||||
import org.sleuthkit.autopsy.textreaders.textreaderconfigs.StringsConfig;
|
||||
import org.sleuthkit.autopsy.textextractors.TextExtractor;
|
||||
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
|
||||
import org.sleuthkit.autopsy.textextractors.textextractorconfigs.ImageConfig;
|
||||
import org.sleuthkit.autopsy.textextractors.textextractorconfigs.StringsConfig;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.TskData;
|
||||
import org.sleuthkit.datamodel.TskData.FileKnown;
|
||||
@ -480,10 +482,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
Lookup extractionContext = Lookups.fixed(imageConfig);
|
||||
|
||||
try {
|
||||
Reader specializedReader = TextReaders.getReader(aFile,extractionContext);
|
||||
TextExtractor extractor = TextExtractorFactory.getExtractor(aFile,extractionContext);
|
||||
Reader extractedTextReader = extractor.getReader();
|
||||
//divide into chunks and index
|
||||
return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
|
||||
} catch (TextReaders.NoTextReaderFound ex) {
|
||||
return Ingester.getDefault().indexText(extractedTextReader,aFile.getId(),aFile.getName(), aFile, context);
|
||||
} catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.ExtractionException ex) {
|
||||
//No text extractor found... run the default instead
|
||||
return false;
|
||||
}
|
||||
@ -502,8 +505,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
if (context.fileIngestIsCancelled()) {
|
||||
return true;
|
||||
}
|
||||
Reader stringsReader = TextReaders.getStringsReader(aFile, stringsExtractionContext);
|
||||
if (Ingester.getDefault().indexText(stringsReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
|
||||
TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(aFile, stringsExtractionContext);
|
||||
Reader extractedTextReader = stringsExtractor.getReader();
|
||||
if (Ingester.getDefault().indexText(extractedTextReader,aFile.getId(),aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
|
||||
return true;
|
||||
} else {
|
||||
@ -511,7 +515,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
|
||||
return false;
|
||||
}
|
||||
} catch (IngesterException ex) {
|
||||
} catch (IngesterException | TextExtractor.ExtractionException ex) {
|
||||
logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
|
||||
return false;
|
||||
|
@ -46,7 +46,8 @@ import org.sleuthkit.autopsy.appservices.AutopsyService;
|
||||
import org.sleuthkit.autopsy.progress.ProgressIndicator;
|
||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
||||
import org.sleuthkit.autopsy.textreaders.TextReaders;
|
||||
import org.sleuthkit.autopsy.textextractors.TextExtractor;
|
||||
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
|
||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||
import org.sleuthkit.datamodel.Content;
|
||||
import org.sleuthkit.datamodel.TskCoreException;
|
||||
@ -114,22 +115,26 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
Reader blackboardReader = TextReaders.getReader(content, null);
|
||||
TextExtractor blackboardExtractor = TextExtractorFactory.getExtractor(content, null);
|
||||
Reader blackboardExtractedTextReader = blackboardExtractor.getReader();
|
||||
String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
|
||||
ingester.indexMetaDataOnly(artifact, sourceName);
|
||||
ingester.indexText(blackboardReader, artifact.getArtifactID(), sourceName, content, null);
|
||||
} catch (Ingester.IngesterException | TextReaders.NoTextReaderFound ex) {
|
||||
ingester.indexText(blackboardExtractedTextReader, artifact.getArtifactID(), sourceName, content, null);
|
||||
} catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | TextExtractor.ExtractionException ex) {
|
||||
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
Reader contentReader = TextReaders.getReader(content, null);
|
||||
ingester.indexText(contentReader, content.getId(), content.getName(), content, null);
|
||||
} catch (TextReaders.NoTextReaderFound | Ingester.IngesterException ex) {
|
||||
TextExtractor contentExtractor = TextExtractorFactory.getExtractor(content, null);
|
||||
Reader contentExtractedTextReader = contentExtractor.getReader();
|
||||
ingester.indexText(contentExtractedTextReader, content.getId(), content.getName(), content, null);
|
||||
} catch (TextExtractorFactory.NoTextExtractorFound | Ingester.IngesterException | TextExtractor.ExtractionException ex) {
|
||||
try {
|
||||
// Try the StringsTextExtractor if Tika extraction fails.
|
||||
ingester.indexText(TextReaders.getStringsReader(content, null),content.getId(),content.getName(), content, null);
|
||||
} catch (Ingester.IngesterException ex1) {
|
||||
TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(content, null);
|
||||
Reader stringsExtractedTextReader = stringsExtractor.getReader();
|
||||
ingester.indexText(stringsExtractedTextReader,content.getId(),content.getName(), content, null);
|
||||
} catch (Ingester.IngesterException | TextExtractor.ExtractionException ex1) {
|
||||
throw new TskCoreException(ex.getCause().getMessage(), ex1);
|
||||
}
|
||||
}
|
||||
@ -443,10 +448,11 @@ public class SolrSearchService implements KeywordSearchService, AutopsyService {
|
||||
|
||||
try {
|
||||
String sourceName = artifact.getDisplayName() + "_" + artifact.getArtifactID();
|
||||
Reader contentSpecificReader = TextReaders.getReader((Content) artifact, null);
|
||||
TextExtractor blackboardExtractor = TextExtractorFactory.getExtractor((Content) artifact, null);
|
||||
Reader blackboardExtractedTextReader = blackboardExtractor.getReader();
|
||||
ingester.indexMetaDataOnly(artifact, sourceName);
|
||||
ingester.indexText(contentSpecificReader, artifact.getId(), sourceName, artifact, null);
|
||||
} catch (Ingester.IngesterException | TextReaders.NoTextReaderFound ex) {
|
||||
ingester.indexText(blackboardExtractedTextReader, artifact.getId(), sourceName, artifact, null);
|
||||
} catch (Ingester.IngesterException | TextExtractorFactory.NoTextExtractorFound | TextExtractor.ExtractionException ex) {
|
||||
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user