Mirror of https://github.com/overcuriousity/autopsy-flatpak.git, synced 2025-07-12 16:06:15 +00:00
Generalize text extractors further so keyword search supports multiple extractors, ordered from most to least specific.
Integrate the HTML text extractor into keyword search.
commit 27e04f16d1
parent ca87852431
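In essence, the commit replaces the hard-wired choice between the string extractor and the Tika extractor with a small interface plus an ordered extractor list that is scanned for the first match. The following is a condensed sketch of that pattern, distilled from the diff below (simplified, not the complete classes):

    // Extractors are now reusable across files: the file is passed to
    // index() instead of the constructor, and each extractor advertises
    // what it supports.
    interface AbstractFileExtract {
        boolean index(AbstractFile sourceFile) throws Ingester.IngesterException;
        boolean isContentTypeSpecific();
        boolean isSupported(AbstractFile file);
    }

    // Registration order matters: more specific extractors come first,
    // so HTML files are claimed before the generic Tika extractor sees them.
    List<AbstractFileExtract> textExtractors = new ArrayList<AbstractFileExtract>();
    textExtractors.add(new AbstractFileHtmlExtract());
    textExtractors.add(new AbstractFileTikaTextExtract());

    // Pick the first extractor that supports the file; none found is an error.
    AbstractFileExtract fileExtract = null;
    for (AbstractFileExtract fe : textExtractors) {
        if (fe.isSupported(aFile)) {
            fileExtract = fe;
            break;
        }
    }
    if (fileExtract == null) {
        throw new Ingester.IngesterException("No supported file extractor found for file: " + aFile.getName());
    }
    return fileExtract.index(aFile);

Because files are passed to index() per call, each extractor is constructed once at service initialization and reused, rather than instantiated per file as before.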
@@ -41,8 +41,24 @@ interface AbstractFileExtract {
     /**
      * Index the Abstract File
+     * @param sourceFile file to index
      * @return true if indexed successfully, false otherwise
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
-    boolean index() throws Ingester.IngesterException;
+    boolean index(AbstractFile sourceFile) throws Ingester.IngesterException;
 
+    /**
+     * Determines if the extractor works only for specified types
+     * is supportedTypes() or whether is a generic content extractor (such as string extractor)
+     * @return
+     */
+    boolean isContentTypeSpecific();
+
+    /**
+     * Determines if the file content is supported by the extractor,
+     * if isContentTypeSpecific() returns true.
+     * @param file to test if its content should be supported
+     * @return true if the file content is supported, false otherwise
+     */
+    boolean isSupported(AbstractFile file);
+
 }
@@ -46,12 +46,11 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
     private AbstractFile sourceFile;
     private int numChunks = 0;
     private static final String UTF16BOM = "\uFEFF";
     private static final String [] SUPPORTED_EXTENSIONS = {"htm", "html", "xhtml", "css", "js"};
 
-    AbstractFileHtmlExtract(AbstractFile sourceFile) {
-        this.sourceFile = sourceFile;
+    AbstractFileHtmlExtract() {
+        this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
     }
 
     @Override
@@ -65,9 +64,13 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
     }
 
     @Override
-    public boolean index() throws IngesterException {
+    public boolean index(AbstractFile sourceFile) throws IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
+
         boolean success = false;
         Reader reader = null;
 
         final InputStream stream = new ReadContentInputStream(sourceFile);
 
         try {
@@ -173,4 +176,20 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
         return success;
     }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return true;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        String fileNameLower = file.getName().toLowerCase();
+        for (int i = 0; i< SUPPORTED_EXTENSIONS.length; ++i) {
+            if (fileNameLower.endsWith(SUPPORTED_EXTENSIONS[i])) {
+                return true;
+            }
+        }
+        return false;
+    }
+
 }
@@ -40,24 +40,27 @@ class AbstractFileStringExtract implements AbstractFileExtract {
     private int numChunks;
     private static final Logger logger = Logger.getLogger(AbstractFileStringExtract.class.getName());
     static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
-    private AbstractFile aFile;
+    private AbstractFile sourceFile;
     //single static buffer for all extractions. Safe, indexing can only happen in one thread
     private static final byte[] STRING_CHUNK_BUF = new byte[(int) MAX_STRING_CHUNK_SIZE];
     private static final int BOM_LEN = 3;
 
+    //private static final StringExtract se = new StringExtract();
+
     static {
         //prepend UTF-8 BOM to start of the buffer
         STRING_CHUNK_BUF[0] = (byte) 0xEF;
         STRING_CHUNK_BUF[1] = (byte) 0xBB;
         STRING_CHUNK_BUF[2] = (byte) 0xBF;
 
+        //se.init();
+
+
     }
 
-    public AbstractFileStringExtract(AbstractFile aFile) {
-        this.aFile = aFile;
-        numChunks = 0; //unknown until indexing is done
+    public AbstractFileStringExtract() {
+        this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
     }
 
     @Override
@@ -67,15 +70,17 @@ class AbstractFileStringExtract implements AbstractFileExtract {
 
     @Override
     public AbstractFile getSourceFile() {
-        return aFile;
+        return sourceFile;
     }
 
     @Override
-    public boolean index() throws IngesterException {
+    public boolean index(AbstractFile sourceFile) throws IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
         boolean success = false;
 
 
         //construct stream that extracts text as we read it
-        final InputStream stringStream = new AbstractFileStringStream(aFile, ByteContentStream.Encoding.UTF8);
+        final InputStream stringStream = new AbstractFileStringStream(sourceFile, ByteContentStream.Encoding.UTF8);
 
         try {
             success = true;
@@ -93,7 +98,7 @@ class AbstractFileStringExtract implements AbstractFileExtract {
             ++this.numChunks;
         } catch (IngesterException ingEx) {
             success = false;
-            logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ingEx);
+            logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);
             throw ingEx; //need to rethrow/return to signal error and move on
         }
 
@@ -109,19 +114,31 @@ class AbstractFileStringExtract implements AbstractFileExtract {
             ingester.ingest(this);
 
         } catch (IOException ex) {
-            logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + aFile.getName(), ex);
+            logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex);
             success = false;
         } finally {
             try {
                 stringStream.close();
             } catch (IOException ex) {
-                logger.log(Level.WARNING, "Error closing input stream stream, file: " + aFile.getName(), ex);
+                logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex);
             }
         }
 
 
        return success;
    }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return false;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        return true;
+    }
+
+
+
+
 }
@@ -42,11 +42,12 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 
 /**
  * Extractor of text from TIKA supported AbstractFile content. Extracted text is
- * divided into chunks and indexed with Solr.
- * Protects against Tika parser hangs (for unexpected/corrupt content) using a timeout mechanism.
- * If Tika extraction succeeds, chunks are indexed with Solr.
+ * divided into chunks and indexed with Solr. Protects against Tika parser hangs
+ * (for unexpected/corrupt content) using a timeout mechanism. If Tika
+ * extraction succeeds, chunks are indexed with Solr.
  *
- * This Tika extraction/chunking utility is useful for large files of Tika parsers-supported content type.
+ * This Tika extraction/chunking utility is useful for large files of Tika
+ * parsers-supported content type.
  *
  */
 public class AbstractFileTikaTextExtract implements AbstractFileExtract {
@@ -58,19 +59,24 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
     private static final int SINGLE_READ_CHARS = 1024;
     private static final int EXTRA_CHARS = 128; //for whitespace
     private static final char[] TEXT_CHUNK_BUF = new char[MAX_EXTR_TEXT_CHARS];
-    private static final Tika tika = new Tika();
+    private Tika tika;
     private KeywordSearchIngestService service;
-    private Ingester ingester;
-    private AbstractFile sourceFile;
+    private static Ingester ingester;
+    private AbstractFile sourceFile; //currently processed file
     private int numChunks = 0;
     private static final String UTF16BOM = "\uFEFF";
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
     // TODO: use a more robust method than checking file extension
     // supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
     static final String[] SUPPORTED_EXTENSIONS = {"tar", "jar", "zip", "gzip", "bzip2",
         "gz", "tgz", "odf", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt", "log", "manifest",
         "bmp", "gif", "png", "jpeg", "jpg", "tiff", "mp3", "aiff", "au", "midi", "wav",
         "pst", "xml", "class", "dwg", "eml", "emlx", "mbox", "mht"};
 
-    AbstractFileTikaTextExtract(AbstractFile sourceFile) {
-        this.sourceFile = sourceFile;
+    AbstractFileTikaTextExtract() {
+        this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
+        tika = new Tika();
+        //tika.setMaxStringLength(MAX_EXTR_TEXT_CHARS); //for getting back string only
     }
 
@@ -85,7 +91,10 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
     }
 
     @Override
-    public boolean index() throws Ingester.IngesterException {
+    public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
+
         boolean success = false;
         Reader reader = null;
 
@@ -94,30 +103,30 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
         try {
             Metadata meta = new Metadata();
             /* Tika parse request with timeout -- disabled for now
-            ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
-            final Future<?> future = tikaParseExecutor.submit(parseTask);
-            try {
-                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
-            } catch (TimeoutException te) {
-                final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
-                logger.log(Level.WARNING, msg);
-                throw new IngesterException(msg);
-            }
-            catch (Exception ex) {
-                final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
-                logger.log(Level.WARNING, msg, ex);
-                throw new IngesterException(msg);
-            }
-
-            reader = parseTask.getReader();
-            */
+            ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
+            final Future<?> future = tikaParseExecutor.submit(parseTask);
+            try {
+                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            } catch (TimeoutException te) {
+                final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
+                logger.log(Level.WARNING, msg);
+                throw new IngesterException(msg);
+            }
+            catch (Exception ex) {
+                final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
+                logger.log(Level.WARNING, msg, ex);
+                throw new IngesterException(msg);
+            }
+
+            reader = parseTask.getReader();
+            */
             try {
                 reader = tika.parse(stream, meta);
             } catch (IOException ex) {
                 logger.log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
                 reader = null;
             }
 
 
             if (reader == null) {
                 //likely due to exception in parse()
                 logger.log(Level.WARNING, "No reader available from Tika parse");
@@ -230,8 +239,25 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
         return success;
     }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return true;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        String fileNameLower = file.getName().toLowerCase();
+        for (int i = 0; i < SUPPORTED_EXTENSIONS.length; ++i) {
+            if (fileNameLower.endsWith(SUPPORTED_EXTENSIONS[i])) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     /**
-     * Runnable and timeable task that calls tika to parse the content using streaming
+     * Runnable and timeable task that calls tika to parse the content using
+     * streaming
      */
     private static class ParseRequestTask implements Runnable {
 
@@ -65,12 +65,6 @@ public class Ingester {
     private final ExecutorService upRequestExecutor = Executors.newSingleThreadExecutor();
     private final Server solrServer = KeywordSearch.getServer();
     private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV();
-    // TODO: use a more robust method than checking file extension
-    // supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
-    static final String[] ingestibleExtensions = {"tar", "jar", "zip", "gzip", "bzip2",
-        "gz", "tgz", "odf", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt", "log", "manifest",
-        "bmp", "gif", "png", "jpeg", "jpg", "tiff", "mp3", "aiff", "au", "midi", "wav",
-        "pst", "xml", "class", "dwg", "eml", "emlx", "mbox", "mht"};
 
 
     private static Ingester instance;
@@ -452,29 +446,4 @@ public class Ingester {
         }
     }
 
-    /**
-     * Determine if the file content is ingestible/indexable by keyword search
-     * Ingestible abstract file is either a directory, or an allocated file with supported extensions.
-     * Note: currently only checks by extension and abstract type, it does not check actual file content.
-     * @param aFile
-     * @return true if it is ingestible, false otherwise
-     */
-    static boolean isIngestible(AbstractFile aFile) {
-        TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
-        if (! aType.equals(TSK_DB_FILES_TYPE_ENUM.FS) ) {
-            return false;
-        }
-
-        FsContent fsContent = (FsContent) aFile;
-
-        boolean isIngestible = false;
-        final String fileName = fsContent.getName();
-        for (final String ext : ingestibleExtensions) {
-            if (fileName.toLowerCase().endsWith(ext)) {
-                isIngestible = true;
-                break;
-            }
-        }
-        return isIngestible;
-    }
 }
@@ -95,6 +95,8 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
     private SleuthkitCase caseHandle = null;
     private boolean skipKnown = true;
     private boolean initialized = false;
+    private List<AbstractFileExtract> textExtractors;
+    private AbstractFileStringExtract stringExtractor;
 
     private enum IngestStatus {
 
@@ -256,9 +258,15 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
 
         this.managerProxy = managerProxy;
 
-        Server solrServer = KeywordSearch.getServer();
+        ingester = Server.getIngester();
+
+        //initialize extractors
+        stringExtractor = new AbstractFileStringExtract();
+        textExtractors = new ArrayList<AbstractFileExtract>();
+        //order matters, more specific extractors first
+        textExtractors.add(new AbstractFileHtmlExtract());
+        textExtractors.add(new AbstractFileTikaTextExtract());
 
-        ingester = solrServer.getIngester();
-
         ingestStatus = new HashMap<Long, IngestStatus>();
 
@@ -505,21 +513,44 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
      *
      * @param aFile file to extract strings from, divide into chunks and
      * index
-     * @param stringsOnly true if use stinrg extraction, false if use Tika
-     * text extractor
+     * @param stringsOnly true if use string extraction, false if to use a
+     * content-type specific text extractor
      * @return true if the file was indexed, false otherwise
      * @throws IngesterException exception thrown if indexing failed
      */
     private boolean extractIndex(AbstractFile aFile, boolean stringsOnly) throws IngesterException {
-        AbstractFileExtract fileExtract;
+        AbstractFileExtract fileExtract = null;
+
         if (stringsOnly) {
-            fileExtract = new AbstractFileStringExtract(aFile);
+            fileExtract = stringExtractor;
         } else {
-            fileExtract = new AbstractFileTikaTextExtract(aFile);
+            //go over available text extractors and pick the first one (most specific one)
+            for (AbstractFileExtract fe : textExtractors) {
+                if (fe.isSupported(aFile)) {
+                    fileExtract = fe;
+                    break;
+                }
+            }
         }
 
+        if (fileExtract == null) {
+            throw new IngesterException("No supported file extractor found for file: " + aFile.getId() + " " + aFile.getName());
+        }
+
+        //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
+
         //divide into chunks and index
-        return fileExtract.index();
+        return fileExtract.index(aFile);
     }
 
+    private boolean isTextExtractSupported(AbstractFile aFile) {
+        for (AbstractFileExtract extractor : textExtractors) {
+            if (extractor.isContentTypeSpecific() == true
+                    && extractor.isSupported(aFile)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     private void indexFile(AbstractFile aFile, boolean indexContent) {
@@ -547,11 +578,10 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
             return;
         }
 
-        boolean ingestibleFile = Ingester.isIngestible(aFile);
-
-        if (fsContent != null && ingestibleFile == true) {
-            //we know it's an allocated fs file (FsContent) with supported content
-            //extract text with Tika, divide into chunks and index with Solr
+        boolean extractTextSupported = isTextExtractSupported(aFile);
+        if (fsContent != null && extractTextSupported) {
+            //we know it's an allocated FS file (since it's FsContent)
+            //extract text with one of the extractors, divide into chunks and index with Solr
             try {
                 //logger.log(Level.INFO, "indexing: " + fsContent.getName());
                 if (!extractIndex(aFile, false)) {
@@ -564,7 +594,6 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
 
                 } else {
                     ingestStatus.put(aFile.getId(), IngestStatus.INGESTED);
-
                 }
 
             } catch (IngesterException e) {
@@ -715,13 +744,13 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
 
         for (String termResult : queryResult.keySet()) {
             List<ContentHit> queryTermResults = queryResult.get(termResult);
 
 
             //translate to list of IDs that we keep track of
             List<Long> queryTermResultsIDs = new ArrayList<Long>();
             for (ContentHit ch : queryTermResults) {
                 queryTermResultsIDs.add(ch.getId());
             }
 
 
             Keyword termResultK = new Keyword(termResult, !isRegex);
             List<Long> curTermResults = currentResults.get(termResultK);
             if (curTermResults == null) {
@@ -938,7 +967,6 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
         }
     }
 
-
     /**
      * Set the skip known files setting on the service
     *