diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileExtract.java
index e1501a8d34..75f22283d3 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileExtract.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileExtract.java
@@ -41,8 +41,24 @@ interface AbstractFileExtract {
 
     /**
      * Index the Abstract File
+     * @param sourceFile file to index
      * @return true if indexed successfully, false otherwise
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
-    boolean index() throws Ingester.IngesterException;
+    boolean index(AbstractFile sourceFile) throws Ingester.IngesterException;
+
+    /**
+     * Determines whether the extractor works only for the content types
+     * it reports via isSupported(), or is a generic content extractor (such as the string extractor)
+     * @return true if the extractor is content-type specific, false otherwise
+     */
+    boolean isContentTypeSpecific();
+
+    /**
+     * Determines if the file content is supported by the extractor,
+     * if isContentTypeSpecific() returns true.
+     * @param file file to test whether its content is supported
+     * @return true if the file content is supported, false otherwise
+     */
+    boolean isSupported(AbstractFile file);
 }
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java
index 3516e68b57..e97d420264 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java
@@ -46,12 +46,11 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
     private AbstractFile sourceFile;
     private int numChunks = 0;
     private static final String UTF16BOM = "\uFEFF";
+    private static final String[] SUPPORTED_EXTENSIONS = {"htm", "html", "xhtml", "css", "js"};
 
-    AbstractFileHtmlExtract(AbstractFile sourceFile) {
-        this.sourceFile = sourceFile;
+    AbstractFileHtmlExtract() {
         this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
     }
 
     @Override
@@ -65,9 +64,13 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
     }
 
     @Override
-    public boolean index() throws IngesterException {
+    public boolean index(AbstractFile sourceFile) throws IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
+
         boolean success = false;
         Reader reader = null;
+
         final InputStream stream = new ReadContentInputStream(sourceFile);
         try {
@@ -173,4 +176,20 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
         return success;
     }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return true;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        String fileNameLower = file.getName().toLowerCase();
+        for (int i = 0; i < SUPPORTED_EXTENSIONS.length; ++i) {
+            if (fileNameLower.endsWith(SUPPORTED_EXTENSIONS[i])) {
+                return true;
+            }
+        }
+        return false;
+    }
+
 }
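
Note on the matching above: AbstractFileHtmlExtract.isSupported() lowercases the file name and tests endsWith() against each entry in SUPPORTED_EXTENSIONS. A minimal, self-contained sketch of that logic (using a plain String name instead of an AbstractFile, purely for illustration) shows the case-insensitive match and one endsWith() caveat:

    // Illustrative stand-in for AbstractFileHtmlExtract.isSupported();
    // the real method takes an AbstractFile, not a String.
    public class ExtensionMatchDemo {

        private static final String[] SUPPORTED_EXTENSIONS = {"htm", "html", "xhtml", "css", "js"};

        static boolean isSupported(String fileName) {
            String fileNameLower = fileName.toLowerCase();
            for (String ext : SUPPORTED_EXTENSIONS) {
                if (fileNameLower.endsWith(ext)) {
                    return true;
                }
            }
            return false;
        }

        public static void main(String[] args) {
            System.out.println(isSupported("index.HTML")); // true: match is case-insensitive
            System.out.println(isSupported("style.css"));  // true
            System.out.println(isSupported("report.pdf")); // false: falls through to the Tika extractor
            System.out.println(isSupported("mirrorjs"));   // true: suffix match without a dot (likely unintended)
        }
    }

Because endsWith() is not anchored at a dot, the check is slightly broader than a true extension test; a name that merely ends in a supported suffix also matches.
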
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java
index 72b30e49d7..565a8c37a6 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java
@@ -40,24 +40,27 @@ class AbstractFileStringExtract implements AbstractFileExtract {
     private int numChunks;
     private static final Logger logger = Logger.getLogger(AbstractFileStringExtract.class.getName());
     static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
-    private AbstractFile aFile;
+    private AbstractFile sourceFile;
     //single static buffer for all extractions. Safe, indexing can only happen in one thread
     private static final byte[] STRING_CHUNK_BUF = new byte[(int) MAX_STRING_CHUNK_SIZE];
     private static final int BOM_LEN = 3;
+
+    //private static final StringExtract se = new StringExtract();
 
     static {
         //prepend UTF-8 BOM to start of the buffer
         STRING_CHUNK_BUF[0] = (byte) 0xEF;
         STRING_CHUNK_BUF[1] = (byte) 0xBB;
         STRING_CHUNK_BUF[2] = (byte) 0xBF;
+
+        //se.init();
+
     }
 
-    public AbstractFileStringExtract(AbstractFile aFile) {
-        this.aFile = aFile;
-        numChunks = 0; //unknown until indexing is done
+    public AbstractFileStringExtract() {
         this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
     }
 
     @Override
@@ -67,15 +70,17 @@ class AbstractFileStringExtract implements AbstractFileExtract {
 
     @Override
     public AbstractFile getSourceFile() {
-        return aFile;
+        return sourceFile;
    }
 
     @Override
-    public boolean index() throws IngesterException {
+    public boolean index(AbstractFile sourceFile) throws IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
         boolean success = false;
-
+
         //construct stream that extracts text as we read it
-        final InputStream stringStream = new AbstractFileStringStream(aFile, ByteContentStream.Encoding.UTF8);
+        final InputStream stringStream = new AbstractFileStringStream(sourceFile, ByteContentStream.Encoding.UTF8);
 
         try {
             success = true;
@@ -93,7 +98,7 @@ class AbstractFileStringExtract implements AbstractFileExtract {
                     ++this.numChunks;
                 } catch (IngesterException ingEx) {
                     success = false;
-                    logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ingEx);
+                    logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);
                     throw ingEx; //need to rethrow/return to signal error and move on
                 }
@@ -109,19 +114,31 @@ class AbstractFileStringExtract implements AbstractFileExtract {
                 ingester.ingest(this);
             }
         } catch (IOException ex) {
-            logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + aFile.getName(), ex);
+            logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex);
             success = false;
         } finally {
             try {
                 stringStream.close();
             } catch (IOException ex) {
-                logger.log(Level.WARNING, "Error closing input stream stream, file: " + aFile.getName(), ex);
+                logger.log(Level.WARNING, "Error closing input stream, file: " + sourceFile.getName(), ex);
             }
         }
 
         return success;
     }
+
+    @Override
+    public boolean isContentTypeSpecific() {
+        return false;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        return true;
+    }
+
 }
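
The string extractor above keeps one static, reusable chunk buffer whose first three bytes are a UTF-8 BOM, so every chunk shipped to Solr declares its encoding without rebuilding a header; extracted string bytes are written starting at offset BOM_LEN. As the buffer comment notes, this is only safe because indexing is single-threaded. A rough sketch of the buffer layout and fill step (fillChunk is a hypothetical helper, not part of this patch):

    import java.io.IOException;
    import java.io.InputStream;

    class ChunkBufferDemo {

        static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
        // Single shared buffer; safe only because indexing happens in one thread.
        static final byte[] STRING_CHUNK_BUF = new byte[(int) MAX_STRING_CHUNK_SIZE];
        static final int BOM_LEN = 3;

        static {
            // UTF-8 BOM written once; every chunk reuses it.
            STRING_CHUNK_BUF[0] = (byte) 0xEF;
            STRING_CHUNK_BUF[1] = (byte) 0xBB;
            STRING_CHUNK_BUF[2] = (byte) 0xBF;
        }

        // Reads extracted-string bytes into the buffer after the BOM.
        // Returns the total bytes to send (BOM + payload), or -1 at end of stream.
        static int fillChunk(InputStream stringStream) throws IOException {
            int read = stringStream.read(STRING_CHUNK_BUF, BOM_LEN,
                    (int) MAX_STRING_CHUNK_SIZE - BOM_LEN);
            return (read <= 0) ? -1 : read + BOM_LEN;
        }
    }
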
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java
index fef7a7aed2..64f11529f4 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java
@@ -42,11 +42,12 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 
 /**
  * Extractor of text from TIKA supported AbstractFile content. Extracted text is
- * divided into chunks and indexed with Solr.
- * Protects against Tika parser hangs (for unexpected/corrupt content) using a timeout mechanism.
- * If Tika extraction succeeds, chunks are indexed with Solr.
+ * divided into chunks and indexed with Solr. Protects against Tika parser hangs
+ * (for unexpected/corrupt content) using a timeout mechanism. If Tika
+ * extraction succeeds, chunks are indexed with Solr.
  *
- * This Tika extraction/chunking utility is useful for large files of Tika parsers-supported content type.
+ * This Tika extraction/chunking utility is useful for large files of Tika
+ * parsers-supported content type.
  *
 */
 public class AbstractFileTikaTextExtract implements AbstractFileExtract {
@@ -58,19 +59,24 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
     private static final int SINGLE_READ_CHARS = 1024;
     private static final int EXTRA_CHARS = 128; //for whitespace
     private static final char[] TEXT_CHUNK_BUF = new char[MAX_EXTR_TEXT_CHARS];
-    private static final Tika tika = new Tika();
+    private Tika tika;
     private KeywordSearchIngestService service;
-    private Ingester ingester;
-    private AbstractFile sourceFile;
+    private static Ingester ingester;
+    private AbstractFile sourceFile; //currently processed file
     private int numChunks = 0;
     private static final String UTF16BOM = "\uFEFF";
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
+    // TODO: use a more robust method than checking file extension
+    // supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
+    static final String[] SUPPORTED_EXTENSIONS = {"tar", "jar", "zip", "gzip", "bzip2",
+        "gz", "tgz", "odf", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt", "log", "manifest",
+        "bmp", "gif", "png", "jpeg", "jpg", "tiff", "mp3", "aiff", "au", "midi", "wav",
+        "pst", "xml", "class", "dwg", "eml", "emlx", "mbox", "mht"};
 
-    AbstractFileTikaTextExtract(AbstractFile sourceFile) {
-        this.sourceFile = sourceFile;
+    AbstractFileTikaTextExtract() {
         this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
+        tika = new Tika();
         //tika.setMaxStringLength(MAX_EXTR_TEXT_CHARS); //for getting back string only
     }
 
@@ -85,7 +91,10 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
     }
 
     @Override
-    public boolean index() throws Ingester.IngesterException {
+    public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
+
         boolean success = false;
         Reader reader = null;
 
@@ -94,30 +103,30 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
         try {
             Metadata meta = new Metadata();
             /* Tika parse request with timeout -- disabled for now
-            ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
-            final Future future = tikaParseExecutor.submit(parseTask);
-            try {
-                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
-            } catch (TimeoutException te) {
-                final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
-                logger.log(Level.WARNING, msg);
-                throw new IngesterException(msg);
-            }
-            catch (Exception ex) {
-                final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
-                logger.log(Level.WARNING, msg, ex);
-                throw new IngesterException(msg);
-            }
-
-            reader = parseTask.getReader();
-            */
+            ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
+            final Future future = tikaParseExecutor.submit(parseTask);
             try {
+                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            } catch (TimeoutException te) {
+                final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
+                logger.log(Level.WARNING, msg);
+                throw new IngesterException(msg);
+            }
+            catch (Exception ex) {
+                final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
+                logger.log(Level.WARNING, msg, ex);
+                throw new IngesterException(msg);
+            }
+
+            reader = parseTask.getReader();
+            */
+            try {
                 reader = tika.parse(stream, meta);
             } catch (IOException ex) {
                 logger.log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
                 reader = null;
             }
-
+
             if (reader == null) {
                 //likely due to exception in parse()
                 logger.log(Level.WARNING, "No reader available from Tika parse");
@@ -230,8 +239,25 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
         return success;
     }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return true;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        String fileNameLower = file.getName().toLowerCase();
+        for (int i = 0; i < SUPPORTED_EXTENSIONS.length; ++i) {
+            if (fileNameLower.endsWith(SUPPORTED_EXTENSIONS[i])) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     /**
-     * Runnable and timeable task that calls tika to parse the content using streaming
+     * Runnable and timeable task that calls tika to parse the content using
+     * streaming
     */
     private static class ParseRequestTask implements Runnable {
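
The re-indented comment block above preserves the disabled timeout path: submit the parse to the single-thread tikaParseExecutor and bound it with Future.get(timeout), converting a TimeoutException into an IngesterException. A hedged sketch of that pattern (parseWithTimeout and the anonymous Callable are illustrative; the patch's own vehicle is the ParseRequestTask shown above):

    import java.io.InputStream;
    import java.io.Reader;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.TimeoutException;
    import org.apache.tika.Tika;
    import org.apache.tika.metadata.Metadata;

    class TikaTimeoutDemo {

        private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();

        // Bounds a potentially hanging Tika parse with a timeout.
        Reader parseWithTimeout(final Tika tika, final InputStream stream,
                final Metadata meta, long timeoutSec) throws Exception {
            Future<Reader> future = tikaParseExecutor.submit(new Callable<Reader>() {
                @Override
                public Reader call() throws Exception {
                    return tika.parse(stream, meta);
                }
            });
            try {
                return future.get(timeoutSec, TimeUnit.SECONDS);
            } catch (TimeoutException te) {
                future.cancel(true); // best effort; a truly hung parser thread may linger
                throw te;
            }
        }
    }

Note the trade-off: with a single-thread executor, a parse that ignores interruption can still block later submissions; the patch sidesteps this for now by calling tika.parse() directly.
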
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index ec8b96dd1b..1f59ac824c 100755
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -65,12 +65,6 @@ public class Ingester {
     private final ExecutorService upRequestExecutor = Executors.newSingleThreadExecutor();
     private final Server solrServer = KeywordSearch.getServer();
     private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV();
-    // TODO: use a more robust method than checking file extension
-    // supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
-    static final String[] ingestibleExtensions = {"tar", "jar", "zip", "gzip", "bzip2",
-        "gz", "tgz", "odf", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt", "log", "manifest",
-        "bmp", "gif", "png", "jpeg", "jpg", "tiff", "mp3", "aiff", "au", "midi", "wav",
-        "pst", "xml", "class", "dwg", "eml", "emlx", "mbox", "mht"};
     private static Ingester instance;
 
@@ -452,29 +446,4 @@ public class Ingester {
         }
     }
 
-    /**
-     * Determine if the file content is ingestible/indexable by keyword search
-     * Ingestible abstract file is either a directory, or an allocated file with supported extensions.
-     * Note: currently only checks by extension and abstract type, it does not check actual file content.
-     * @param aFile
-     * @return true if it is ingestible, false otherwise
-     */
-    static boolean isIngestible(AbstractFile aFile) {
-        TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
-        if (! aType.equals(TSK_DB_FILES_TYPE_ENUM.FS) ) {
-            return false;
-        }
-
-        FsContent fsContent = (FsContent) aFile;
-
-        boolean isIngestible = false;
-        final String fileName = fsContent.getName();
-        for (final String ext : ingestibleExtensions) {
-            if (fileName.toLowerCase().endsWith(ext)) {
-                isIngestible = true;
-                break;
-            }
-        }
-        return isIngestible;
-    }
 }
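
With Ingester.isIngestible() removed, no single class owns the extension list any longer: each extractor answers for the types it handles, and the ingest service only aggregates the answers. A stripped-down sketch of the resulting dispatch (the Extractor interface and its String-based isSupported() are simplifications of AbstractFileExtract):

    import java.util.List;

    // Stand-in for AbstractFileExtract, reduced to the two query methods.
    interface Extractor {
        boolean isContentTypeSpecific();
        boolean isSupported(String fileName);
    }

    class DispatchDemo {

        // Mirrors KeywordSearchIngestService.isTextExtractSupported() below:
        // a file is text-extractable if any content-type-specific extractor claims it.
        static boolean isTextExtractSupported(List<Extractor> extractors, String fileName) {
            for (Extractor e : extractors) {
                if (e.isContentTypeSpecific() && e.isSupported(fileName)) {
                    return true;
                }
            }
            return false;
        }
    }

Adding support for a new file type now means adding one extractor class rather than editing a shared list inside Ingester.
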
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java
index de4843157d..0ef887dba3 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java
@@ -95,6 +95,8 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
     private SleuthkitCase caseHandle = null;
     private boolean skipKnown = true;
     private boolean initialized = false;
+    private List textExtractors;
+    private AbstractFileStringExtract stringExtractor;
 
     private enum IngestStatus {
 
@@ -256,9 +258,15 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
 
         this.managerProxy = managerProxy;
 
-        Server solrServer = KeywordSearch.getServer();
+        ingester = Server.getIngester();
+
+        //initialize extractors
+        stringExtractor = new AbstractFileStringExtract();
+        textExtractors = new ArrayList();
+        //order matters, more specific extractors first
+        textExtractors.add(new AbstractFileHtmlExtract());
+        textExtractors.add(new AbstractFileTikaTextExtract());
 
-        ingester = solrServer.getIngester();
         ingestStatus = new HashMap();
 
@@ -505,21 +513,44 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
      *
      * @param aFile file to extract strings from, divide into chunks and
      * index
-     * @param stringsOnly true if use stinrg extraction, false if use Tika
-     * text extractor
+     * @param stringsOnly true to use string extraction, false to use a
+     * content-type specific text extractor
      * @return true if the file was indexed, false otherwise
+     * @throws IngesterException exception thrown if indexing failed
      */
     private boolean extractIndex(AbstractFile aFile, boolean stringsOnly) throws IngesterException {
-        AbstractFileExtract fileExtract;
+        AbstractFileExtract fileExtract = null;
 
         if (stringsOnly) {
-            fileExtract = new AbstractFileStringExtract(aFile);
+            fileExtract = stringExtractor;
         } else {
-            fileExtract = new AbstractFileTikaTextExtract(aFile);
+            //go over available text extractors and pick the first one (most specific one)
+            for (AbstractFileExtract fe : textExtractors) {
+                if (fe.isSupported(aFile)) {
+                    fileExtract = fe;
+                    break;
+                }
+            }
         }
 
+        if (fileExtract == null) {
+            throw new IngesterException("No supported file extractor found for file: " + aFile.getId() + " " + aFile.getName());
+        }
+
+        //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
+
         //divide into chunks and index
-        return fileExtract.index();
+        return fileExtract.index(aFile);
+    }
+
+    private boolean isTextExtractSupported(AbstractFile aFile) {
+        for (AbstractFileExtract extractor : textExtractors) {
+            if (extractor.isContentTypeSpecific()
+                    && extractor.isSupported(aFile)) {
+                return true;
+            }
+        }
+        return false;
     }
 
     private void indexFile(AbstractFile aFile, boolean indexContent) {
@@ -547,11 +578,10 @@
             return;
         }
 
-        boolean ingestibleFile = Ingester.isIngestible(aFile);
-
-        if (fsContent != null && ingestibleFile == true) {
-            //we know it's an allocated fs file (FsContent) with supported content
-            //extract text with Tika, divide into chunks and index with Solr
+        boolean extractTextSupported = isTextExtractSupported(aFile);
+        if (fsContent != null && extractTextSupported) {
+            //we know it's an allocated FS file (since it's FsContent)
+            //extract text with one of the extractors, divide into chunks and index with Solr
             try {
                 //logger.log(Level.INFO, "indexing: " + fsContent.getName());
                 if (!extractIndex(aFile, false)) {
@@ -564,7 +594,6 @@
                 } else {
                     ingestStatus.put(aFile.getId(), IngestStatus.INGESTED);
-
                 }
 
             } catch (IngesterException e) {
@@ -715,13 +744,13 @@
             for (String termResult : queryResult.keySet()) {
                 List queryTermResults = queryResult.get(termResult);
-
+
                 //translate to list of IDs that we keep track of
                 List queryTermResultsIDs = new ArrayList();
                 for (ContentHit ch : queryTermResults) {
                     queryTermResultsIDs.add(ch.getId());
                 }
-
+
                 Keyword termResultK = new Keyword(termResult, !isRegex);
                 List curTermResults = currentResults.get(termResultK);
                 if (curTermResults == null) {
@@ -938,7 +967,6 @@
             }
         }
 
-
     /**
      * Set the skip known files setting on the service
      *
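
Taken together, extractIndex() now works against a small registry: content-specific extractors are registered most specific first (HTML before Tika, so the HTML extractor wins ties on "htm"/"html"), the first extractor whose isSupported() returns true is used, and the string extractor is the separate stringsOnly path. An illustrative usage sketch against the patched classes (the surrounding scaffolding is invented; aFile stands for an AbstractFile from the ingest pipeline):

    List<AbstractFileExtract> textExtractors = new ArrayList<AbstractFileExtract>();
    textExtractors.add(new AbstractFileHtmlExtract());     // order matters: most specific first
    textExtractors.add(new AbstractFileTikaTextExtract()); // also lists "html", but never wins it here

    AbstractFileExtract fileExtract = null;
    for (AbstractFileExtract fe : textExtractors) {
        if (fe.isSupported(aFile)) {
            fileExtract = fe;
            break;
        }
    }

    if (fileExtract == null) {
        // extractIndex() surfaces this case as an IngesterException
        throw new Ingester.IngesterException("No supported file extractor found for file: " + aFile.getName());
    }

    boolean indexed = fileExtract.index(aFile); // chunk the text and send it to Solr
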