From 8fbad4fe5655bcdeb9d8030fe5d7be5033b34cc3 Mon Sep 17 00:00:00 2001 From: adam-m Date: Wed, 22 Feb 2012 10:14:21 -0500 Subject: [PATCH] Minor adjustments to keyword search: add htm extension as supported by Solr and set content type of extracted text to more compliant --- .../autopsy/keywordsearch/FsContentStringStream.java | 2 +- .../keywordsearch/GetIngestableFilesContentVisitor.java | 9 ++------- .../keywordsearch/KeywordSearchIngestService.java | 8 ++++++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java index db367335f8..a52d1e72e8 100755 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java @@ -91,7 +91,7 @@ public class FsContentStringStream implements ContentStream { @Override public String getContentType() { - return encoding.toString(); + return "text/plain; charset=" + encoding.toString(); } @Override diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/GetIngestableFilesContentVisitor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/GetIngestableFilesContentVisitor.java index 79bd52cd09..30ac84c89f 100755 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/GetIngestableFilesContentVisitor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/GetIngestableFilesContentVisitor.java @@ -43,13 +43,8 @@ import org.sleuthkit.datamodel.TskData; class GetIngestableFilesContentVisitor extends GetFilesContentVisitor { private static final Logger logger = Logger.getLogger(GetIngestableFilesContentVisitor.class.getName()); - // TODO: use a more robust method than checking file extension to determine - // whether to try a file - // supported extensions list from 
http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika - private static final String[] supportedExtensions = {"tar", "jar", "zip", "bzip2", - "gz", "tgz", "doc", "xls", "ppt", "rtf", "pdf", "html", "xhtml", "txt", - "bmp", "gif", "png", "jpeg", "tiff", "mp3", "aiff", "au", "midi", "wav", - "pst", "xml", "class"}; + + private static final String[] supportedExtensions = KeywordSearchIngestService.ingestibleExtensions; // the full predicate of a SQLite statement to match supported extensions private static final String extensionsLikePredicate; diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java index b31439706a..f8b3c86a63 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java @@ -73,8 +73,12 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent private volatile int messageID = 0; private volatile boolean finalRun = false; private SleuthkitCase caseHandle = null; - private static final String[] ingestibleExtensions = {"tar", "jar", "zip", "bzip2", - "gz", "tgz", "doc", "xls", "ppt", "rtf", "pdf", "html", "xhtml", "txt", + + // TODO: use a more robust method than checking file extension to determine + // whether to try a file + // supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika + static final String[] ingestibleExtensions = {"tar", "jar", "zip", "bzip2", + "gz", "tgz", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt", "bmp", "gif", "png", "jpeg", "tiff", "mp3", "aiff", "au", "midi", "wav", "pst", "xml", "class"};