Minor adjustments to keyword search: add the htm extension to the list of file types supported by Solr, and make the content type of extracted text more standards-compliant

This commit is contained in:
adam-m 2012-02-22 10:14:21 -05:00
parent 2e26827371
commit 8fbad4fe56
3 changed files with 9 additions and 10 deletions

View File

@ -91,7 +91,7 @@ public class FsContentStringStream implements ContentStream {
/**
 * Returns the content type reported for this stream: plain text, with the
 * charset parameter taken from this stream's {@code encoding}.
 *
 * @return a media type string of the form {@code text/plain; charset=<encoding>}
 */
@Override
public String getContentType() {
    // No whitespace around '=': the media-type parameter grammar does not
    // allow it, and consumers that look for the literal "charset=" token
    // would otherwise fail to pick up the charset.
    return "text/plain; charset=" + encoding.toString();
}
@Override

View File

@ -43,13 +43,8 @@ import org.sleuthkit.datamodel.TskData;
class GetIngestableFilesContentVisitor extends GetFilesContentVisitor {
private static final Logger logger = Logger.getLogger(GetIngestableFilesContentVisitor.class.getName());
// TODO: use a more robust method than checking file extension to determine
// whether to try a file
// supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
private static final String[] supportedExtensions = {"tar", "jar", "zip", "bzip2",
"gz", "tgz", "doc", "xls", "ppt", "rtf", "pdf", "html", "xhtml", "txt",
"bmp", "gif", "png", "jpeg", "tiff", "mp3", "aiff", "au", "midi", "wav",
"pst", "xml", "class"};
private static final String[] supportedExtensions = KeywordSearchIngestService.ingestibleExtensions;
// the full predicate of a SQLite statement to match supported extensions
private static final String extensionsLikePredicate;

View File

@ -73,8 +73,12 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
private volatile int messageID = 0;
private volatile boolean finalRun = false;
private SleuthkitCase caseHandle = null;
private static final String[] ingestibleExtensions = {"tar", "jar", "zip", "bzip2",
"gz", "tgz", "doc", "xls", "ppt", "rtf", "pdf", "html", "xhtml", "txt",
// TODO: use a more robust method than checking file extension to determine
// whether to try a file
// supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
static final String[] ingestibleExtensions = {"tar", "jar", "zip", "bzip2",
"gz", "tgz", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt",
"bmp", "gif", "png", "jpeg", "tiff", "mp3", "aiff", "au", "midi", "wav",
"pst", "xml", "class"};