Minor adjustments to keyword search: add the htm extension as supported by Solr and set the content type of extracted text to a more compliant value

This commit is contained in:
adam-m 2012-02-22 10:14:21 -05:00
parent 2e26827371
commit 8fbad4fe56
3 changed files with 9 additions and 10 deletions

View File

@ -91,7 +91,7 @@ public class FsContentStringStream implements ContentStream {
@Override @Override
public String getContentType() { public String getContentType() {
return encoding.toString(); return "text/plain; charset = " + encoding.toString();
} }
@Override @Override

View File

@ -43,13 +43,8 @@ import org.sleuthkit.datamodel.TskData;
class GetIngestableFilesContentVisitor extends GetFilesContentVisitor { class GetIngestableFilesContentVisitor extends GetFilesContentVisitor {
private static final Logger logger = Logger.getLogger(GetIngestableFilesContentVisitor.class.getName()); private static final Logger logger = Logger.getLogger(GetIngestableFilesContentVisitor.class.getName());
// TODO: use a more robust method than checking file extension to determine
// whether to try a file private static final String[] supportedExtensions = KeywordSearchIngestService.ingestibleExtensions;
// supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
private static final String[] supportedExtensions = {"tar", "jar", "zip", "bzip2",
"gz", "tgz", "doc", "xls", "ppt", "rtf", "pdf", "html", "xhtml", "txt",
"bmp", "gif", "png", "jpeg", "tiff", "mp3", "aiff", "au", "midi", "wav",
"pst", "xml", "class"};
// the full predicate of a SQLite statement to match supported extensions // the full predicate of a SQLite statement to match supported extensions
private static final String extensionsLikePredicate; private static final String extensionsLikePredicate;

View File

@ -73,8 +73,12 @@ public final class KeywordSearchIngestService implements IngestServiceFsContent
private volatile int messageID = 0; private volatile int messageID = 0;
private volatile boolean finalRun = false; private volatile boolean finalRun = false;
private SleuthkitCase caseHandle = null; private SleuthkitCase caseHandle = null;
private static final String[] ingestibleExtensions = {"tar", "jar", "zip", "bzip2",
"gz", "tgz", "doc", "xls", "ppt", "rtf", "pdf", "html", "xhtml", "txt", // TODO: use a more robust method than checking file extension to determine
// whether to try a file
// supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
static final String[] ingestibleExtensions = {"tar", "jar", "zip", "bzip2",
"gz", "tgz", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt",
"bmp", "gif", "png", "jpeg", "tiff", "mp3", "aiff", "au", "midi", "wav", "bmp", "gif", "png", "jpeg", "tiff", "mp3", "aiff", "au", "midi", "wav",
"pst", "xml", "class"}; "pst", "xml", "class"};