- add more archive extensions to skip

- better code to parse file extension
- do not call string extractor if it does not support the content
This commit is contained in:
adam-m 2013-01-23 12:23:12 -05:00
parent 455e4f6f89
commit c10872e42e
3 changed files with 29 additions and 19 deletions

View File

@ -54,8 +54,9 @@ class AbstractFileStringExtract implements AbstractFileExtract {
//the string extractor handles every file type except archives
//TODO use content type detection mechanism
static final String[] UNSUPPORTED_EXTENSIONS = {
//Archives (
"tar", "jar", "zip", "gzip", "bzip2", "gz", "tgz",};
//Archives
//Note: archive unpacker module will process these instead
"tar", "jar", "zip", "7z", "gzip", "bzip", "bzip2", "gz", "tgz", "cab", "rar", "arj", "dmg", "iso"};
//disabled prepending of BOM
//static {
@ -185,8 +186,13 @@ class AbstractFileStringExtract implements AbstractFileExtract {
@Override
public boolean isSupported(AbstractFile file) {
String fileNameLower = file.getName().toLowerCase();
int dotI = fileNameLower.lastIndexOf(".");
if (dotI == -1 || dotI == fileNameLower.length() - 1) {
return true; //no extension
}
final String extension = fileNameLower.substring(dotI + 1);
for (int i = 0; i < UNSUPPORTED_EXTENSIONS.length; ++i) {
if (fileNameLower.endsWith(UNSUPPORTED_EXTENSIONS[i])) {
if (extension.equals(UNSUPPORTED_EXTENSIONS[i])) {
return false;
}
}

View File

@ -71,16 +71,16 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
static final String[] SUPPORTED_EXTENSIONS = {
//MS Office
"doc", "dot", "docx", "docm", "dotx", "dotm",
"xls", "xlw", "xlt", "xlsx", "xlsm", "xltx", "xltm",
"ppt", "pps", "pot", "pptx", "pptm", "potx", "potm",
"xls", "xlw", "xlt", "xlsx", "xlsm", "xltx", "xltm",
"ppt", "pps", "pot", "pptx", "pptm", "potx", "potm",
//Open Office
"odf", "odt", "ott", "ods", "ots", "odp", "otp",
"sxw", "stw", "sxc", "stc", "sxi", "sxi",
"sdw", "sdc", "vor", "sgl",
"odf", "odt", "ott", "ods", "ots", "odp", "otp",
"sxw", "stw", "sxc", "stc", "sxi", "sxi",
"sdw", "sdc", "vor", "sgl",
//rich text, pdf
"rtf", "pdf",
"rtf", "pdf",
//html (other extractors take priority)
"html", "htm", "xhtml",
"html", "htm", "xhtml",
//text
"txt", "log", "manifest",
//images, media, other
@ -90,7 +90,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
AbstractFileTikaTextExtract() {
// Keep references to the default keyword search ingest module and to the
// ingester provided by Server, so they are available to the rest of this class.
this.module = KeywordSearchIngestModule.getDefault();
ingester = Server.getIngester();
}
@Override
@ -102,15 +102,14 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
// This extractor has no script list to report: callers always receive null
// (not an empty list), so they must null-check the result.
return null;
}
@Override
@Override
public Map<String, String> getOptions() {
// No options are exposed by this extractor: callers always receive null
// (not an empty map), so they must null-check the result.
return null;
}
@Override
public void setOptions(Map<String, String> options) {
// No-op: the supplied options are ignored by this extractor.
}
@Override
@ -281,8 +280,13 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
@Override
public boolean isSupported(AbstractFile file) {
String fileNameLower = file.getName().toLowerCase();
int dotI = fileNameLower.lastIndexOf(".");
if (dotI == -1 || dotI == fileNameLower.length() - 1) {
return false; //no extension
}
final String extension = fileNameLower.substring(dotI + 1);
for (int i = 0; i < SUPPORTED_EXTENSIONS.length; ++i) {
if (fileNameLower.endsWith(SUPPORTED_EXTENSIONS[i])) {
if (extension.equals(SUPPORTED_EXTENSIONS[i])) {
return true;
}
}
@ -318,8 +322,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
KeywordSearch.getTikaLogger().log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
tika = null;
reader = null;
}
catch (Exception ex) {
} catch (Exception ex) {
KeywordSearch.getTikaLogger().log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
tika = null;
reader = null;

View File

@ -696,7 +696,7 @@ public final class KeywordSearchIngestModule implements IngestModuleAbstractFile
private boolean extractIndex(AbstractFile aFile, boolean stringsOnly) throws IngesterException {
AbstractFileExtract fileExtract = null;
if (stringsOnly) {
if (stringsOnly && stringExtractor.isSupported(aFile)) {
fileExtract = stringExtractor;
} else {
//go over available text extractors and pick the first one (most specific one)
@ -709,7 +709,8 @@ public final class KeywordSearchIngestModule implements IngestModuleAbstractFile
}
if (fileExtract == null) {
throw new IngesterException("No supported file extractor found for file: " + aFile.getId() + " " + aFile.getName());
logger.log(Level.INFO, "No supported file extractor found for file: " + aFile.getId() + " " + aFile.getName());
return false;
}
//logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());