mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-06 21:00:22 +00:00
- add more archive extensions to skip
- better code to parse file extension
- do not call string extractor if it does not support the content
This commit is contained in:
parent
455e4f6f89
commit
c10872e42e
@ -54,8 +54,9 @@ class AbstractFileStringExtract implements AbstractFileExtract {
|
||||
//string extractor extracts from all file types other than archives
|
||||
//TODO use a content type detection mechanism instead of matching file extensions
|
||||
static final String[] UNSUPPORTED_EXTENSIONS = {
|
||||
//Archives (
|
||||
"tar", "jar", "zip", "gzip", "bzip2", "gz", "tgz",};
|
||||
//Archives
|
||||
//Note: archive unpacker module will process these instead
|
||||
"tar", "jar", "zip", "7z", "gzip", "bzip", "bzip2", "gz", "tgz", "cab", "rar", "arj", "dmg", "iso"};
|
||||
|
||||
//disabled prepending of BOM
|
||||
//static {
|
||||
@ -185,8 +186,13 @@ class AbstractFileStringExtract implements AbstractFileExtract {
|
||||
@Override
|
||||
public boolean isSupported(AbstractFile file) {
|
||||
String fileNameLower = file.getName().toLowerCase();
|
||||
int dotI = fileNameLower.lastIndexOf(".");
|
||||
if (dotI == -1 || dotI == fileNameLower.length() - 1) {
|
||||
return true; //no extension
|
||||
}
|
||||
final String extension = fileNameLower.substring(dotI + 1);
|
||||
for (int i = 0; i < UNSUPPORTED_EXTENSIONS.length; ++i) {
|
||||
if (fileNameLower.endsWith(UNSUPPORTED_EXTENSIONS[i])) {
|
||||
if (extension.equals(UNSUPPORTED_EXTENSIONS[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -71,16 +71,16 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
||||
static final String[] SUPPORTED_EXTENSIONS = {
|
||||
//MS Office
|
||||
"doc", "dot", "docx", "docm", "dotx", "dotm",
|
||||
"xls", "xlw", "xlt", "xlsx", "xlsm", "xltx", "xltm",
|
||||
"ppt", "pps", "pot", "pptx", "pptm", "potx", "potm",
|
||||
"xls", "xlw", "xlt", "xlsx", "xlsm", "xltx", "xltm",
|
||||
"ppt", "pps", "pot", "pptx", "pptm", "potx", "potm",
|
||||
//Open Office
|
||||
"odf", "odt", "ott", "ods", "ots", "odp", "otp",
|
||||
"sxw", "stw", "sxc", "stc", "sxi", "sxi",
|
||||
"sdw", "sdc", "vor", "sgl",
|
||||
"odf", "odt", "ott", "ods", "ots", "odp", "otp",
|
||||
"sxw", "stw", "sxc", "stc", "sxi", "sxi",
|
||||
"sdw", "sdc", "vor", "sgl",
|
||||
//rich text, pdf
|
||||
"rtf", "pdf",
|
||||
"rtf", "pdf",
|
||||
//html (other extractors take priority)
|
||||
"html", "htm", "xhtml",
|
||||
"html", "htm", "xhtml",
|
||||
//text
|
||||
"txt", "log", "manifest",
|
||||
//images, media, other
|
||||
@ -90,7 +90,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
||||
AbstractFileTikaTextExtract() {
|
||||
this.module = KeywordSearchIngestModule.getDefault();
|
||||
ingester = Server.getIngester();
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -102,15 +102,14 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
||||
public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
@Override
|
||||
public Map<String, String> getOptions() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setOptions(Map<String, String> options) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -281,8 +280,13 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
||||
@Override
|
||||
public boolean isSupported(AbstractFile file) {
|
||||
String fileNameLower = file.getName().toLowerCase();
|
||||
int dotI = fileNameLower.lastIndexOf(".");
|
||||
if (dotI == -1 || dotI == fileNameLower.length() - 1) {
|
||||
return false; //no extension
|
||||
}
|
||||
final String extension = fileNameLower.substring(dotI + 1);
|
||||
for (int i = 0; i < SUPPORTED_EXTENSIONS.length; ++i) {
|
||||
if (fileNameLower.endsWith(SUPPORTED_EXTENSIONS[i])) {
|
||||
if (extension.equals(SUPPORTED_EXTENSIONS[i])) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -318,8 +322,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
||||
KeywordSearch.getTikaLogger().log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
|
||||
tika = null;
|
||||
reader = null;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
} catch (Exception ex) {
|
||||
KeywordSearch.getTikaLogger().log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
|
||||
tika = null;
|
||||
reader = null;
|
||||
|
@ -696,7 +696,7 @@ public final class KeywordSearchIngestModule implements IngestModuleAbstractFile
|
||||
private boolean extractIndex(AbstractFile aFile, boolean stringsOnly) throws IngesterException {
|
||||
AbstractFileExtract fileExtract = null;
|
||||
|
||||
if (stringsOnly) {
|
||||
if (stringsOnly && stringExtractor.isSupported(aFile)) {
|
||||
fileExtract = stringExtractor;
|
||||
} else {
|
||||
//go over available text extractors and pick the first one (most specific one)
|
||||
@ -709,7 +709,8 @@ public final class KeywordSearchIngestModule implements IngestModuleAbstractFile
|
||||
}
|
||||
|
||||
if (fileExtract == null) {
|
||||
throw new IngesterException("No supported file extractor found for file: " + aFile.getId() + " " + aFile.getName());
|
||||
logger.log(Level.INFO, "No supported file extractor found for file: " + aFile.getId() + " " + aFile.getName());
|
||||
return false;
|
||||
}
|
||||
|
||||
//logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
|
||||
|
Loading…
x
Reference in New Issue
Block a user