This commit is contained in:
Smoss 2013-03-26 12:32:32 -04:00
commit a16a2ee15a
2 changed files with 15 additions and 5 deletions

View File

@@ -22,10 +22,12 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.Reader; import java.io.Reader;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.concurrent.Future; import java.util.concurrent.Future;
@@ -38,6 +40,8 @@ import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream; import org.sleuthkit.datamodel.ReadContentInputStream;
import org.apache.tika.Tika; import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
@@ -66,11 +70,18 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
// Chunk counter, initialised to zero (presumably the number of text chunks produced so far; confirm against the code that updates it).
private int numChunks = 0; private int numChunks = 0;
//private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
// Single-threaded executor owned by this instance (presumably runs the Tika parse task; its use is outside this excerpt).
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor(); private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
// Media types accepted for extraction, stored as "type/subtype" strings.
// Populated once in the constructor from Tika's parser and queried with contains(detectedFormat).
// NOTE(review): this is a per-instance, mutable List despite the constant-style name; a Set<String>
// would suit the membership test better. Left unchanged here.
private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<String>();
/**
 * Creates the extractor: obtains the default keyword-search ingest module and
 * the server's ingester, then records every media type that Tika's parser
 * reports as supported and logs the resulting list at INFO level.
 */
AbstractFileTikaTextExtract() { AbstractFileTikaTextExtract() {
this.module = KeywordSearchIngestModule.getDefault(); this.module = KeywordSearchIngestModule.getDefault();
ingester = Server.getIngester(); ingester = Server.getIngester();
// Ask a fresh Tika instance's parser which media types it supports, using an empty ParseContext.
Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
for (MediaType mt : mediaTypes) {
// Store as "type/subtype" so the list can be compared against a detected format string.
TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype());
}
logger.log(Level.INFO, "Tika supported media types: " + TIKA_SUPPORTED_TYPES);
} }
@Override @Override
@@ -272,12 +283,11 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
return false; return false;
} }
//TODO might need to add more mime-types to ignore //TODO might need to add more mime-types to ignore
//default to true, which includes //then accept all formats supported by Tika
//text, docs, pdf and others return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
return true;
} }

View File

@@ -106,7 +106,7 @@
</zip> </zip>
<delete dir="${zip-tmp}"/> <delete dir="${zip-tmp}"/>
<delete file="${nbdist.dir}/${app.name}.zip"/> <!-- <delete file="${nbdist.dir}/${app.name}.zip"/> -->
<echo message=" "/> <echo message=" "/>
<echo message="cleaning and finalizing release" /> <echo message="cleaning and finalizing release" />