Merge pull request #2601 from millmanorama/2107-application/x-font-ttf

Clean up exception throwing. Reinstate application/x-font-ttf as a …
This commit is contained in:
Richard Cordovano 2017-03-15 09:35:30 -04:00 committed by GitHub
commit 104d4bdee2

View File

@ -23,7 +23,6 @@ import java.io.IOException;
import java.io.PushbackReader; import java.io.PushbackReader;
import java.io.Reader; import java.io.Reader;
import java.util.List; import java.util.List;
import java.util.MissingResourceException;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.concurrent.Future; import java.util.concurrent.Future;
@ -51,9 +50,9 @@ class TikaTextExtractor extends FileTextExtractor {
private static final List<String> TIKA_SUPPORTED_TYPES private static final List<String> TIKA_SUPPORTED_TYPES
= new Tika().getParser().getSupportedTypes(new ParseContext()) = new Tika().getParser().getSupportedTypes(new ParseContext())
.stream() .stream()
.map(mt -> mt.getType() + "/" + mt.getSubtype()) .map(mt -> mt.getType() + "/" + mt.getSubtype())
.collect(Collectors.toList()); .collect(Collectors.toList());
@Override @Override
public void logWarning(final String msg, Exception ex) { public void logWarning(final String msg, Exception ex) {
@ -62,7 +61,7 @@ class TikaTextExtractor extends FileTextExtractor {
} }
@Override @Override
public Reader getReader(AbstractFile sourceFile) throws TextExtractorException, MissingResourceException { public Reader getReader(AbstractFile sourceFile) throws TextExtractorException {
ReadContentInputStream stream = new ReadContentInputStream(sourceFile); ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
Metadata metadata = new Metadata(); Metadata metadata = new Metadata();
@ -75,7 +74,7 @@ class TikaTextExtractor extends FileTextExtractor {
PushbackReader pushbackReader = new PushbackReader(tikaReader); PushbackReader pushbackReader = new PushbackReader(tikaReader);
int read = pushbackReader.read(); int read = pushbackReader.read();
if (read == -1) { if (read == -1) {
throw new TextExtractorException("Tika returned empty reader for " + sourceFile); throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + sourceFile);
} }
pushbackReader.unread(read); pushbackReader.unread(read);
@ -86,6 +85,8 @@ class TikaTextExtractor extends FileTextExtractor {
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName()); final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
logWarning(msg, te); logWarning(msg, te);
throw new TextExtractorException(msg, te); throw new TextExtractorException(msg, te);
} catch (TextExtractorException ex) {
throw ex;
} catch (Exception ex) { } catch (Exception ex) {
KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex.getCause()); //NON-NLS KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex.getCause()); //NON-NLS
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName()); final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName());
@ -107,10 +108,10 @@ class TikaTextExtractor extends FileTextExtractor {
static private CharSource getMetaDataCharSource(Metadata metadata) { static private CharSource getMetaDataCharSource(Metadata metadata) {
return CharSource.wrap( return CharSource.wrap(
new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n") new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
.append(Stream.of(metadata.names()).sorted() .append(Stream.of(metadata.names()).sorted()
.map(key -> key + ": " + metadata.get(key)) .map(key -> key + ": " + metadata.get(key))
.collect(Collectors.joining("\n")) .collect(Collectors.joining("\n"))
)); ));
} }
@Override @Override
@ -124,8 +125,7 @@ class TikaTextExtractor extends FileTextExtractor {
|| FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used) || FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
|| FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat) || FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
|| (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|| detectedFormat.equals("application/x-font-ttf")) { // Tika currently has a bug in the ttf parser in fontbox; It will throw an out of memory exception//NON-NLS ) {
return false; return false;
} }
return TIKA_SUPPORTED_TYPES.contains(detectedFormat); return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
@ -167,7 +167,7 @@ class TikaTextExtractor extends FileTextExtractor {
private final Reader reader; private final Reader reader;
public ReaderCharSource(Reader reader) { ReaderCharSource(Reader reader) {
this.reader = reader; this.reader = reader;
} }