Make text chunks 32k or smaller.

This commit is contained in:
esaunders 2016-12-05 13:40:53 -05:00
parent 950ac1aea5
commit f96f831bb6
5 changed files with 11 additions and 5 deletions

View File

@@ -43,7 +43,7 @@ class HtmlTextExtractor implements TextExtractor {
private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
private static Ingester ingester;
static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET;
static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
static final int MAX_EXTR_TEXT_CHARS = 31 * 1024;
private static final int SINGLE_READ_CHARS = 1024;
private static final int EXTRA_CHARS = 128; //for whitespace
private static final int MAX_SIZE = 50000000;

View File

@@ -59,7 +59,7 @@ class Ingester {
//for ingesting chunk as SolrInputDocument (non-content-streaming, by-pass tika)
//TODO use a streaming way to add content to /update handler
private static final int MAX_DOC_CHUNK_SIZE = 1024 * 1024;
private static final int MAX_DOC_CHUNK_SIZE = 32 * 1024;
private static final String ENCODING = "UTF-8"; //NON-NLS
private Ingester() {

View File

@@ -97,6 +97,12 @@ public class Server {
return "content"; //NON-NLS
}
},
CONTENT_STR {
@Override
public String toString() {
return "content_str"; //NON-NLS
}
},
TEXT {
@Override
public String toString() {
@@ -153,7 +159,7 @@ public class Server {
public static final String HL_ANALYZE_CHARS_UNLIMITED = "500000"; //max 1MB in a chunk. use -1 for unlimited, but -1 option may not be supported (not documented)
//max content size we can send to Solr
public static final long MAX_CONTENT_SIZE = 1L * 1024 * 1024 * 1024;
public static final long MAX_CONTENT_SIZE = 1L * 31 * 1024 * 1024;
private static final Logger logger = Logger.getLogger(Server.class.getName());
private static final String DEFAULT_CORE_NAME = "coreCase"; //NON-NLS
public static final String CORE_EVT = "CORE_EVT"; //NON-NLS

View File

@@ -40,7 +40,7 @@ class StringsTextExtractor implements TextExtractor {
private static Ingester ingester;
private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
private static final long MAX_STRING_CHUNK_SIZE = 1 * 31 * 1024L;
//private static final int BOM_LEN = 3;
private static final int BOM_LEN = 0; //disabled prepending of BOM
private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;

View File

@@ -62,7 +62,7 @@ class TikaTextExtractor implements TextExtractor {
private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
private static Ingester ingester;
private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
private static final int MAX_EXTR_TEXT_CHARS = 16 * 1024;
private static final int SINGLE_READ_CHARS = 1024;
private static final int EXTRA_CHARS = 128; //for whitespace
private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];