From f96f831bb697478695bbabeb120a32b8d2d02e49 Mon Sep 17 00:00:00 2001 From: esaunders Date: Mon, 5 Dec 2016 13:40:53 -0500 Subject: [PATCH] Make text chunks 32k or smaller. --- .../autopsy/keywordsearch/HtmlTextExtractor.java | 2 +- .../src/org/sleuthkit/autopsy/keywordsearch/Ingester.java | 2 +- .../src/org/sleuthkit/autopsy/keywordsearch/Server.java | 8 +++++++- .../autopsy/keywordsearch/StringsTextExtractor.java | 2 +- .../autopsy/keywordsearch/TikaTextExtractor.java | 2 +- 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java index 27e9ccd637..3365865f0c 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java @@ -43,7 +43,7 @@ class HtmlTextExtractor implements TextExtractor { private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName()); private static Ingester ingester; static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET; - static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; + static final int MAX_EXTR_TEXT_CHARS = 31 * 1024; private static final int SINGLE_READ_CHARS = 1024; private static final int EXTRA_CHARS = 128; //for whitespace private static final int MAX_SIZE = 50000000; diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java index 256d4508f2..0995535538 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java @@ -59,7 +59,7 @@ class Ingester { //for ingesting chunk as SolrInputDocument (non-content-streaming, by-pass tika) //TODO use a streaming way to add content to /update handler - private static final int MAX_DOC_CHUNK_SIZE = 1024 * 1024; + private static final int MAX_DOC_CHUNK_SIZE = 32 * 1024; private static final String ENCODING = "UTF-8"; //NON-NLS private Ingester() { diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java index 2304879fc9..f44eb9ec3f 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java @@ -97,6 +97,12 @@ public class Server { return "content"; //NON-NLS } }, + CONTENT_STR { + @Override + public String toString() { + return "content_str"; //NON-NLS + } + }, TEXT { @Override public String toString() { @@ -153,7 +159,7 @@ public class Server { public static final String HL_ANALYZE_CHARS_UNLIMITED = "500000"; //max 1MB in a chunk. use -1 for unlimited, but -1 option may not be supported (not documented) //max content size we can send to Solr - public static final long MAX_CONTENT_SIZE = 1L * 1024 * 1024 * 1024; + public static final long MAX_CONTENT_SIZE = 1L * 31 * 1024 * 1024; private static final Logger logger = Logger.getLogger(Server.class.getName()); private static final String DEFAULT_CORE_NAME = "coreCase"; //NON-NLS public static final String CORE_EVT = "CORE_EVT"; //NON-NLS diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java index 3bbc97dcfc..3a3e1f9cfa 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java @@ -40,7 +40,7 @@ class StringsTextExtractor implements TextExtractor { private static Ingester ingester; private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName()); - private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L; + private static final long MAX_STRING_CHUNK_SIZE = 1 * 31 * 1024L; //private static final int BOM_LEN = 3; private static final int BOM_LEN = 0; //disabled prepending of BOM private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET; diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java index 479931a754..2d49d5137c 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java @@ -62,7 +62,7 @@ class TikaTextExtractor implements TextExtractor { private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName()); private static Ingester ingester; private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET; - private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; + private static final int MAX_EXTR_TEXT_CHARS = 16 * 1024; private static final int SINGLE_READ_CHARS = 1024; private static final int EXTRA_CHARS = 128; //for whitespace private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];