Mirror of https://github.com/overcuriousity/autopsy-flatpak.git (synced 2025-07-19 11:07:43 +00:00)
Bug fix in KWS chunking
commit 939d2dc4e3
parent c36620e876
@@ -45,33 +45,69 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
     private static final Charset UTF_8 = StandardCharsets.UTF_8;
 
     //Chunking algorithm paramaters-------------------------------------//
-    /** the maximum size of a chunk, including the window. */
+    /**
+     * the maximum size of a chunk, including the window.
+     */
     private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
-    /** the minimum to read before we start the process of looking for
-     * whitespace to break at and creating an overlapping window. */
+    /**
+     * the minimum to read before we start the process of looking for whitespace
+     * to break at and creating an overlapping window.
+     */
     private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
-    /** The maximum size of the chunk, before the overlapping window, even if we
-     * couldn't find whitespace to break at. */
+    /**
+     * The maximum size of the chunk, before the overlapping window, even if we
+     * couldn't find whitespace to break at.
+     */
     private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
-    /** The amount of text we will read through before we give up on finding
-     * whitespace to break the chunk/window at. */
+    /**
+     * The amount of text we will read through before we give up on finding
+     * whitespace to break the chunk/window at.
+     */
     private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
-    /** The number of characters to read in one go from the Reader. */
+    /**
+     * The number of characters to read in one go from the Reader.
+     */
     private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
+
+    /**
+     * When toLowerCase() is called on a character, the lower cased output
+     * can be different in size than the original input. I have seen a single
+     * input character turn into 3 characters (and 5 bytes) after lowercasing.
+     * I could not find any info as to what is the upper limit of how much a
+     * character can "increase in size" during lower casing. I'm guestimating
+     * and setting that limit at 10 bytes.
+     */
+    private static final int MAX_CHAR_SIZE_INCREASE_IN_BYTES = 10; //bytes
 
     ////chunker state--------------------------------------------///
-    /** The Reader that this chunk reads from, and divides into chunks. It must
-     * be a buffered reader to ensure that mark/reset are supported. */
+    /**
+     * The Reader that this chunk reads from, and divides into chunks. It must
+     * be a buffered reader to ensure that mark/reset are supported.
+     */
     private final PushbackReader reader;
-    /** The local buffer of characters read from the Reader. */
+    /**
+     * The local buffer of characters read from the Reader.
+     */
     private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
 
-    /** the size in bytes of the chunk (so far). */
+    /**
+     * the size in bytes of the chunk (so far).
+     */
     private int chunkSizeBytes = 0;
-    /** Has the chunker reached the end of the Reader? If so, there are no more
-     * chunks, and the current chunk does not need a window. */
+    /**
+     * the size in bytes of the lowercased chunk (so far). Note that lowercasing
+     * in Java can change the size of the string so we need to make sure the
+     * lowercased string also fits in MAX_TOTAL_CHUNK_SIZE.
+     */
+    private int lowerCasedChunkSizeBytes = 0;
+    /**
+     * Has the chunker reached the end of the Reader? If so, there are no more
+     * chunks, and the current chunk does not need a window.
+     */
     private boolean endOfReaderReached = false;
-    /** Store any exception encountered reading from the Reader. */
+    /**
+     * Store any exception encountered reading from the Reader.
+     */
     private Exception ex;
 
     /**
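For context on the new MAX_CHAR_SIZE_INCREASE_IN_BYTES constant and the separate lowerCasedChunkSizeBytes counter above: String.toLowerCase() really can expand text. Below is a minimal standalone illustration (not part of this commit; the class name is made up for the demo) using U+0130, which lowercases to two characters and grows from 2 to 3 UTF-8 bytes.

import java.nio.charset.StandardCharsets;
import java.util.Locale;

// Standalone demo, not part of the commit: shows that lowercasing can grow
// a string in both chars and UTF-8 bytes, which is why the chunker now
// tracks the lowercased size separately.
public class LowerCaseSizeDemo {
    public static void main(String[] args) {
        String original = "\u0130";                           // 'İ': 1 char, 2 UTF-8 bytes
        String lowered = original.toLowerCase(Locale.ROOT);   // "i" + U+0307: 2 chars, 3 UTF-8 bytes
        // (Chunker calls the default-locale toLowerCase(); Locale.ROOT is used
        // here only to make the demo deterministic.)

        System.out.printf("original: %d chars, %d UTF-8 bytes%n",
                original.length(), original.getBytes(StandardCharsets.UTF_8).length);
        System.out.printf("lowered : %d chars, %d UTF-8 bytes%n",
                lowered.length(), lowered.getBytes(StandardCharsets.UTF_8).length);
    }
}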
@@ -140,7 +176,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
      * @param s The string to cleanup.
      *
      * @return A StringBuilder with the same content as s but where all invalid
      *         code * points have been replaced.
      */
     private static StringBuilder replaceInvalidUTF16(String s) {
         /* encode the string to UTF-16 which does the replcement, see
@@ -162,16 +198,18 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
         //reset state for the next chunk
 
         chunkSizeBytes = 0;
+        lowerCasedChunkSizeBytes = 0;
         int baseChunkSizeChars = 0;
         StringBuilder currentChunk = new StringBuilder();
         StringBuilder currentWindow = new StringBuilder();
+        StringBuilder lowerCasedChunk = new StringBuilder();
 
         try {
-            currentChunk.append(readBaseChunk());
+            readBaseChunk(currentChunk, lowerCasedChunk);
             baseChunkSizeChars = currentChunk.length(); //save the base chunk length
-            currentWindow.append(readWindow());
+            readWindow(currentWindow, lowerCasedChunk);
             //add the window text to the current chunk.
             currentChunk.append(currentWindow);
             if (endOfReaderReached) {
                 /* if we have reached the end of the content,we won't make
                  * another overlapping chunk, so the length of the base chunk
@@ -186,9 +224,9 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
              * and break any chunking loop in client code. */
             ex = ioEx;
         }
 
         //sanitize the text and return a Chunk object, that includes the base chunk length.
-        return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes);
+        return new Chunk(currentChunk, baseChunkSizeChars, lowerCasedChunk);
     }
 
     /**
@@ -196,14 +234,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
      *
      * @throws IOException if there is a problem reading from the reader.
      */
-    private StringBuilder readBaseChunk() throws IOException {
-        StringBuilder currentChunk = new StringBuilder();
+    private void readBaseChunk(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
         //read the chunk until the minimum base chunk size
-        readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk);
+        readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
 
         //keep reading until the maximum base chunk size or white space is reached.
-        readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk);
-        return currentChunk;
+        readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
     }
 
     /**
@@ -211,14 +247,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
      *
      * @throws IOException if there is a problem reading from the reader.
      */
-    private StringBuilder readWindow() throws IOException {
-        StringBuilder currentWindow = new StringBuilder();
+    private void readWindow(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
         //read the window, leaving some room to look for white space to break at.
-        readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow);
+        readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentChunk, lowerCasedChunk);
 
         //keep reading until the max chunk size, or until whitespace is reached.
-        readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow);
-        return currentWindow;
+        readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentChunk, lowerCasedChunk);
     }
 
     /**
@@ -229,10 +263,10 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
      *
      * @throws IOException
      */
-    private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
+    private void readHelper(int maxBytes, StringBuilder currentSegment, StringBuilder currentLowerCasedSegment) throws IOException {
         int charsRead = 0;
         //read chars up to maxBytes, or the end of the reader.
-        while ((chunkSizeBytes < maxBytes)
+        while ((chunkSizeBytes < maxBytes) && (lowerCasedChunkSizeBytes < maxBytes)
                 && (endOfReaderReached == false)) {
             charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
             if (-1 == charsRead) {
@@ -253,11 +287,19 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
             //get the length in utf8 bytes of the read chars
             int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
 
+            // lower case the string and get it's size. NOTE: lower casing can
+            // change the size of the string!
+            String lowerCasedSegment = chunkSegment.toString().toLowerCase();
+            int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
+
             //if it will not put us past maxBytes
-            if (chunkSizeBytes + segmentSize < maxBytes) {
+            if ((chunkSizeBytes + segmentSize < maxBytes) && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes)) {
                 //add it to the chunk
                 currentSegment.append(chunkSegment);
                 chunkSizeBytes += segmentSize;
+
+                currentLowerCasedSegment.append(lowerCasedSegment);
+                lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
             } else {
                 //unread it, and break out of read loop.
                 reader.unread(tempChunkBuf, 0, charsRead);
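The readHelper() change above gates each append on both running totals. A standalone sketch of that dual byte-budget pattern (hypothetical names and cap, not the commit's code) shows why checking only the original size is not enough: the lowercased copy can drift past the cap even while the original stays under it.

import java.nio.charset.StandardCharsets;
import java.util.Locale;

// Standalone sketch (hypothetical, not the commit's code) of the dual
// byte-budget check that readHelper() now performs for each segment.
public class DualBudgetSketch {
    private static final int MAX_BYTES = 32;     // tiny cap, for illustration only

    private static int utf8Len(String s) {
        return s.getBytes(StandardCharsets.UTF_8).length;
    }

    public static void main(String[] args) {
        StringBuilder chunk = new StringBuilder();
        StringBuilder lowerCasedChunk = new StringBuilder();
        int chunkBytes = 0;
        int lowerCasedBytes = 0;

        // "İ" is 2 UTF-8 bytes, but lowercases to "i" + combining dot (3 bytes),
        // so the two running totals drift apart as segments are appended.
        String segment = "\u0130\u0130\u0130\u0130";               // 8 bytes
        String lowerSegment = segment.toLowerCase(Locale.ROOT);    // 12 bytes

        while (chunkBytes + utf8Len(segment) < MAX_BYTES
                && lowerCasedBytes + utf8Len(lowerSegment) < MAX_BYTES) {
            chunk.append(segment);
            chunkBytes += utf8Len(segment);
            lowerCasedChunk.append(lowerSegment);
            lowerCasedBytes += utf8Len(lowerSegment);
        }

        // With only the first condition, the loop would run a third time and the
        // lowercased copy would reach 36 bytes, past the 32-byte cap.
        System.out.println(chunkBytes + " / " + lowerCasedBytes); // prints "16 / 24"
    }
}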
@@ -275,11 +317,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
      *
      * @throws IOException
      */
-    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException {
+    private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
         int charsRead = 0;
         boolean whitespaceFound = false;
         //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
-        while ((chunkSizeBytes < maxBytes)
+        while ((chunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
+                && (lowerCasedChunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
                 && (whitespaceFound == false)
                 && (endOfReaderReached == false)) {
             charsRead = reader.read(tempChunkBuf, 0, 1);
@@ -314,6 +357,12 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
                 //add read chars to the chunk and update the length.
                 currentChunk.append(sanitizedChunkSegment);
                 chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
+
+                // lower case the string and get it's size. NOTE: lower casing can
+                // change the size of the string.
+                String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();
+                lowerCasedChunk.append(lowerCasedSegment);
+                lowerCasedChunkSizeBytes += lowerCasedSegment.getBytes(UTF_8).length;
             }
         }
     }
@@ -326,16 +375,16 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
 
         private final StringBuilder sb;
         private final int baseChunkSizeChars;
-        private final int chunkSizeBytes;
+        private final StringBuilder lowerCasedChunk;
 
-        Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) {
+        Chunk(StringBuilder sb, int baseChunkSizeChars, StringBuilder lowerCasedChunk) {
             this.sb = sb;
             this.baseChunkSizeChars = baseChunkSizeChars;
-            this.chunkSizeBytes = chunkSizeBytes;
+            this.lowerCasedChunk = lowerCasedChunk;
         }
 
         /**
-         * Get the content of the chunk.
+         * Get the content of the original (non-lower cased) chunk.
          *
         * @return The content of the chunk.
         */
@@ -345,16 +394,16 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
         }
 
         /**
-         * Get the size in bytes of the utf-8 encoding of the entire chunk.
+         * Get the content of the lower cased chunk.
          *
-         * @return the size in bytes of the utf-8 encoding of the entire chunk
+         * @return The content of the chunk.
         */
-        public int getChunkSizeBytes() {
-            return chunkSizeBytes;
+        public String geLowerCasedChunk() {
+            return lowerCasedChunk.toString();
         }
 
         /**
-         * Get the length of the base chunk in java chars.
+         * Get the length of the original (non-lower cased) base chunk in java chars.
          *
         * @return the length of the base chunk in java chars.
         */
@@ -97,7 +97,7 @@ class Ingester {
      * file, but the Solr server is probably fine.
      */
     void indexMetaDataOnly(AbstractFile file) throws IngesterException {
-        indexChunk("", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
+        indexChunk("", "", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
     }
 
     /**
@@ -111,7 +111,7 @@ class Ingester {
      * artifact, but the Solr server is probably fine.
      */
     void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
-        indexChunk("", sourceName, new HashMap<>(getContentFields(artifact)));
+        indexChunk("", "", sourceName, new HashMap<>(getContentFields(artifact)));
     }
 
     /**
@@ -156,7 +156,7 @@ class Ingester {
                     logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
                     return false;
                 }
 
                 Chunk chunk = chunker.next();
                 Map<String, Object> fields = new HashMap<>(contentFields);
                 String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
@@ -166,7 +166,7 @@ class Ingester {
                 language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
                 try {
                     //add the chunk text to Solr index
-                    indexChunk(chunk.toString(), sourceName, fields);
+                    indexChunk(chunk.toString(), chunk.geLowerCasedChunk(), sourceName, fields);
                     // add mini chunk when there's a language specific field
                     if (chunker.hasNext() && language.isPresent()) {
                         languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
@@ -197,7 +197,7 @@ class Ingester {
                 fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
                 //"parent" docs don't have chunk_size
                 fields.remove(Server.Schema.CHUNK_SIZE.toString());
-                indexChunk(null, sourceName, fields);
+                indexChunk(null, null, sourceName, fields);
             }
         }
         return true;
@@ -211,12 +211,13 @@ class Ingester {
      * 4.0.0), see if possible to stream with UpdateRequestHandler
      *
      * @param chunk The chunk content as a string, or null for metadata only
+     * @param lowerCasedChunk The lower cased chunk content as a string, or null for metadata only
      * @param fields
      * @param size
      *
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
-    private void indexChunk(String chunk, String sourceName, Map<String, Object> fields) throws IngesterException {
+    private void indexChunk(String chunk, String lowerCasedChunk, String sourceName, Map<String, Object> fields) throws IngesterException {
         if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
             //JMTODO: actually if the we couldn't get the image id it is set to -1,
             // but does this really mean we don't want to index it?
@@ -245,7 +246,7 @@ class Ingester {
         // insensitive substring/regular expression search.
         double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
         if (indexSchemaVersion >= 2.1) {
-            updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : chunk.toLowerCase()));
+            updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : lowerCasedChunk));
         }
 
         TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");