mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-12 07:56:16 +00:00
first pass at overlapping chunks
This commit is contained in:
parent
d14c15fbdb
commit
94e136b451
@ -526,6 +526,7 @@
|
|||||||
<!-- file chunk-specific fields (optional for others) -->
|
<!-- file chunk-specific fields (optional for others) -->
|
||||||
<!-- for a parent file with no content, number of chunks are specified -->
|
<!-- for a parent file with no content, number of chunks are specified -->
|
||||||
<field name="num_chunks" type="int" indexed="true" stored="true" required="false" />
|
<field name="num_chunks" type="int" indexed="true" stored="true" required="false" />
|
||||||
|
<field name="chunk_size" type="int" indexed="true" stored="true" required="false" />
|
||||||
|
|
||||||
<!-- Common metadata fields, named specifically to match up with
|
<!-- Common metadata fields, named specifically to match up with
|
||||||
SolrCell metadata when parsing rich documents such as Word, PDF.
|
SolrCell metadata when parsing rich documents such as Word, PDF.
|
||||||
|
@ -58,7 +58,6 @@ class Ingester {
|
|||||||
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
|
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
|
||||||
private static Ingester instance;
|
private static Ingester instance;
|
||||||
|
|
||||||
|
|
||||||
private Ingester() {
|
private Ingester() {
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -160,12 +159,13 @@ class Ingester {
|
|||||||
|
|
||||||
Chunker chunker = new Chunker(reader);
|
Chunker chunker = new Chunker(reader);
|
||||||
|
|
||||||
for (Chunk chunk : chunker) {
|
for (StringBuilder chunk : chunker) {
|
||||||
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
|
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
|
||||||
fields.put(Server.Schema.ID.toString(), chunkId);
|
fields.put(Server.Schema.ID.toString(), chunkId);
|
||||||
|
fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.length()));
|
||||||
try {
|
try {
|
||||||
//add the chunk text to Solr index
|
//add the chunk text to Solr index
|
||||||
indexChunk(chunk.getText().toString(), sourceName, fields);
|
indexChunk(chunk.toString(), sourceName, fields);
|
||||||
numChunks++;
|
numChunks++;
|
||||||
} catch (Ingester.IngesterException ingEx) {
|
} catch (Ingester.IngesterException ingEx) {
|
||||||
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
|
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
|
||||||
@ -370,41 +370,25 @@ class Ingester {
|
|||||||
super(message);
|
super(message);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
class Chunk {
|
|
||||||
private final StringBuilder sb;
|
|
||||||
private final int chunksize;
|
|
||||||
|
|
||||||
Chunk(StringBuilder sb, int chunksize) {
|
|
||||||
this.sb = sb;
|
|
||||||
this.chunksize = chunksize;
|
|
||||||
}
|
|
||||||
|
|
||||||
StringBuilder getText() {
|
|
||||||
return sb;
|
|
||||||
}
|
|
||||||
|
|
||||||
int getSize() {
|
|
||||||
return chunksize;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Encapsulates the content chunking algorithm in implementation of the Iterator
|
* Encapsulates the content chunking algorithm in implementation of the Iterator
|
||||||
* interface.
|
* interface.
|
||||||
*/
|
*/
|
||||||
class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
class Chunker implements Iterator<StringBuilder>, Iterable<StringBuilder> {
|
||||||
|
|
||||||
private static final int INITIAL_CHUNK_SIZE = 32 * 1024; //bytes
|
private static final int INITIAL_CHUNK_SIZE = 30 * 1024; //bytes
|
||||||
|
private static final int WHITE_SPACE_BUFFER_SIZE = 900; //bytes
|
||||||
|
private static final int MAX_WINDOW_SIZE = 1024; //bytes
|
||||||
private static final int SINGLE_READ_CHARS = 1024;
|
private static final int SINGLE_READ_CHARS = 1024;
|
||||||
|
|
||||||
private int chunkSizeBytes = 0; // the size in bytes of chunk (so far)
|
private int windowSizeBytes = 0;
|
||||||
|
private int chunkSizeBytes = 0; // the size in bytes of the base chunk (so far)
|
||||||
private int charsRead = 0; // number of chars read in the most recent read operation
|
private int charsRead = 0; // number of chars read in the most recent read operation
|
||||||
private boolean whitespace = false;
|
private boolean whitespace = false;
|
||||||
private char[] tempChunkBuf;
|
private char[] tempChunkBuf;
|
||||||
private StringBuilder chunkText;
|
private StringBuilder chunk;
|
||||||
private boolean endOfContent = false;
|
private boolean endOfContent = false;
|
||||||
private final Reader reader;
|
private final Reader reader;
|
||||||
|
|
||||||
@ -418,7 +402,7 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<Chunk> iterator() {
|
public Iterator<StringBuilder> iterator() {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -433,64 +417,6 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
|||||||
return endOfContent == false;
|
return endOfContent == false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Chunk next() {
|
|
||||||
if (hasNext()) {
|
|
||||||
chunkText = new StringBuilder();
|
|
||||||
tempChunkBuf = new char[SINGLE_READ_CHARS];
|
|
||||||
chunkSizeBytes = 0;
|
|
||||||
//read chars up to initial chunk size
|
|
||||||
while (chunkSizeBytes < INITIAL_CHUNK_SIZE && endOfContent == false) {
|
|
||||||
try {
|
|
||||||
charsRead = reader.read(tempChunkBuf, 0, SINGLE_READ_CHARS);
|
|
||||||
} catch (IOException ex) {
|
|
||||||
throw new RuntimeException("IOException while attempting to read chunk.", ex);
|
|
||||||
}
|
|
||||||
if (-1 == charsRead) {
|
|
||||||
//this is the last chunk
|
|
||||||
endOfContent = true;
|
|
||||||
} else {
|
|
||||||
String chunkSegment = new String(tempChunkBuf, 0, charsRead);
|
|
||||||
chunkSizeBytes += Utf8.encodedLength(chunkSegment);
|
|
||||||
chunkText.append(chunkSegment);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
if (false == endOfContent) {
|
|
||||||
endOfContent = readChunkUntilWhiteSpace();
|
|
||||||
}
|
|
||||||
return new Chunk(sanitizeToUTF8(chunkText), chunkSizeBytes);
|
|
||||||
} else {
|
|
||||||
throw new NoSuchElementException("There are no more chunks.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean readChunkUntilWhiteSpace() {
|
|
||||||
charsRead = 0;
|
|
||||||
whitespace = false;
|
|
||||||
//if we haven't reached the end of the file,
|
|
||||||
//try to read char-by-char until whitespace to not break words
|
|
||||||
while ((chunkSizeBytes < INITIAL_CHUNK_SIZE)
|
|
||||||
&& (false == whitespace)) {
|
|
||||||
try {
|
|
||||||
charsRead = reader.read(tempChunkBuf, 0, 1);
|
|
||||||
} catch (IOException ex) {
|
|
||||||
throw new RuntimeException("IOException while attempting to read chunk until whitespace.", ex);
|
|
||||||
}
|
|
||||||
if (-1 == charsRead) {
|
|
||||||
//this is the last chunk
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
whitespace = Character.isWhitespace(tempChunkBuf[0]);
|
|
||||||
String chunkSegment = new String(tempChunkBuf, 0, 1);
|
|
||||||
chunkSizeBytes += Utf8.encodedLength(chunkSegment);
|
|
||||||
chunkText.append(chunkSegment);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sanitize the given StringBuilder by replacing non-UTF-8 characters with
|
* Sanitize the given StringBuilder by replacing non-UTF-8 characters with
|
||||||
* caret '^'
|
* caret '^'
|
||||||
@ -512,4 +438,121 @@ class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
|
|||||||
}
|
}
|
||||||
return sb;
|
return sb;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public StringBuilder next() {
|
||||||
|
if (hasNext()) {
|
||||||
|
chunk = new StringBuilder();
|
||||||
|
tempChunkBuf = new char[SINGLE_READ_CHARS];
|
||||||
|
chunkSizeBytes = 0;
|
||||||
|
windowSizeBytes = 0;
|
||||||
|
//read chars up to initial chunk size
|
||||||
|
while (chunkSizeBytes < INITIAL_CHUNK_SIZE && endOfContent == false) {
|
||||||
|
try {
|
||||||
|
charsRead = reader.read(tempChunkBuf, 0, SINGLE_READ_CHARS);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
throw new RuntimeException("IOException while attempting to read chunk.", ex);
|
||||||
|
}
|
||||||
|
if (-1 == charsRead) {
|
||||||
|
//this is the last chunk
|
||||||
|
endOfContent = true;
|
||||||
|
} else {
|
||||||
|
String chunkSegment = new String(tempChunkBuf, 0, charsRead);
|
||||||
|
chunkSizeBytes += Utf8.encodedLength(chunkSegment);
|
||||||
|
chunk.append(chunkSegment);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (false == endOfContent) {
|
||||||
|
try {
|
||||||
|
endOfContent = readChunkUntilWhiteSpace();
|
||||||
|
} catch (IOException ex) {
|
||||||
|
throw new RuntimeException("IOException while attempting to read chunk to white space.", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sanitizeToUTF8(chunk);
|
||||||
|
} else {
|
||||||
|
throw new NoSuchElementException("There are no more chunks.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean readWindow() throws IOException {
|
||||||
|
tempChunkBuf = new char[MAX_WINDOW_SIZE];
|
||||||
|
charsRead = 0;
|
||||||
|
while (windowSizeBytes < MAX_WINDOW_SIZE) {
|
||||||
|
charsRead = reader.read(tempChunkBuf, 0, SINGLE_READ_CHARS);
|
||||||
|
if (-1 == charsRead) {
|
||||||
|
//this is the last chunk
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
String windowSegment = new String(tempChunkBuf, 0, charsRead);
|
||||||
|
windowSizeBytes += Utf8.encodedLength(windowSegment);
|
||||||
|
chunk.append(windowSegment);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return readWindowUntilWhiteSpace();
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean readChunkUntilWhiteSpace() throws IOException {
|
||||||
|
charsRead = 0;
|
||||||
|
whitespace = false;
|
||||||
|
//if we haven't reached the end of the file,
|
||||||
|
//try to read char-by-char until whitespace to not break words
|
||||||
|
while ((chunkSizeBytes < INITIAL_CHUNK_SIZE + WHITE_SPACE_BUFFER_SIZE)
|
||||||
|
&& (false == whitespace)) {
|
||||||
|
|
||||||
|
charsRead = reader.read(tempChunkBuf, 0, 1);
|
||||||
|
if (-1 == charsRead) {
|
||||||
|
//this is the last chunk
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
whitespace = Character.isWhitespace(tempChunkBuf[0]);
|
||||||
|
String chunkSegment = new String(tempChunkBuf, 0, 1);
|
||||||
|
chunkSizeBytes += Utf8.encodedLength(chunkSegment);
|
||||||
|
chunk.append(chunkSegment);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return readWindow();
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean readWindowUntilWhiteSpace() throws IOException {
|
||||||
|
tempChunkBuf = new char[1];
|
||||||
|
charsRead = 0;
|
||||||
|
whitespace = false;
|
||||||
|
//if we haven't reached the end of the file,
|
||||||
|
//try to read char-by-char until whitespace to not break words
|
||||||
|
while ((windowSizeBytes < MAX_WINDOW_SIZE)
|
||||||
|
&& (false == whitespace)) {
|
||||||
|
charsRead = reader.read(tempChunkBuf, 0, 1);
|
||||||
|
if (-1 == charsRead) {
|
||||||
|
//this is the last chunk
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
whitespace = Character.isWhitespace(tempChunkBuf[0]);
|
||||||
|
String windowSegment = new String(tempChunkBuf, 0, 1);
|
||||||
|
windowSizeBytes += Utf8.encodedLength(windowSegment);
|
||||||
|
chunk.append(windowSegment);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class Chunk {
|
||||||
|
|
||||||
|
private final StringBuilder sb;
|
||||||
|
private final int chunksize;
|
||||||
|
|
||||||
|
Chunk(StringBuilder sb, int chunksize) {
|
||||||
|
this.sb = sb;
|
||||||
|
this.chunksize = chunksize;
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuilder getText() {
|
||||||
|
return sb;
|
||||||
|
}
|
||||||
|
|
||||||
|
int getSize() {
|
||||||
|
return chunksize;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -47,7 +47,6 @@ import org.apache.solr.client.solrj.SolrServerException;
|
|||||||
import org.apache.solr.client.solrj.impl.HttpSolrServer;
|
import org.apache.solr.client.solrj.impl.HttpSolrServer;
|
||||||
import org.apache.solr.client.solrj.impl.XMLResponseParser;
|
import org.apache.solr.client.solrj.impl.XMLResponseParser;
|
||||||
import org.apache.solr.client.solrj.request.CoreAdminRequest;
|
import org.apache.solr.client.solrj.request.CoreAdminRequest;
|
||||||
import org.apache.solr.client.solrj.response.CoreAdminResponse;
|
|
||||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||||
import org.apache.solr.client.solrj.response.TermsResponse;
|
import org.apache.solr.client.solrj.response.TermsResponse;
|
||||||
import org.apache.solr.common.SolrDocument;
|
import org.apache.solr.common.SolrDocument;
|
||||||
@ -148,6 +147,11 @@ public class Server {
|
|||||||
public String toString() {
|
public String toString() {
|
||||||
return "num_chunks"; //NON-NLS
|
return "num_chunks"; //NON-NLS
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
CHUNK_SIZE {
|
||||||
|
public String toString() {
|
||||||
|
return "chunk_size"; //NON-NLS
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -731,7 +735,7 @@ public class Server {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NamedList<Object> request(SolrRequest request) throws SolrServerException, NoOpenCoreException {
|
NamedList<Object> request(SolrRequest<?> request) throws SolrServerException, NoOpenCoreException {
|
||||||
currentCoreLock.readLock().lock();
|
currentCoreLock.readLock().lock();
|
||||||
try {
|
try {
|
||||||
if (null == currentCore) {
|
if (null == currentCore) {
|
||||||
@ -1222,7 +1226,7 @@ public class Server {
|
|||||||
return solrCore.query(sq);
|
return solrCore.query(sq);
|
||||||
}
|
}
|
||||||
|
|
||||||
private NamedList<Object> request(SolrRequest request) throws SolrServerException {
|
private NamedList<Object> request(SolrRequest<?> request) throws SolrServerException {
|
||||||
try {
|
try {
|
||||||
return solrCore.request(request);
|
return solrCore.request(request);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user