mirror of https://github.com/overcuriousity/autopsy-flatpak.git, synced 2025-07-13 08:26:15 +00:00

commit 220946e240 (parent 0c6a6a9776)

- append and index meta-data to Tika extracted content
- attempt not to break words when creating chunks from Tika extracted text
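The second bullet is the behavioral change worth dwelling on: the extractor reads Tika's output into fixed-size chunks for indexing, and a chunk boundary that lands mid-word splits a keyword across two indexed chunks, making it unsearchable. The commit's fix is to leave some headroom during the bulk reads, then keep reading one character at a time until whitespace. Below is a minimal, self-contained sketch of that idea against a plain java.io.Reader; the class name, method name, and the tiny constants are illustrative for the demo, not the Autopsy API:

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;

    // Sketch of whitespace-aware chunking; constants are shrunk for the demo.
    public class WordBoundaryChunker {

        private static final int MAX_CHUNK_CHARS = 32;   // total chunk capacity
        private static final int SINGLE_READ_CHARS = 8;  // bulk read granularity
        private static final int EXTRA_CHARS = 8;        // headroom to finish a word

        /**
         * Reads up to MAX_CHUNK_CHARS from the reader, but after the bulk
         * reads keeps pulling one char at a time until whitespace (or EOF),
         * so the chunk does not end mid-word. Returns null at EOF.
         */
        static String readChunk(Reader reader) throws IOException {
            char[] buf = new char[MAX_CHUNK_CHARS];
            int total = 0;
            int read;
            // bulk reads, stopping with EXTRA_CHARS of headroom left
            while (total < MAX_CHUNK_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS
                    && (read = reader.read(buf, total, SINGLE_READ_CHARS)) != -1) {
                total += read;
            }
            // extend one char at a time until whitespace or the buffer limit
            while (total > 0 && total < MAX_CHUNK_CHARS
                    && !Character.isWhitespace(buf[total - 1])
                    && (read = reader.read(buf, total, 1)) != -1) {
                total += read;
            }
            return total == 0 ? null : new String(buf, 0, total);
        }

        public static void main(String[] args) throws IOException {
            Reader r = new StringReader("the quick brown fox jumps over the lazy dog again");
            String chunk;
            while ((chunk = readChunk(r)) != null) {
                System.out.println("[" + chunk + "]"); // chunks end on whitespace, not mid-word
            }
        }
    }

Note that the buffer limit still bounds the one-char extension loop, so a pathological stream with no whitespace terminates at the chunk capacity rather than growing without bound; the real code behaves the same way.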
AbstractFileTikaTextExtract.java

@@ -22,12 +22,16 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
 import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 import org.sleuthkit.autopsy.ingest.IngestServiceAbstractFile;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
 import org.sleuthkit.autopsy.keywordsearch.ByteContentStream.Encoding;
 
 /**

@@ -43,7 +47,9 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
     private static final Logger logger = Logger.getLogger(IngestServiceAbstractFile.class.getName());
     private static final Encoding ENCODING = Encoding.UTF8;
     static final Charset charset = Charset.forName(ENCODING.toString());
-    static final int MAX_EXTR_TEXT_CHARS = 1 * 1024 * 1024;
+    static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
+    private static final int SINGLE_READ_CHARS = 1024;
+    private static final int EXTRA_CHARS = 128; //for whitespace
     private static final char[] TEXT_CHUNK_BUF = new char[MAX_EXTR_TEXT_CHARS];
     private static final Tika tika = new Tika();
     private KeywordSearchIngestService service;
@@ -76,49 +82,70 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
         Reader reader = null;
         final InputStream stream = new ReadContentInputStream(sourceFile);
         try {
-            reader = tika.parse(stream);
+            Metadata meta = new Metadata();
+            reader = tika.parse(stream, meta);
             success = true;
             long readSize;
             long totalRead = 0;
-            //we read max 1024 chars at time, this is max what reader would return it seems
-            while ((readSize = reader.read(TEXT_CHUNK_BUF, 0, 1024)) != -1) {
+            boolean eof = false;
+            //we read max 1024 chars at time, this seems to max what this Reader would return
+            while (!eof && (readSize = reader.read(TEXT_CHUNK_BUF, 0, SINGLE_READ_CHARS)) != -1) {
                 totalRead += readSize;
 
-                //consume more bytes to fill entire chunk
-                while ((totalRead < MAX_EXTR_TEXT_CHARS - 1024)
-                        && (readSize = reader.read(TEXT_CHUNK_BUF, (int) totalRead, 1024)) != -1) {
+                //consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
+                while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
+                        && (readSize = reader.read(TEXT_CHUNK_BUF, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
                     totalRead += readSize;
                 }
+                if (readSize == -1) {
+                    //this is the last chunk
+                    eof = true;
+                } else {
+                    //try to read until whitespace to not break words
+                    while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
+                            && !Character.isWhitespace(TEXT_CHUNK_BUF[(int) totalRead - 1])
+                            && (readSize = reader.read(TEXT_CHUNK_BUF, (int) totalRead, 1)) != -1) {
+                        totalRead += readSize;
+                    }
+                    if (readSize == -1) {
+                        //this is the last chunk
+                        eof = true;
+                    }
+                }
 
                 //logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());
 
                 //encode to bytes to index as byte stream
                 String extracted;
-                if (totalRead < MAX_EXTR_TEXT_CHARS) {
                 //add BOM and trim the 0 bytes
                 StringBuilder sb = new StringBuilder((int) totalRead + 5);
                 //inject BOM here (saves byte buffer realloc later), will be converted to specific encoding BOM
                 sb.append(UTF16BOM);
-                sb.append(TEXT_CHUNK_BUF, 0, (int) readSize);
-                extracted = sb.toString();
-                } else {
-                    StringBuilder sb = new StringBuilder((int) totalRead + 5);
-                    //inject BOM here (saves byte buffer realloc later), will be converted to specific encoding BOM
-                    sb.append(UTF16BOM);
+                if (totalRead < MAX_EXTR_TEXT_CHARS) {
+                    sb.append(TEXT_CHUNK_BUF, 0, (int) totalRead);
+                } else {;
                     sb.append(TEXT_CHUNK_BUF);
-                    extracted = sb.toString();
                 }
+
+                //sort meta data keys
+                List<String> sortedKeyList = Arrays.asList(meta.names());
+                Collections.sort(sortedKeyList);
+
+                //append meta data
+                sb.append("\n\n-------------------METADATA------------------------------\n\n");
+                for (String key : sortedKeyList) {
+                    String value = meta.get(key);
+                    sb.append(key).append(": ").append(value).append("\n");
+                }
+                extracted = sb.toString();
 
                 //reset for next chunk
                 totalRead = 0;
 
                 //converts BOM automatically to charSet encoding
                 byte[] encodedBytes = extracted.getBytes(charset);
 
                 AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
 
                 try {
                     chunk.index(ingester, encodedBytes, encodedBytes.length, ENCODING);
                     ++this.numChunks;
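The metadata block added above is the heart of the first commit bullet: passing a Metadata object to tika.parse() collects document properties (author, dates, content-type details) as a side effect while the text is read, and the sorted key/value dump is appended to the chunk text so the metadata gets indexed and becomes keyword-searchable alongside the content. A standalone sketch of the same pattern follows; the file path "sample.pdf" and the banner string are placeholders, and only the documented Tika facade calls (parse(InputStream, Metadata), Metadata.names(), Metadata.get(String)) are used:

    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.Reader;
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;

    import org.apache.tika.Tika;
    import org.apache.tika.metadata.Metadata;

    // Parse a file with Tika, then append its sorted metadata to the text.
    public class TikaMetadataDemo {

        public static void main(String[] args) throws IOException {
            Tika tika = new Tika();
            Metadata meta = new Metadata();
            try (InputStream in = new FileInputStream("sample.pdf")) {
                // parse(stream, meta) returns the extracted text as a Reader
                // and populates 'meta' as the Reader is consumed
                Reader reader = tika.parse(in, meta);
                StringBuilder sb = new StringBuilder();
                char[] buf = new char[1024];
                int n;
                while ((n = reader.read(buf)) != -1) {
                    sb.append(buf, 0, n);
                }
                // sort keys so the metadata section is stable across runs
                List<String> keys = Arrays.asList(meta.names());
                Collections.sort(keys);
                sb.append("\n\n------------------- METADATA -------------------\n\n");
                for (String key : keys) {
                    sb.append(key).append(": ").append(meta.get(key)).append("\n");
                }
                System.out.println(sb);
            }
        }
    }

Sorting the keys is a small but deliberate choice: Metadata.names() gives no ordering guarantee, and a stable section makes the indexed chunks reproducible across runs of the same file.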
@@ -133,9 +160,8 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
                     //not to delay commit if timer has gone off
                     service.checkRunCommitSearch();
                 }
-
             } catch (IOException ex) {
-                logger.log(Level.WARNING, "Unable to read content stream from " + sourceFile.getId(), ex);
+                logger.log(Level.WARNING, "Unable to read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex);
                 success = false;
             } finally {
                 try {
@@ -156,6 +182,5 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
             ingester.ingest(this);
-
             return success;
 
         }
     }
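A note on the "converts BOM automatically to charSet encoding" comment retained in the hunk above: UTF16BOM is not defined in this excerpt, but assuming it is the U+FEFF byte-order-mark code point (a Java String is UTF-16 internally, hence the name), prepending it to the string and then calling getBytes(charset) emits the charset-specific BOM bytes at the front of the indexed byte stream. A quick demonstration:

    import java.nio.charset.StandardCharsets;

    // Prepend U+FEFF to a string; encoding then yields that charset's BOM.
    public class BomDemo {
        public static void main(String[] args) {
            String text = '\uFEFF' + "abc";
            byte[] utf8 = text.getBytes(StandardCharsets.UTF_8);
            // prints: EF BB BF 61 62 63  (UTF-8 BOM, then "abc")
            for (byte b : utf8) {
                System.out.printf("%02X ", b);
            }
            System.out.println();
        }
    }

This is why injecting the BOM into the StringBuilder "saves byte buffer realloc later": the encoder produces the correct BOM for whatever charset is configured, with no separate byte-level prefixing step.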
KeywordSearchIngestService.java

@@ -569,6 +569,8 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile {
                     }
 
                 } catch (IngesterException e) {
+                    logger.log(Level.INFO, "Could not extract text with Tika, " + fsContent.getId() + ", "
+                            + fsContent.getName(), e);
                     ingestStatus.put(fsContent.getId(), IngestStatus.SKIPPED);
                     //try to extract strings, if a file
                     if (fsContent.isFile() == true) {
@@ -576,6 +578,8 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile {
                     }
 
                 } catch (Exception e) {
+                    logger.log(Level.WARNING, "Error extracting text with Tika, " + fsContent.getId() + ", "
+                            + fsContent.getName(), e);
                     ingestStatus.put(fsContent.getId(), IngestStatus.SKIPPED);
                     //try to extract strings if a file
                     if (fsContent.isFile() == true) {