Generalize text extractors so that keyword search supports multiple extractors, ordered from most to least specific.
Integrate the HTML text extractor into keyword search.
commit 27e04f16d1 (parent ca87852431)
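In effect, the commit replaces per-file extractor construction with a fixed, ordered list of reusable extractor instances that are probed in order. A condensed sketch of the pattern (class and method names come from the diff below; the wrapper class and the omission of Solr/ingest plumbing are illustrative assumptions, not the actual implementation — the real loop appears in KeywordSearchIngestService.extractIndex() further down):

    // Condensed sketch of the selection pattern this commit introduces.
    // Assumes the same package as AbstractFileExtract and friends; the
    // wrapper class is hypothetical, error handling and Solr plumbing omitted.
    import java.util.ArrayList;
    import java.util.List;

    class ExtractorSelectionSketch {

        private final List<AbstractFileExtract> textExtractors = new ArrayList<AbstractFileExtract>();

        ExtractorSelectionSketch() {
            // Order matters: more specific extractors come first.
            textExtractors.add(new AbstractFileHtmlExtract());     // htm/html/xhtml/css/js only
            textExtractors.add(new AbstractFileTikaTextExtract()); // broad Tika-supported set
        }

        boolean extractAndIndex(AbstractFile aFile) throws Ingester.IngesterException {
            // Probe in order and take the first (most specific) extractor that
            // claims the file; extractors are reused, the file is passed per call.
            for (AbstractFileExtract fe : textExtractors) {
                if (fe.isSupported(aFile)) {
                    return fe.index(aFile);
                }
            }
            throw new Ingester.IngesterException(
                    "No supported text extractor for file: " + aFile.getName());
        }
    }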
AbstractFileExtract.java:

@@ -41,8 +41,24 @@ interface AbstractFileExtract {
     /**
      * Index the Abstract File
+     * @param sourceFile file to index
      * @return true if indexed successfully, false otherwise
      * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
      */
-    boolean index() throws Ingester.IngesterException;
+    boolean index(AbstractFile sourceFile) throws Ingester.IngesterException;
 
+    /**
+     * Determines if the extractor works only for specified types
+     * is supportedTypes() or whether is a generic content extractor (such as string extractor)
+     * @return
+     */
+    boolean isContentTypeSpecific();
+
+    /**
+     * Determines if the file content is supported by the extractor,
+     * if isContentTypeSpecific() returns true.
+     * @param file to test if its content should be supported
+     * @return true if the file content is supported, false otherwise
+     */
+    boolean isSupported(AbstractFile file);
 }
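To make the revised contract concrete, here is a minimal hypothetical implementer (illustrative only: the real implementations below also track chunking state and expose a getSourceFile() accessor that this hunk does not show):

    // Hypothetical minimal implementer of the revised interface, for
    // illustration only; not part of the commit.
    class PlainTextFileExtract implements AbstractFileExtract {

        private static final String SUPPORTED_EXTENSION = ".txt";

        @Override
        public boolean isContentTypeSpecific() {
            return true; // handles specific extensions, unlike the string extractor
        }

        @Override
        public boolean isSupported(AbstractFile file) {
            return file.getName().toLowerCase().endsWith(SUPPORTED_EXTENSION);
        }

        @Override
        public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
            // a real extractor reads sourceFile, chunks the text, and hands
            // each chunk to the Ingester; omitted in this sketch
            return true;
        }
    }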
AbstractFileHtmlExtract.java:

@@ -46,12 +46,11 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
     private AbstractFile sourceFile;
     private int numChunks = 0;
     private static final String UTF16BOM = "\uFEFF";
+    private static final String [] SUPPORTED_EXTENSIONS = {"htm", "html", "xhtml", "css", "js"};
 
-    AbstractFileHtmlExtract(AbstractFile sourceFile) {
-        this.sourceFile = sourceFile;
+    AbstractFileHtmlExtract() {
         this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
     }
 
     @Override
@@ -65,9 +64,13 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
     }
 
     @Override
-    public boolean index() throws IngesterException {
+    public boolean index(AbstractFile sourceFile) throws IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
+
         boolean success = false;
         Reader reader = null;
 
         final InputStream stream = new ReadContentInputStream(sourceFile);
 
         try {
@@ -173,4 +176,20 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
         return success;
     }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return true;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        String fileNameLower = file.getName().toLowerCase();
+        for (int i = 0; i< SUPPORTED_EXTENSIONS.length; ++i) {
+            if (fileNameLower.endsWith(SUPPORTED_EXTENSIONS[i])) {
+                return true;
+            }
+        }
+        return false;
+    }
 }
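One design note on the extension-based isSupported() checks in the HTML extractor above and the Tika extractor further below: the match is a bare endsWith() against extension strings that carry no leading dot, so a name like "report-html" would also match the "html" extension. A stricter hypothetical variant (not part of the commit) would anchor the suffix on the dot:

    // Hypothetical stricter extension check (not in this commit): anchoring on
    // the dot means "report-html" no longer matches the "html" extension.
    static boolean hasSupportedExtension(String fileName, String[] supportedExtensions) {
        final String nameLower = fileName.toLowerCase();
        for (String ext : supportedExtensions) {
            if (nameLower.endsWith("." + ext)) {
                return true;
            }
        }
        return false;
    }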
AbstractFileStringExtract.java:

@@ -40,24 +40,27 @@ class AbstractFileStringExtract implements AbstractFileExtract {
     private int numChunks;
     private static final Logger logger = Logger.getLogger(AbstractFileStringExtract.class.getName());
     static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
-    private AbstractFile aFile;
+    private AbstractFile sourceFile;
     //single static buffer for all extractions. Safe, indexing can only happen in one thread
     private static final byte[] STRING_CHUNK_BUF = new byte[(int) MAX_STRING_CHUNK_SIZE];
     private static final int BOM_LEN = 3;
+    //private static final StringExtract se = new StringExtract();
 
     static {
         //prepend UTF-8 BOM to start of the buffer
         STRING_CHUNK_BUF[0] = (byte) 0xEF;
         STRING_CHUNK_BUF[1] = (byte) 0xBB;
         STRING_CHUNK_BUF[2] = (byte) 0xBF;
+
+        //se.init();
     }
 
-    public AbstractFileStringExtract(AbstractFile aFile) {
-        this.aFile = aFile;
-        numChunks = 0; //unknown until indexing is done
+    public AbstractFileStringExtract() {
         this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
     }
 
     @Override
@@ -67,15 +70,17 @@ class AbstractFileStringExtract implements AbstractFileExtract {
 
     @Override
     public AbstractFile getSourceFile() {
-        return aFile;
+        return sourceFile;
     }
 
     @Override
-    public boolean index() throws IngesterException {
+    public boolean index(AbstractFile sourceFile) throws IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
         boolean success = false;
 
         //construct stream that extracts text as we read it
-        final InputStream stringStream = new AbstractFileStringStream(aFile, ByteContentStream.Encoding.UTF8);
+        final InputStream stringStream = new AbstractFileStringStream(sourceFile, ByteContentStream.Encoding.UTF8);
 
         try {
             success = true;
@@ -93,7 +98,7 @@ class AbstractFileStringExtract implements AbstractFileExtract {
                 ++this.numChunks;
             } catch (IngesterException ingEx) {
                 success = false;
-                logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ingEx);
+                logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);
                 throw ingEx; //need to rethrow/return to signal error and move on
             }
 
@@ -109,19 +114,31 @@ class AbstractFileStringExtract implements AbstractFileExtract {
             ingester.ingest(this);
 
         } catch (IOException ex) {
-            logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + aFile.getName(), ex);
+            logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex);
             success = false;
         } finally {
             try {
                 stringStream.close();
             } catch (IOException ex) {
-                logger.log(Level.WARNING, "Error closing input stream stream, file: " + aFile.getName(), ex);
+                logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex);
             }
         }
 
         return success;
     }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return false;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        return true;
+    }
 }
AbstractFileTikaTextExtract.java:

@@ -42,11 +42,12 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 
 /**
  * Extractor of text from TIKA supported AbstractFile content. Extracted text is
- * divided into chunks and indexed with Solr.
- * Protects against Tika parser hangs (for unexpected/corrupt content) using a timeout mechanism.
- * If Tika extraction succeeds, chunks are indexed with Solr.
+ * divided into chunks and indexed with Solr. Protects against Tika parser hangs
+ * (for unexpected/corrupt content) using a timeout mechanism. If Tika
+ * extraction succeeds, chunks are indexed with Solr.
  *
- * This Tika extraction/chunking utility is useful for large files of Tika parsers-supported content type.
+ * This Tika extraction/chunking utility is useful for large files of Tika
+ * parsers-supported content type.
  *
  */
 public class AbstractFileTikaTextExtract implements AbstractFileExtract {
@@ -58,19 +59,24 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
     private static final int SINGLE_READ_CHARS = 1024;
     private static final int EXTRA_CHARS = 128; //for whitespace
     private static final char[] TEXT_CHUNK_BUF = new char[MAX_EXTR_TEXT_CHARS];
-    private static final Tika tika = new Tika();
+    private Tika tika;
     private KeywordSearchIngestService service;
-    private Ingester ingester;
-    private AbstractFile sourceFile;
+    private static Ingester ingester;
+    private AbstractFile sourceFile; //currently processed file
     private int numChunks = 0;
     private static final String UTF16BOM = "\uFEFF";
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
+    // TODO: use a more robust method than checking file extension
+    // supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
+    static final String[] SUPPORTED_EXTENSIONS = {"tar", "jar", "zip", "gzip", "bzip2",
+        "gz", "tgz", "odf", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt", "log", "manifest",
+        "bmp", "gif", "png", "jpeg", "jpg", "tiff", "mp3", "aiff", "au", "midi", "wav",
+        "pst", "xml", "class", "dwg", "eml", "emlx", "mbox", "mht"};
 
-    AbstractFileTikaTextExtract(AbstractFile sourceFile) {
-        this.sourceFile = sourceFile;
+    AbstractFileTikaTextExtract() {
         this.service = KeywordSearchIngestService.getDefault();
-        Server solrServer = KeywordSearch.getServer();
-        ingester = solrServer.getIngester();
+        ingester = Server.getIngester();
+        tika = new Tika();
         //tika.setMaxStringLength(MAX_EXTR_TEXT_CHARS); //for getting back string only
     }
 
@@ -85,7 +91,10 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
     }
 
     @Override
-    public boolean index() throws Ingester.IngesterException {
+    public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
+        this.sourceFile = sourceFile;
+        this.numChunks = 0; //unknown until indexing is done
+
         boolean success = false;
         Reader reader = null;
 
@@ -94,30 +103,30 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
         try {
             Metadata meta = new Metadata();
             /* Tika parse request with timeout -- disabled for now
             ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
             final Future<?> future = tikaParseExecutor.submit(parseTask);
-            try {
-                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
-            } catch (TimeoutException te) {
-                final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
-                logger.log(Level.WARNING, msg);
-                throw new IngesterException(msg);
-            }
-            catch (Exception ex) {
-                final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
-                logger.log(Level.WARNING, msg, ex);
-                throw new IngesterException(msg);
-            }
-
-            reader = parseTask.getReader();
-            */
             try {
+                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
+            } catch (TimeoutException te) {
+                final String msg = "Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
+                logger.log(Level.WARNING, msg);
+                throw new IngesterException(msg);
+            }
+            catch (Exception ex) {
+                final String msg = "Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
+                logger.log(Level.WARNING, msg, ex);
+                throw new IngesterException(msg);
+            }
+
+            reader = parseTask.getReader();
+            */
+            try {
                 reader = tika.parse(stream, meta);
             } catch (IOException ex) {
                 logger.log(Level.WARNING, "Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex);
                 reader = null;
             }
 
             if (reader == null) {
                 //likely due to exception in parse()
                 logger.log(Level.WARNING, "No reader available from Tika parse");
@@ -230,8 +239,25 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
         return success;
     }
 
+    @Override
+    public boolean isContentTypeSpecific() {
+        return true;
+    }
+
+    @Override
+    public boolean isSupported(AbstractFile file) {
+        String fileNameLower = file.getName().toLowerCase();
+        for (int i = 0; i < SUPPORTED_EXTENSIONS.length; ++i) {
+            if (fileNameLower.endsWith(SUPPORTED_EXTENSIONS[i])) {
+                return true;
+            }
+        }
+        return false;
+    }
 
     /**
-     * Runnable and timeable task that calls tika to parse the content using streaming
+     * Runnable and timeable task that calls tika to parse the content using
+     * streaming
      */
     private static class ParseRequestTask implements Runnable {
 
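The timeout protection described in the class javadoc is currently disabled (the ParseRequestTask block above is commented out), but the underlying mechanism is the standard Future-with-timeout idiom from java.util.concurrent. A self-contained sketch of that idiom, with a generic Callable standing in for the Tika parse task:

    // Self-contained sketch of the Future-based timeout idiom used by the
    // disabled ParseRequestTask block above; the task and exception type here
    // are placeholders, not the project's real ones.
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.TimeoutException;

    class ParseTimeoutSketch {

        private final ExecutorService executor = Executors.newSingleThreadExecutor();

        // Run a potentially hanging task, but give up after timeoutSeconds.
        <T> T runWithTimeout(Callable<T> task, long timeoutSeconds) throws Exception {
            final Future<T> future = executor.submit(task);
            try {
                return future.get(timeoutSeconds, TimeUnit.SECONDS);
            } catch (TimeoutException te) {
                future.cancel(true); // interrupt the hung worker thread
                throw new Exception("task timed out after " + timeoutSeconds + " s", te);
            }
        }
    }

In the disabled block, the submitted task is the ParseRequestTask and the timeout scales with file size via Ingester.getTimeout(sourceFile.getSize()).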
Ingester.java:

@@ -65,12 +65,6 @@ public class Ingester {
     private final ExecutorService upRequestExecutor = Executors.newSingleThreadExecutor();
     private final Server solrServer = KeywordSearch.getServer();
     private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV();
-    // TODO: use a more robust method than checking file extension
-    // supported extensions list from http://www.lucidimagination.com/devzone/technical-articles/content-extraction-tika
-    static final String[] ingestibleExtensions = {"tar", "jar", "zip", "gzip", "bzip2",
-        "gz", "tgz", "odf", "doc", "xls", "ppt", "rtf", "pdf", "html", "htm", "xhtml", "txt", "log", "manifest",
-        "bmp", "gif", "png", "jpeg", "jpg", "tiff", "mp3", "aiff", "au", "midi", "wav",
-        "pst", "xml", "class", "dwg", "eml", "emlx", "mbox", "mht"};
 
     private static Ingester instance;
@@ -452,29 +446,4 @@ public class Ingester {
         }
     }
 
-    /**
-     * Determine if the file content is ingestible/indexable by keyword search
-     * Ingestible abstract file is either a directory, or an allocated file with supported extensions.
-     * Note: currently only checks by extension and abstract type, it does not check actual file content.
-     * @param aFile
-     * @return true if it is ingestible, false otherwise
-     */
-    static boolean isIngestible(AbstractFile aFile) {
-        TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
-        if (! aType.equals(TSK_DB_FILES_TYPE_ENUM.FS) ) {
-            return false;
-        }
-
-        FsContent fsContent = (FsContent) aFile;
-
-        boolean isIngestible = false;
-        final String fileName = fsContent.getName();
-        for (final String ext : ingestibleExtensions) {
-            if (fileName.toLowerCase().endsWith(ext)) {
-                isIngestible = true;
-                break;
-            }
-        }
-        return isIngestible;
-    }
 }
KeywordSearchIngestService.java:

@@ -95,6 +95,8 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
     private SleuthkitCase caseHandle = null;
     private boolean skipKnown = true;
     private boolean initialized = false;
+    private List<AbstractFileExtract> textExtractors;
+    private AbstractFileStringExtract stringExtractor;
 
     private enum IngestStatus {
 
@@ -256,9 +258,15 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
 
         this.managerProxy = managerProxy;
 
-        Server solrServer = KeywordSearch.getServer();
+        ingester = Server.getIngester();
+
+        //initialize extractors
+        stringExtractor = new AbstractFileStringExtract();
+        textExtractors = new ArrayList<AbstractFileExtract>();
+        //order matters, more specific extractors first
+        textExtractors.add(new AbstractFileHtmlExtract());
+        textExtractors.add(new AbstractFileTikaTextExtract());
 
-        ingester = solrServer.getIngester();
 
         ingestStatus = new HashMap<Long, IngestStatus>();
 
@@ -505,21 +513,44 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
      *
      * @param aFile file to extract strings from, divide into chunks and
      * index
-     * @param stringsOnly true if use stinrg extraction, false if use Tika
-     * text extractor
+     * @param stringsOnly true if use string extraction, false if to use a
+     * content-type specific text extractor
      * @return true if the file was indexed, false otherwise
+     * @throws IngesterException exception thrown if indexing failed
      */
     private boolean extractIndex(AbstractFile aFile, boolean stringsOnly) throws IngesterException {
-        AbstractFileExtract fileExtract;
+        AbstractFileExtract fileExtract = null;
 
         if (stringsOnly) {
-            fileExtract = new AbstractFileStringExtract(aFile);
+            fileExtract = stringExtractor;
         } else {
-            fileExtract = new AbstractFileTikaTextExtract(aFile);
+            //go over available text extractors and pick the first one (most specific one)
+            for (AbstractFileExtract fe : textExtractors) {
+                if (fe.isSupported(aFile)) {
+                    fileExtract = fe;
+                    break;
+                }
+            }
         }
 
+        if (fileExtract == null) {
+            throw new IngesterException("No supported file extractor found for file: " + aFile.getId() + " " + aFile.getName());
+        }
+
+        //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
+
         //divide into chunks and index
-        return fileExtract.index();
+        return fileExtract.index(aFile);
+    }
+
+    private boolean isTextExtractSupported(AbstractFile aFile) {
+        for (AbstractFileExtract extractor : textExtractors) {
+            if (extractor.isContentTypeSpecific() == true
+                    && extractor.isSupported(aFile)) {
+                return true;
+            }
+        }
+        return false;
     }
 
     private void indexFile(AbstractFile aFile, boolean indexContent) {
@@ -547,11 +578,10 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
             return;
         }
 
-        boolean ingestibleFile = Ingester.isIngestible(aFile);
-
-        if (fsContent != null && ingestibleFile == true) {
-            //we know it's an allocated fs file (FsContent) with supported content
-            //extract text with Tika, divide into chunks and index with Solr
+        boolean extractTextSupported = isTextExtractSupported(aFile);
+        if (fsContent != null && extractTextSupported) {
+            //we know it's an allocated FS file (since it's FsContent)
+            //extract text with one of the extractors, divide into chunks and index with Solr
             try {
                 //logger.log(Level.INFO, "indexing: " + fsContent.getName());
                 if (!extractIndex(aFile, false)) {
@@ -564,7 +594,6 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
 
             } else {
                 ingestStatus.put(aFile.getId(), IngestStatus.INGESTED);
-
             }
 
         } catch (IngesterException e) {
@@ -715,13 +744,13 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
 
         for (String termResult : queryResult.keySet()) {
             List<ContentHit> queryTermResults = queryResult.get(termResult);
 
             //translate to list of IDs that we keep track of
             List<Long> queryTermResultsIDs = new ArrayList<Long>();
             for (ContentHit ch : queryTermResults) {
                 queryTermResultsIDs.add(ch.getId());
             }
 
             Keyword termResultK = new Keyword(termResult, !isRegex);
             List<Long> curTermResults = currentResults.get(termResultK);
             if (curTermResults == null) {
@@ -938,7 +967,6 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFile
         }
     }
 
-
     /**
      * Set the skip known files setting on the service
      *
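Putting the last two hunks together: indexFile() now asks isTextExtractSupported() before choosing a path. The else branch falls outside this diff's context window, so the sketch below reconstructs the presumable flow (the stringsOnly fallback is inferred from extractIndex()'s parameter, not quoted from the commit):

    // Reconstructed sketch of the decision in indexFile(); the fallback branch
    // is an assumption based on extractIndex()'s stringsOnly parameter.
    private void indexFileSketch(AbstractFile aFile, FsContent fsContent) throws IngesterException {
        if (fsContent != null && isTextExtractSupported(aFile)) {
            // a content-type-specific extractor (HTML first, then Tika) claims
            // the file: extract real text, divide into chunks, index with Solr
            extractIndex(aFile, false);
        } else {
            // presumed fallback: raw string extraction, whose isSupported()
            // accepts any content
            extractIndex(aFile, true);
        }
    }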