Mirror of https://github.com/overcuriousity/autopsy-flatpak.git (synced 2025-07-06 21:00:22 +00:00)

TSK-519 Add support for files of known filetypes > 100 MB (first take) - also fix thunderbird module deps so they work with keyword search module

This commit is contained in:
parent fe402d2019
commit 8f26cda926
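The hunks below replace whole-file indexing with chunked indexing: extracted text is read into a fixed ~1 MB buffer whose first three bytes are a UTF-8 BOM, and each filled buffer is sent to Solr as its own document. A minimal standalone sketch of that loop follows, with hypothetical class and variable names (the real implementations are the new AbstractFileStringExtract and AbstractFileTikaTextExtract classes in this diff):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

public class ChunkedIndexSketch {

    static final int MAX_CHUNK_SIZE = 1024 * 1024; // same 1 MB bound as the new extractors
    static final int BOM_LEN = 3;

    public static void main(String[] args) throws IOException {
        byte[] buf = new byte[MAX_CHUNK_SIZE];
        buf[0] = (byte) 0xEF; // UTF-8 BOM, written once and reused for every chunk
        buf[1] = (byte) 0xBB;
        buf[2] = (byte) 0xBF;

        // stand-in for the stream of extracted text read from a file
        InputStream in = new ByteArrayInputStream(new byte[5 * 1024 * 1024]);
        int chunkId = 0;
        int readSize;
        while ((readSize = in.read(buf, BOM_LEN, MAX_CHUNK_SIZE - BOM_LEN)) != -1) {
            ++chunkId;
            // in the real code this is ingester.ingest(chunk, byteContentStream)
            System.out.println("would index chunk " + chunkId + " of " + (readSize + BOM_LEN) + " bytes");
        }
    }
}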
@@ -16,6 +16,7 @@
         <dependency conf="solr-war->default" org="org.apache.solr" name="solr" rev="3.5.0" transitive="false" /> <!-- the war file -->
         <dependency conf="autopsy->*" org="org.apache.solr" name="solr-solrj" rev="3.5.0"/>
         <dependency conf="autopsy->*" org="commons-lang" name="commons-lang" rev="2.4"/>
+        <dependency conf="autopsy->*" org="org.apache.tika" name="tika-parsers" rev="0.10"/>
         <dependency conf="start-solr->default" org="org.mortbay.jetty" name="start" rev="6.1.26"/>
         <dependency conf="jetty-libs->default" org="org.mortbay.jetty" name="jetty" rev="6.1.26"/>
         <dependency conf="jetty-libs->default" org="org.mortbay.jetty" name="jsp-2.1" rev="6.1.14"/>
@@ -160,6 +160,10 @@
             <runtime-relative-path>ext/commons-httpclient-3.1.jar</runtime-relative-path>
             <binary-origin>release/modules/ext/commons-httpclient-3.1.jar</binary-origin>
         </class-path-extension>
+        <class-path-extension>
+            <runtime-relative-path>ext/tika-core-0.10.jar</runtime-relative-path>
+            <binary-origin>release/modules/ext/tika-core-0.10.jar</binary-origin>
+        </class-path-extension>
         <class-path-extension>
             <runtime-relative-path>ext/commons-codec-1.5.jar</runtime-relative-path>
             <binary-origin>release/modules/ext/commons-codec-1.5.jar</binary-origin>
@@ -168,6 +172,10 @@
             <runtime-relative-path>ext/commons-lang-2.4.jar</runtime-relative-path>
             <binary-origin>release/modules/ext/commons-lang-2.4.jar</binary-origin>
         </class-path-extension>
+        <class-path-extension>
+            <runtime-relative-path>ext/tika-parsers-0.10.jar</runtime-relative-path>
+            <binary-origin>release/modules/ext/tika-parsers-0.10.jar</binary-origin>
+        </class-path-extension>
         <class-path-extension>
             <runtime-relative-path>ext/jcl-over-slf4j-1.6.1.jar</runtime-relative-path>
             <binary-origin>release/modules/ext/jcl-over-slf4j-1.6.1.jar</binary-origin>
@@ -0,0 +1,66 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2012 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.sleuthkit.autopsy.keywordsearch;
+
+import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
+
+/**
+ * Represents each string chunk to be indexed, a derivative of AbstractFileExtract file
+ */
+class AbstractFileChunk {
+
+    private int chunkID;
+    private AbstractFileExtract parent;
+
+    AbstractFileChunk(AbstractFileExtract parent, int chunkID) {
+        this.parent = parent;
+        this.chunkID = chunkID;
+    }
+
+    public AbstractFileExtract getParent() {
+        return parent;
+    }
+
+    public int getChunkId() {
+        return chunkID;
+    }
+
+    /**
+     * return String representation of the absolute id (parent and child)
+     *
+     * @return
+     */
+    public String getIdString() {
+        return Server.getChunkIdString(this.parent.getSourceFile().getId(), this.chunkID);
+    }
+
+    public boolean index(Ingester ingester, byte[] content, long contentSize, ByteContentStream.Encoding encoding) throws IngesterException {
+        boolean success = true;
+        ByteContentStream bcs = new ByteContentStream(content, contentSize, parent.getSourceFile(), encoding);
+        try {
+            ingester.ingest(this, bcs);
+            //logger.log(Level.INFO, "Ingesting string chunk: " + this.getName() + ": " + chunkID);
+        } catch (Exception ingEx) {
+            success = false;
+            throw new IngesterException("Problem ingesting file string chunk: " + parent.getSourceFile().getId() + ", chunk: " + chunkID, ingEx);
+        }
+        return success;
+    }
+}
@@ -0,0 +1,48 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2012 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.sleuthkit.autopsy.keywordsearch;
+
+import org.sleuthkit.datamodel.AbstractFile;
+
+/**
+ * Common methods for utilities that extract text and content and divide into
+ * chunks
+ */
+interface AbstractFileExtract {
+
+    /**
+     * Get number of chunks resulted from extracting this AbstractFile
+     * @return the number of chunks produced
+     */
+    int getNumChunks();
+
+    /**
+     * Get the source file associated with this extraction
+     * @return the source AbstractFile
+     */
+    AbstractFile getSourceFile();
+
+    /**
+     * Index the Abstract File
+     * @return true if indexed successfully, false otherwise
+     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
+     */
+    boolean index() throws Ingester.IngesterException;
+}
@@ -24,8 +24,7 @@ import java.io.InputStreamReader;
 import java.io.Reader;
 import java.util.logging.Logger;
 import org.apache.solr.common.util.ContentStream;
-import org.sleuthkit.autopsy.datamodel.AbstractFileStringStream;
-import org.sleuthkit.autopsy.datamodel.AbstractFileStringStream.Encoding;
+import org.sleuthkit.autopsy.keywordsearch.ByteContentStream.Encoding;
 import org.sleuthkit.datamodel.AbstractContent;
 import org.sleuthkit.datamodel.AbstractFile;
 
@@ -43,7 +42,7 @@ public class AbstractFileStringContentStream implements ContentStream {
     private AbstractFileStringStream stream;
     private static Logger logger = Logger.getLogger(AbstractFileStringContentStream.class.getName());
 
-    public AbstractFileStringContentStream(AbstractFile content, Encoding encoding) {
+    public AbstractFileStringContentStream(AbstractFile content, ByteContentStream.Encoding encoding) {
         this.content = content;
         this.encoding = encoding;
         this.stream = new AbstractFileStringStream(content, encoding);
@@ -0,0 +1,128 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2011 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
+import org.sleuthkit.datamodel.AbstractFile;
+
+/**
+ * Takes an AbstractFile, extract strings, converts into chunks (associated with the original
+ * source file) up to 1MB then and indexes chunks as text with Solr
+ */
+class AbstractFileStringExtract implements AbstractFileExtract {
+
+    private KeywordSearchIngestService service;
+    private Ingester ingester;
+    private int numChunks;
+    private static final Logger logger = Logger.getLogger(AbstractFileStringExtract.class.getName());
+    static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
+    private AbstractFile aFile;
+    //single static buffer for all extractions. Safe, indexing can only happen in one thread
+    private static final byte[] STRING_CHUNK_BUF = new byte[(int) MAX_STRING_CHUNK_SIZE];
+    private static final int BOM_LEN = 3;
+
+    static {
+        //prepend UTF-8 BOM to start of the buffer
+        STRING_CHUNK_BUF[0] = (byte) 0xEF;
+        STRING_CHUNK_BUF[1] = (byte) 0xBB;
+        STRING_CHUNK_BUF[2] = (byte) 0xBF;
+    }
+
+    public AbstractFileStringExtract(AbstractFile aFile) {
+        this.aFile = aFile;
+        numChunks = 0; //unknown until indexing is done
+        this.service = KeywordSearchIngestService.getDefault();
+        Server solrServer = KeywordSearch.getServer();
+        ingester = solrServer.getIngester();
+    }
+
+    @Override
+    public int getNumChunks() {
+        return this.numChunks;
+    }
+
+    @Override
+    public AbstractFile getSourceFile() {
+        return aFile;
+    }
+
+    @Override
+    public boolean index() throws IngesterException {
+        boolean success = false;
+
+        //construct stream that extracts text as we read it
+        final InputStream stringStream = new AbstractFileStringStream(aFile, ByteContentStream.Encoding.UTF8);
+
+        try {
+            success = true;
+            //break input stream into chunks
+
+            long readSize = 0;
+            while ((readSize = stringStream.read(STRING_CHUNK_BUF, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
+                //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
+                //debug.write(STRING_CHUNK_BUF, 0, (int)readSize);
+
+                AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
+
+                try {
+                    chunk.index(ingester, STRING_CHUNK_BUF, readSize + BOM_LEN, ByteContentStream.Encoding.UTF8);
+                    ++this.numChunks;
+                } catch (IngesterException ingEx) {
+                    success = false;
+                    logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ingEx);
+                    throw ingEx; //need to rethrow/return to signal error and move on
+                }
+
+                //check if need invoke commit/search between chunks
+                //not to delay commit if timer has gone off
+                service.checkRunCommitSearch();
+
+                //debug.close();
+            }
+
+            //after all chunks, ingest the parent file without content itself, and store numChunks
+            ingester.ingest(this);
+
+        } catch (IOException ex) {
+            logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + aFile.getName(), ex);
+            success = false;
+        } finally {
+            try {
+                stringStream.close();
+            } catch (IOException ex) {
+                logger.log(Level.WARNING, "Error closing input stream stream, file: " + aFile.getName(), ex);
+            }
+        }
+
+        return success;
+    }
+}
@@ -16,13 +16,14 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.sleuthkit.autopsy.datamodel;
+package org.sleuthkit.autopsy.keywordsearch;
 
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
 import java.util.logging.Level;
 import java.util.logging.Logger;
+import org.sleuthkit.autopsy.datamodel.DataConversion;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.TskException;
 
@@ -33,16 +34,6 @@ import org.sleuthkit.datamodel.TskException;
  */
 public class AbstractFileStringStream extends InputStream {
 
-    public static enum Encoding {
-
-        UTF8 {
-
-            @Override
-            public String toString() {
-                return "UTF-8";
-            }
-        },
-    };
 
     //args
     private AbstractFile content;
@@ -73,7 +64,7 @@ public class AbstractFileStringStream extends InputStream {
     * @param encoding target encoding, currently UTF-8
     * @param preserveOnBuffBoundary whether to preserve or split string on a buffer boundary. If false, will pack into read buffer up to max. possible, potentially splitting a string. If false, the string will be preserved for next read.
     */
-    public AbstractFileStringStream(AbstractFile content, Encoding encoding, boolean preserveOnBuffBoundary) {
+    public AbstractFileStringStream(AbstractFile content, ByteContentStream.Encoding encoding, boolean preserveOnBuffBoundary) {
        this.content = content;
        this.encoding = encoding.toString();
        //this.preserveOnBuffBoundary = preserveOnBuffBoundary;
@@ -87,7 +78,7 @@ public class AbstractFileStringStream extends InputStream {
     * @param content to extract strings from
     * @param encoding target encoding, currently UTF-8
     */
-    public AbstractFileStringStream(AbstractFile content, Encoding encoding) {
+    public AbstractFileStringStream(AbstractFile content, ByteContentStream.Encoding encoding) {
        this(content, encoding, false);
    }
 
@@ -0,0 +1,149 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2012 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.sleuthkit.autopsy.ingest.IngestServiceAbstractFile;
+import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.ReadContentInputStream;
+import org.apache.tika.Tika;
+import org.sleuthkit.autopsy.keywordsearch.ByteContentStream.Encoding;
+
+/**
+ * Extractor of text from TIKA supported AbstractFile content. Extracted text is
+ * divided into chunks and indexed with Solr.
+ *
+ * This is especially useful for large content of supported type that is to be
+ * divided into text chunks and indexed as such.
+ *
+ */
+public class AbstractFileTikaTextExtract implements AbstractFileExtract {
+
+    private static final Logger logger = Logger.getLogger(IngestServiceAbstractFile.class.getName());
+    private static final Encoding ENCODING = Encoding.UTF8;
+    static final Charset charset = Charset.forName(ENCODING.toString());
+    static final int MAX_EXTR_TEXT_CHUNK_SIZE = 1 * 1024 * 1024;
+    private static final char[] TEXT_CHUNK_BUF = new char[MAX_EXTR_TEXT_CHUNK_SIZE];
+    private static final Tika tika = new Tika();
+    private KeywordSearchIngestService service;
+    private Ingester ingester;
+    private AbstractFile sourceFile;
+    private int numChunks = 0;
+    private static final String UTF16BOM = "\uFEFF";
+
+    AbstractFileTikaTextExtract(AbstractFile sourceFile) {
+        this.sourceFile = sourceFile;
+        this.service = KeywordSearchIngestService.getDefault();
+        Server solrServer = KeywordSearch.getServer();
+        ingester = solrServer.getIngester();
+    }
+
+    @Override
+    public int getNumChunks() {
+        return numChunks;
+    }
+
+    @Override
+    public AbstractFile getSourceFile() {
+        return sourceFile;
+    }
+
+    @Override
+    public boolean index() throws Ingester.IngesterException {
+        boolean success = false;
+        Reader reader = null;
+        try {
+            success = true;
+            reader = tika.parse(new ReadContentInputStream(sourceFile));
+            long readSize;
+            while ((readSize = reader.read(TEXT_CHUNK_BUF, 0, MAX_EXTR_TEXT_CHUNK_SIZE)) != -1) {
+
+                //encode to bytes to index as byte stream
+                String extracted;
+                if (readSize < MAX_EXTR_TEXT_CHUNK_SIZE) {
+                    //trim the 0 bytes
+                    StringBuilder sb = new StringBuilder((int) readSize + 5);
+                    //inject BOM here (saves byte buffer realloc), will be converted to specific encoding BOM
+                    sb.append(UTF16BOM);
+                    sb.append(TEXT_CHUNK_BUF, 0, (int) readSize);
+                    extracted = sb.toString();
+                } else {
+                    StringBuilder sb = new StringBuilder((int) readSize + 5);
+                    //inject BOM here (saves byte buffer realloc), will be converted to specific encoding BOM
+                    sb.append(UTF16BOM);
+                    sb.append(TEXT_CHUNK_BUF);
+                    extracted = sb.toString();
+                }
+                //converts BOM automatically to charSet encoding
+                byte[] encodedBytes = extracted.getBytes(charset);
+
+                //PrintStream s = new PrintStream("c:\\temp\\ps.txt");
+                //for (byte b : encodedBytes) {
+                //    s.format("%02x ", b);
+                //}
+                //s.close();
+
+                //debug
+                //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks + 1));
+                //debug.write(encodedBytes, 0, encodedBytes.length);
+                //debug.close();
+
+                AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
+
+                try {
+                    chunk.index(ingester, encodedBytes, encodedBytes.length, ENCODING);
+                    ++this.numChunks;
+                } catch (Ingester.IngesterException ingEx) {
+                    success = false;
+                    logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '"
+                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);
+                    throw ingEx; //need to rethrow/return to signal error and move on
+                }
+
+                //check if need invoke commit/search between chunks
+                //not to delay commit if timer has gone off
+                service.checkRunCommitSearch();
+
+            }
+
+        } catch (IOException ex) {
+            logger.log(Level.WARNING, "Unable to read content stream from " + sourceFile.getId(), ex);
+        } finally {
+            try {
+                reader.close();
+            } catch (IOException ex) {
+                logger.log(Level.WARNING, "Unable to close content stream from " + sourceFile.getId(), ex);
+            }
+        }
+
+        //after all chunks, ingest the parent file without content itself, and store numChunks
+        ingester.ingest(this);
+
+        return success;
+    }
+}
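The new Tika-based extractor above leans on Tika's streaming API: Tika.parse(InputStream) returns a Reader that yields extracted text incrementally, so even very large files never have to be held in memory at once. A hedged, self-contained sketch of that call outside Autopsy, assuming tika-core/tika-parsers 0.10 on the classpath and using a placeholder file path:

import java.io.FileInputStream;
import java.io.IOException;
import java.io.Reader;
import org.apache.tika.Tika;

public class TikaStreamingSketch {
    public static void main(String[] args) throws IOException {
        Tika tika = new Tika();
        char[] buf = new char[64 * 1024];
        // "sample.pdf" is only a placeholder input file
        try (Reader reader = tika.parse(new FileInputStream("sample.pdf"))) {
            int n;
            while ((n = reader.read(buf, 0, buf.length)) != -1) {
                // each block of extracted characters would become one indexed chunk
                System.out.println("read " + n + " chars of extracted text");
            }
        }
    }
}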
@@ -25,15 +25,32 @@ import java.io.InputStreamReader;
 import java.io.Reader;
 import java.util.logging.Logger;
 import org.apache.solr.common.util.ContentStream;
-import org.sleuthkit.autopsy.datamodel.AbstractFileStringStream.Encoding;
 import org.sleuthkit.datamodel.AbstractContent;
-import org.sleuthkit.datamodel.FsContent;
 
 /**
  * Stream of bytes representing string with specified encoding
  * to feed into Solr as ContentStream
  */
 public class ByteContentStream implements ContentStream {
 
+    public static enum Encoding {
+
+        UTF8 {
+
+            @Override
+            public String toString() {
+                return "UTF-8";
+            }
+        },
+        UTF16 {
+
+            @Override
+            public String toString() {
+                return "UTF-16";
+            }
+        },
+    };
+
     //input
     private byte[] content; //extracted subcontent
     private long contentSize;
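A brief note on why the relocated Encoding enum overrides toString(): the string form is a valid java.nio charset name, so the same constant can label the Solr content stream and drive byte encoding (as AbstractFileTikaTextExtract does with Charset.forName). A small illustrative sketch, not part of the commit:

import java.nio.charset.Charset;

public class EncodingNameSketch {
    enum Encoding {
        UTF8 { @Override public String toString() { return "UTF-8"; } },
        UTF16 { @Override public String toString() { return "UTF-16"; } };
    }

    public static void main(String[] args) {
        Charset cs = Charset.forName(Encoding.UTF8.toString());
        // the BOM character is converted into the target charset's byte-order mark
        byte[] bytes = "\uFEFFhello".getBytes(cs);
        System.out.println(cs + " -> " + bytes.length + " bytes");
    }
}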
@@ -1,170 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2011 Basis Technology Corp.
- * Contact: carrier <at> sleuthkit <dot> org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.sleuthkit.autopsy.keywordsearch;
-
-import java.io.IOException;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import org.sleuthkit.autopsy.datamodel.AbstractFileStringStream;
-import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
-import org.sleuthkit.datamodel.AbstractFile;
-
-/**
- * Utility to extract strings and index a file with string content as chunks
- * associated with the original parent file
- */
-class FileExtract {
-
-    KeywordSearchIngestService service;
-    private int numChunks;
-    private static final Logger logger = Logger.getLogger(FileExtract.class.getName());
-    static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
-    private AbstractFile sourceFile;
-
-    //single static buffer for all extractions. Safe, indexing can only happen in one thread
-    private static final byte[] STRING_CHUNK_BUF = new byte[(int) MAX_STRING_CHUNK_SIZE];
-    private static final int BOM_LEN = 3;
-    static {
-        //prepend UTF-8 BOM to start of the buffer
-        STRING_CHUNK_BUF[0] = (byte)0xEF;
-        STRING_CHUNK_BUF[1] = (byte)0xBB;
-        STRING_CHUNK_BUF[2] = (byte)0xBF;
-    }
-
-    public FileExtract(KeywordSearchIngestService service, AbstractFile sourceFile) {
-        this.service = service;
-        this.sourceFile = sourceFile;
-        numChunks = 0; //unknown until indexing is done
-    }
-
-    public int getNumChunks() {
-        return this.numChunks;
-    }
-
-    public AbstractFile getSourceFile() {
-        return sourceFile;
-    }
-
-    public boolean index(Ingester ingester) throws IngesterException {
-        boolean success = false;
-
-        AbstractFileStringStream stringStream = null;
-        try {
-            success = true;
-            //break string into chunks
-            //Note: could use DataConversion.toString() since we are operating on fixed chunks
-            //but FsContentStringStream handles string boundary case better
-            stringStream = new AbstractFileStringStream(sourceFile, AbstractFileStringStream.Encoding.UTF8);
-            long readSize = 0;
-
-            while ((readSize = stringStream.read(STRING_CHUNK_BUF, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
-                //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
-                //debug.write(STRING_CHUNK_BUF, 0, (int)readSize);
-
-                FileExtractedChild chunk = new FileExtractedChild(this, this.numChunks + 1);
-
-                try {
-                    chunk.index(ingester, STRING_CHUNK_BUF, readSize + BOM_LEN);
-                    ++this.numChunks;
-                } catch (IngesterException ingEx) {
-                    success = false;
-                    logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);
-                    throw ingEx; //need to rethrow/return to signal error and move on
-                }
-
-                //check if need invoke commit/search between chunks
-                //not to delay commit if timer has gone off
-                service.checkRunCommitSearch();
-
-                //debug.close();
-            }
-
-            //after all chunks, ingest the parent file without content itself, and store numChunks
-            ingester.ingest(this);
-
-        } catch (IOException ex) {
-            logger.log(Level.WARNING, "Unable to read string stream and send to Solr, file: " + sourceFile.getName(), ex);
-            success = false;
-        } finally {
-            if (stringStream != null) {
-                try {
-                    stringStream.close();
-                } catch (IOException ex) {
-                    logger.log(Level.WARNING, "Error closing string stream, file: " + sourceFile.getName(), ex);
-                }
-            }
-        }
-
-        return success;
-    }
-}
-
-/**
- * Represents each string chunk to be indexed, a child of FileExtracted file
- */
-class FileExtractedChild {
-
-    private int chunkID;
-    private FileExtract parent;
-
-    FileExtractedChild(FileExtract parent, int chunkID) {
-        this.parent = parent;
-        this.chunkID = chunkID;
-    }
-
-    public FileExtract getParentFile() {
-        return parent;
-    }
-
-    public int getChunkId() {
-        return chunkID;
-    }
-
-    /**
-     * return String representation of the absolute id (parent and child)
-     * @return
-     */
-    public String getIdString() {
-        return getFileExtractChildId(this.parent.getSourceFile().getId(), this.chunkID);
-    }
-
-    public boolean index(Ingester ingester, byte[] content, long contentSize) throws IngesterException {
-        boolean success = true;
-        ByteContentStream bcs = new ByteContentStream(content, contentSize, parent.getSourceFile(), AbstractFileStringStream.Encoding.UTF8);
-        try {
-            ingester.ingest(this, bcs);
-            //logger.log(Level.INFO, "Ingesting string chunk: " + this.getName() + ": " + chunkID);
-
-        } catch (Exception ingEx) {
-            success = false;
-            throw new IngesterException("Problem ingesting file string chunk: " + parent.getSourceFile().getId() + ", chunk: " + chunkID, ingEx);
-        }
-        return success;
-    }
-
-    public static String getFileExtractChildId(long parentID, int childID) {
-        return Long.toString(parentID) + Server.ID_CHUNK_SEP + Integer.toString(childID);
-    }
-}
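The chunk-id convention visible in the removed FileExtractedChild code (and now centralized in Server.getChunkIdString, used by the Ingester and LuceneQuery changes below) is: parent file id, a separator, then the 1-based chunk number. A small hedged sketch; the real separator value lives in Server.ID_CHUNK_SEP and is not shown in this diff, so "_" here is only an assumption for illustration:

public class ChunkIdSketch {
    static final String ID_CHUNK_SEP = "_"; // placeholder for Server.ID_CHUNK_SEP

    static String getChunkIdString(long parentId, int chunkId) {
        return Long.toString(parentId) + ID_CHUNK_SEP + Integer.toString(chunkId);
    }

    public static void main(String[] args) {
        // file 42, third chunk; a chunkID of 0 means "the whole file", as LuceneQuery checks
        System.out.println(getChunkIdString(42L, 3));
    }
}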
@@ -73,6 +73,17 @@ public class Ingester {
         "pst", "xml", "class", "dwg", "eml", "emlx", "mbox", "mht"};
 
 
+    private static Ingester instance;
+
+    private Ingester() {
+
+    }
+
+    public static synchronized Ingester getDefault() {
+        if (instance == null)
+            instance = new Ingester();
+        return instance;
+    }
 
     @Override
     @SuppressWarnings("FinalizeDeclaration")
@@ -99,16 +110,16 @@ public class Ingester {
     }
 
     /**
-     * Sends a FileExtract to Solr to have its content extracted and added to the
+     * Sends a AbstractFileExtract to Solr to have its content extracted and added to the
      * index. commit() should be called once you're done ingesting files.
      * FileExtract represents a parent of extracted file with actual content.
-     * The parent itself has no content, only meta data and is used to associate the extracted FileExtractedChild
+     * The parent itself has no content, only meta data and is used to associate the extracted AbstractFileChunk
      *
-     * @param fe FileExtract to ingest
+     * @param fe AbstractFileExtract to ingest
      * @throws IngesterException if there was an error processing a specific
      * file, but the Solr server is probably fine.
      */
-    void ingest(FileExtract fe) throws IngesterException {
+    void ingest(AbstractFileExtract fe) throws IngesterException {
         Map<String, String> params = getContentFields(fe.getSourceFile());
 
         params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(fe.getNumChunks()));
@@ -117,23 +128,23 @@ public class Ingester {
     }
 
     /**
-     * Sends a FileExtractedChild to Solr and its extracted content stream to be added to the
+     * Sends a AbstractFileChunk to Solr and its extracted content stream to be added to the
      * index. commit() should be called once you're done ingesting files.
-     * FileExtractedChild represents a file chunk and its chunk content.
+     * AbstractFileChunk represents a file chunk and its chunk content.
      *
-     * @param fec FileExtractedChild to ingest
+     * @param fec AbstractFileChunk to ingest
      * @throws IngesterException if there was an error processing a specific
     * file, but the Solr server is probably fine.
     */
-    void ingest(FileExtractedChild fec, ByteContentStream bcs) throws IngesterException {
+    void ingest(AbstractFileChunk fec, ByteContentStream bcs) throws IngesterException {
         AbstractContent sourceContent = bcs.getSourceContent();
         Map<String, String> params = getContentFields(sourceContent);
 
         //overwrite id with the chunk id
         params.put(Server.Schema.ID.toString(),
-                FileExtractedChild.getFileExtractChildId(sourceContent.getId(), fec.getChunkId()));
+                Server.getChunkIdString(sourceContent.getId(), fec.getChunkId()));
 
-        ingest(bcs, params, FileExtract.MAX_STRING_CHUNK_SIZE);
+        ingest(bcs, params, AbstractFileStringExtract.MAX_STRING_CHUNK_SIZE);
     }
 
     /**
@@ -448,8 +459,9 @@ public class Ingester {
     */
    static boolean isIngestible(AbstractFile aFile) {
        TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
-        if (! aType.equals(TSK_DB_FILES_TYPE_ENUM.FS) )
+        if (! aType.equals(TSK_DB_FILES_TYPE_ENUM.FS) ) {
            return false;
+        }
 
        FsContent fsContent = (FsContent) aFile;
 
@@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.keywordsearch;
 
 import java.awt.event.ActionEvent;
 import java.awt.event.ActionListener;
+import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
@@ -40,7 +41,6 @@ import org.netbeans.api.progress.ProgressHandleFactory;
 import org.openide.util.Cancellable;
 import org.openide.util.Exceptions;
 import org.sleuthkit.autopsy.casemodule.Case;
-import org.sleuthkit.autopsy.ingest.IngestManager;
 import org.sleuthkit.autopsy.ingest.IngestManagerProxy;
 import org.sleuthkit.autopsy.ingest.IngestMessage;
 import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
@@ -95,7 +95,7 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
     private final String hashDBServiceName = "Hash Lookup"; //NOTE this needs to match the HashDB service getName()
     private SleuthkitCase caseHandle = null;
     private boolean skipKnown = true;
-    boolean initialized = false;
+    private boolean initialized = false;
 
     private enum IngestStatus {
 
@@ -200,6 +200,7 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
             managerProxy.postMessage(IngestMessage.createMessage(++messageID, MessageType.INFO, this, "Completed"));
         }
 
+
         //postSummary();
     }
 
@@ -224,6 +225,7 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
         runSearcher = false;
         finalSearcherDone = true;
 
+
         //commit uncommited files, don't search again
         commit();
 
@@ -498,16 +500,27 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
 
         private final Logger logger = Logger.getLogger(Indexer.class.getName());
 
-        private boolean extractAndIngest(AbstractFile aFile) {
-            boolean indexed = false;
-            final FileExtract fe = new FileExtract(KeywordSearchIngestService.this, aFile);
-            try {
-                indexed = fe.index(ingester);
-            } catch (IngesterException ex) {
-                logger.log(Level.WARNING, "Error extracting strings and indexing file: " + aFile.getName(), ex);
-                indexed = false;
-            }
-            return indexed;
+        /**
+         * Extract strings or text with Tika (by streaming) from the file. Divide
+         * the file into chunks and index the chunks
+         *
+         * @param aFile file to extract strings from, divide into chunks and
+         * index
+         * @param stringsOnly true if use string extraction, false if use Tika
+         * text extractor
+         * @return true if the file was indexed, false otherwise
+         */
+        private boolean extractIndex(AbstractFile aFile, boolean stringsOnly) throws IngesterException {
+            AbstractFileExtract fileExtract;
+
+            if (stringsOnly) {
+                fileExtract = new AbstractFileStringExtract(aFile);
+            } else {
+                fileExtract = new AbstractFileTikaTextExtract(aFile);
+            }
+
+            //divide into chunks and index
+            return fileExtract.index();
         }
 
         private void indexFile(AbstractFile aFile, boolean indexContent) {
@@ -537,9 +550,8 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
             boolean ingestibleFile = Ingester.isIngestible(aFile);
 
             final long size = aFile.getSize();
-            //if fs file, limit size of entire file, do not limit strings
-            if (fsContent != null && (size == 0 || (ingestibleFile && size > MAX_INDEX_SIZE))) {
-                //if fs file, index meta only, otherwise if unalloc, skip
+            //if fs file with no content (size is 0), index meta-data only
+            if (fsContent != null && size == 0) {
                 try {
                     ingester.ingest(fsContent, false); //meta-data only
                     ingestStatus.put(aFile.getId(), IngestStatus.INGESTED_META);
@@ -548,15 +560,21 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
                     logger.log(Level.WARNING, "Unable to index meta-data for fsContent: " + fsContent.getId(), ex);
                 }
 
-                return;
-            }
-
-            if (fsContent != null && ingestibleFile == true) {
-                //we know it's an allocated fs file (FsContent) with supported content
+            } else if (fsContent != null && ingestibleFile == true) {
+                //we know it's an allocated fs file (FsContent) with supported content
+                //extract text with Tika, divide into chunks and index with Solr
                 try {
                     //logger.log(Level.INFO, "indexing: " + fsContent.getName());
-                    ingester.ingest(fsContent, true);
-                    ingestStatus.put(fsContent.getId(), IngestStatus.INGESTED);
+                    //ingester.ingest(fsContent, true);
+                    if (!extractIndex(aFile, false)) {
+                        logger.log(Level.WARNING, "Failed to extract Tika text and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").");
+                        ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
+
+                    } else {
+                        ingestStatus.put(aFile.getId(), IngestStatus.INGESTED);
+
+                    }
+
                 } catch (IngesterException e) {
                     ingestStatus.put(fsContent.getId(), IngestStatus.SKIPPED);
                     //try to extract strings, if a file
@@ -578,13 +596,19 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
         }
 
         private boolean processNonIngestible(AbstractFile aFile) {
-            if (!extractAndIngest(aFile)) {
-                logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").");
+            try {
+                if (!extractIndex(aFile, true)) {
+                    logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").");
+                    ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
+                    return false;
+                } else {
+                    ingestStatus.put(aFile.getId(), IngestStatus.EXTRACTED_INGESTED);
+                    return true;
+                }
+            } catch (IngesterException ex) {
+                logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex);
                 ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
                 return false;
-            } else {
-                ingestStatus.put(aFile.getId(), IngestStatus.EXTRACTED_INGESTED);
-                return true;
             }
         }
     }
@@ -326,7 +326,7 @@ public class LuceneQuery implements KeywordSearchQuery {
         if (chunkID == 0) {
             contentIDStr = Long.toString(contentID);
         } else {
-            contentIDStr = FileExtractedChild.getFileExtractChildId(contentID, chunkID);
+            contentIDStr = Server.getChunkIdString(contentID, chunkID);
         }
 
         String idQuery = Server.Schema.ID.toString() + ":" + contentIDStr;
@ -60,74 +60,63 @@ class Server {
|
|||||||
public static enum Schema {
|
public static enum Schema {
|
||||||
|
|
||||||
ID {
|
ID {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "id";
|
return "id";
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
CONTENT {
|
CONTENT {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "content";
|
return "content";
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
CONTENT_WS {
|
CONTENT_WS {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "content_ws";
|
return "content_ws";
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
FILE_NAME {
|
FILE_NAME {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "file_name";
|
return "file_name";
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
CTIME {
|
CTIME {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "ctime";
|
return "ctime";
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
ATIME {
|
ATIME {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "atime";
|
return "atime";
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
MTIME {
|
MTIME {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "mtime";
|
return "mtime";
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
CRTIME {
|
CRTIME {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "crtime";
|
return "crtime";
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
NUM_CHUNKS {
|
NUM_CHUNKS {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "num_chunks";
|
return "num_chunks";
|
||||||
}
|
}
|
||||||
},};
|
},
|
||||||
|
};
|
||||||
public static final String HL_ANALYZE_CHARS_UNLIMITED = "-1";
|
public static final String HL_ANALYZE_CHARS_UNLIMITED = "-1";
|
||||||
|
|
||||||
//max content size we can send to Solr
|
//max content size we can send to Solr
|
||||||
public static final long MAX_CONTENT_SIZE = 1L * 1024 * 1024 * 1024;
|
public static final long MAX_CONTENT_SIZE = 1L * 1024 * 1024 * 1024;
|
||||||
|
|
||||||
private static final Logger logger = Logger.getLogger(Server.class.getName());
|
private static final Logger logger = Logger.getLogger(Server.class.getName());
|
||||||
private static final String DEFAULT_CORE_NAME = "coreCase";
|
private static final String DEFAULT_CORE_NAME = "coreCase";
|
||||||
// TODO: DEFAULT_CORE_NAME needs to be replaced with unique names to support multiple open cases
|
// TODO: DEFAULT_CORE_NAME needs to be replaced with unique names to support multiple open cases
|
||||||
@ -136,6 +125,8 @@ class Server {
|
|||||||
private String javaPath = "java";
|
private String javaPath = "java";
|
||||||
private static final int MAX_SOLR_MEM_MB = 512; //TODO set dynamically based on avail. system resources
|
private static final int MAX_SOLR_MEM_MB = 512; //TODO set dynamically based on avail. system resources
|
||||||
private Process curSolrProcess = null;
|
private Process curSolrProcess = null;
|
||||||
|
|
||||||
|
private static Ingester ingester = null;
|
||||||
|
|
||||||
public enum CORE_EVT_STATES {
|
public enum CORE_EVT_STATES {
|
||||||
|
|
||||||
@ -148,6 +139,7 @@ class Server {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* New instance for the server at the given URL
|
* New instance for the server at the given URL
|
||||||
|
*
|
||||||
* @param url should be something like "http://localhost:8983/solr/"
|
* @param url should be something like "http://localhost:8983/solr/"
|
||||||
*/
|
*/
|
||||||
Server(String url) {
|
Server(String url) {
|
||||||
@ -220,7 +212,7 @@ class Server {
|
|||||||
bw.newLine();
|
bw.newLine();
|
||||||
if (Version.getBuildType() == Version.Type.DEVELOPMENT) {
|
if (Version.getBuildType() == Version.Type.DEVELOPMENT) {
|
||||||
//flush buffers if dev version for debugging
|
//flush buffers if dev version for debugging
|
||||||
bw.flush();
|
bw.flush();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
@ -237,7 +229,7 @@ class Server {
|
|||||||
void start() {
|
void start() {
|
||||||
logger.log(Level.INFO, "Starting Solr server from: " + solrFolder.getAbsolutePath());
|
logger.log(Level.INFO, "Starting Solr server from: " + solrFolder.getAbsolutePath());
|
||||||
try {
|
try {
|
||||||
final String MAX_SOLR_MEM_MB_PAR = " -Xmx" + Integer.toString(MAX_SOLR_MEM_MB) + "m";
|
final String MAX_SOLR_MEM_MB_PAR = " -Xmx" + Integer.toString(MAX_SOLR_MEM_MB) + "m";
|
||||||
final String SOLR_START_CMD = javaPath + MAX_SOLR_MEM_MB_PAR + " -DSTOP.PORT=8079 -DSTOP.KEY=mysecret -jar start.jar";
|
final String SOLR_START_CMD = javaPath + MAX_SOLR_MEM_MB_PAR + " -DSTOP.PORT=8079 -DSTOP.KEY=mysecret -jar start.jar";
|
||||||
logger.log(Level.INFO, "Starting Solr using: " + SOLR_START_CMD);
|
logger.log(Level.INFO, "Starting Solr using: " + SOLR_START_CMD);
|
||||||
curSolrProcess = Runtime.getRuntime().exec(SOLR_START_CMD, null, solrFolder);
|
curSolrProcess = Runtime.getRuntime().exec(SOLR_START_CMD, null, solrFolder);
|
||||||
@ -259,9 +251,8 @@ class Server {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Tries to stop a Solr instance.
|
* Tries to stop a Solr instance.
|
||||||
*
|
*
|
||||||
* Waits for the stop command to finish
|
* Waits for the stop command to finish before returning.
|
||||||
* before returning.
|
|
||||||
*/
|
*/
|
||||||
synchronized void stop() {
|
synchronized void stop() {
|
||||||
try {
|
try {
|
||||||
@ -283,8 +274,11 @@ class Server {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests if there's a Solr server running by sending it a core-status request.
|
* Tests if there's a Solr server running by sending it a core-status
|
||||||
* @return false if the request failed with a connection error, otherwise true
|
* request.
|
||||||
|
*
|
||||||
|
* @return false if the request failed with a connection error, otherwise
|
||||||
|
* true
|
||||||
*/
|
*/
|
||||||
synchronized boolean isRunning() {
|
synchronized boolean isRunning() {
|
||||||
|
|
||||||
@@ -311,7 +305,9 @@ class Server {
 
         return true;
     }
-    /**** Convenience methods for use while we only open one case at a time ****/
+    /**
+     * ** Convenience methods for use while we only open one case at a time ***
+     */
     private volatile Core currentCore = null;
 
     synchronized void openCore() {
@@ -331,11 +327,14 @@ class Server {
         serverAction.putValue(CORE_EVT, CORE_EVT_STATES.STOPPED);
     }
 
-    /**** end single-case specific methods ****/
+    /**
+     * ** end single-case specific methods ***
+     */
     /**
      * Open a core for the given case
+     *
      * @param c
      * @return
      */
     synchronized Core openCore(Case c) {
         String sep = File.separator;
@@ -345,6 +344,7 @@ class Server {
 
     /**
      * commit current core if it exists
+     *
      * @throws SolrServerException, NoOpenCoreException
      */
     synchronized void commit() throws SolrServerException, NoOpenCoreException {
@@ -362,10 +362,12 @@ class Server {
     }
 
     /**
-     * Execute query that gets only number of all Solr files indexed
-     * without actually returning the files. The result does not include chunks, only number of actual files.
+     * Execute query that gets only number of all Solr files indexed without
+     * actually returning the files. The result does not include chunks, only
+     * number of actual files.
+     *
      * @return int representing number of indexed files
      * @throws SolrServerException
      */
     public int queryNumIndexedFiles() throws SolrServerException, NoOpenCoreException {
         if (currentCore == null) {
@@ -374,12 +376,13 @@ class Server {
 
         return currentCore.queryNumIndexedFiles();
     }
 
     /**
-     * Execute query that gets only number of all Solr documents indexed (files and chunks)
-     * without actually returning the documents
+     * Execute query that gets only number of all Solr documents indexed (files
+     * and chunks) without actually returning the documents
+     *
      * @return int representing number of indexed files (files and chunks)
      * @throws SolrServerException
      */
     public int queryNumIndexedDocuments() throws SolrServerException, NoOpenCoreException {
         if (currentCore == null) {
@@ -391,6 +394,7 @@ class Server {
 
     /**
      * Return true if the file is indexed (either as a whole as a chunk)
+     *
      * @param contentID
      * @return true if it is indexed
      * @throws SolrServerException, NoOpenCoreException
@@ -405,9 +409,11 @@ class Server {
 
     /**
      * Execute query that gets number of indexed file chunks for a file
+     *
      * @param fileID file id of the original file broken into chunks and indexed
-     * @return int representing number of indexed file chunks, 0 if there is no chunks
-     * @throws SolrServerException
+     * @return int representing number of indexed file chunks, 0 if there is no
+     * chunks
+     * @throws SolrServerException
      */
     public int queryNumFileChunks(long fileID) throws SolrServerException, NoOpenCoreException {
         if (currentCore == null) {
@@ -419,10 +425,11 @@ class Server {
 
     /**
      * Execute solr query
+     *
      * @param sq query
      * @return query response
      * @throws SolrServerException
      * @throws NoOpenCoreException
      */
     public QueryResponse query(SolrQuery sq) throws SolrServerException, NoOpenCoreException {
         if (currentCore == null) {
@@ -433,11 +440,12 @@ class Server {
 
     /**
      * Execute solr query
+     *
      * @param sq the query
      * @param method http method to use
      * @return query response
      * @throws SolrServerException
      * @throws NoOpenCoreException
      */
     public QueryResponse query(SolrQuery sq, SolrRequest.METHOD method) throws SolrServerException, NoOpenCoreException {
         if (currentCore == null) {
@@ -448,10 +456,11 @@ class Server {
 
     /**
      * Execute Solr terms query
+     *
      * @param sq the query
      * @return terms response
      * @throws SolrServerException
      * @throws NoOpenCoreException
      */
     public TermsResponse queryTerms(SolrQuery sq) throws SolrServerException, NoOpenCoreException {
         if (currentCore == null) {
@@ -462,10 +471,11 @@ class Server {
 
     /**
      * Execute Solr query to get content text
+     *
      * @param content to get the text for
      * @return content text string
      * @throws SolrServerException
      * @throws NoOpenCoreException
      */
     public String getSolrContent(final Content content) throws SolrServerException, NoOpenCoreException {
         if (currentCore == null) {
@@ -473,14 +483,16 @@ class Server {
         }
         return currentCore.getSolrContent(content.getId(), 0);
     }
 
     /**
      * Execute Solr query to get content text from content chunk
+     *
      * @param content to get the text for
-     * @param chunkID chunk number to query (starting at 1), or 0 if there is no chunks for that content
+     * @param chunkID chunk number to query (starting at 1), or 0 if there is no
+     * chunks for that content
      * @return content text string
      * @throws SolrServerException
      * @throws NoOpenCoreException
      */
     public String getSolrContent(final Content content, int chunkID) throws SolrServerException, NoOpenCoreException {
         if (currentCore == null) {
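
A short, hypothetical usage example of the two getSolrContent() overloads documented above; here server stands for an open Server instance, file for an ingested Content object, and the chunk number is made up.

String wholeFileText = server.getSolrContent(file);     // un-chunked file (chunk 0 under the hood)
String thirdChunk    = server.getSolrContent(file, 3);  // chunk 3 of a file that was split during indexing
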
@@ -490,15 +502,28 @@ class Server {
     }
 
     /**
-     * factory method to create ingester
-     * @return ingester
+     * Method to return ingester instance
+     *
+     * @return ingester instance
      */
-    public Ingester getIngester() {
-        return new Ingester();
+    public static Ingester getIngester() {
+        return Ingester.getDefault();
+    }
+
+    /**
+     * Given file parent id and child chunk ID, return the ID string of the chunk
+     * as stored in Solr, e.g. FILEID_CHUNKID
+     * @param parentID the parent file id (id of the source content)
+     * @param childID the child chunk id
+     * @return formatted string id
+     */
+    public static String getChunkIdString(long parentID, int childID) {
+        return Long.toString(parentID) + Server.ID_CHUNK_SEP + Integer.toString(childID);
     }
 
     /**
      * Open a new core
+     *
      * @param coreName name to refer to the core by in Solr
      * @param dataDir directory to load/store the core data from/to
      * @return new core
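
For illustration, and assuming ID_CHUNK_SEP is an underscore (the separator constant itself is defined elsewhere in Server), the new getChunkIdString() helper would format chunk document IDs like this:

String chunkId = Server.getChunkIdString(42L, 3);   // "42_3": document ID for chunk 3 of content 42
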
@@ -574,13 +599,13 @@ class Server {
         }
     }
 
-    private String getSolrContent(long contentID, int chunkID) {
+    private String getSolrContent(long contentID, int chunkID) {
         final SolrQuery q = new SolrQuery();
         q.setQuery("*:*");
         String filterQuery = Schema.ID.toString() + ":" + contentID;
-        if (chunkID != 0)
+        if (chunkID != 0) {
             filterQuery = filterQuery + Server.ID_CHUNK_SEP + chunkID;
+        }
         q.addFilterQuery(filterQuery);
         q.setFields(Schema.CONTENT.toString());
         try {
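
Spelled out with concrete values, the private getSolrContent() above builds a query like the following sketch; the literal field names and the "_" separator are assumptions drawn from this diff, not verified against the Solr schema.

SolrQuery q = new SolrQuery();
q.setQuery("*:*");                 // match everything...
q.addFilterQuery("id:42_3");       // ...then filter to content 42, chunk 3 ("id:42" when chunkID == 0)
q.setFields("content");            // return only the indexed text field
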
@@ -602,11 +627,12 @@ class Server {
     }
 
     /**
-     * Execute query that gets only number of all Solr files (not chunks) indexed
-     * without actually returning the files
+     * Execute query that gets only number of all Solr files (not chunks)
+     * indexed without actually returning the files
      *
-     * @return int representing number of indexed files (entire files, not chunks)
-     * @throws SolrServerException
+     * @return int representing number of indexed files (entire files, not
+     * chunks)
+     * @throws SolrServerException
      */
     private int queryNumIndexedFiles() throws SolrServerException {
         SolrQuery q = new SolrQuery(Server.Schema.ID + ":*" + Server.ID_CHUNK_SEP + "*");
@@ -614,14 +640,15 @@ class Server {
         int numChunks = (int) query(q).getResults().getNumFound();
         return queryNumIndexedDocuments() - numChunks;
     }
 
     /**
      * Execute query that gets only number of all Solr documents indexed
-     * without actually returning the documents. Documents include entire indexed files
-     * as well as chunks, which are treated as documents.
+     * without actually returning the documents. Documents include entire
+     * indexed files as well as chunks, which are treated as documents.
      *
-     * @return int representing number of indexed documents (entire files and chunks)
-     * @throws SolrServerException
+     * @return int representing number of indexed documents (entire files
+     * and chunks)
+     * @throws SolrServerException
      */
     private int queryNumIndexedDocuments() throws SolrServerException {
         SolrQuery q = new SolrQuery("*:*");
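
Taken together, the two private counters above rely on the chunk ID convention: chunk documents match the wildcard ID pattern, so subtracting the chunk count from the total document count leaves the number of whole files. A worked example with made-up numbers (and the assumption, consistent with this code, that every indexed file keeps its own parent document even when chunked):

// Hypothetical index: 10 files, two of which were split into 5 chunks each
queryNumIndexedDocuments()   // 10 file documents + 10 chunk documents = 20
numChunks                    // documents matching "id:*_*"            = 10
queryNumIndexedFiles()       // 20 - 10                                = 10
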
@@ -631,9 +658,10 @@ class Server {
 
     /**
      * Return true if the file is indexed (either as a whole as a chunk)
+     *
      * @param contentID
      * @return true if it is indexed
      * @throws SolrServerException
      */
     private boolean queryIsIndexed(long contentID) throws SolrServerException {
         SolrQuery q = new SolrQuery("*:*");
@@ -645,12 +673,15 @@ class Server {
 
     /**
      * Execute query that gets number of indexed file chunks for a file
-     * @param contentID file id of the original file broken into chunks and indexed
-     * @return int representing number of indexed file chunks, 0 if there is no chunks
-     * @throws SolrServerException
+     *
+     * @param contentID file id of the original file broken into chunks and
+     * indexed
+     * @return int representing number of indexed file chunks, 0 if there is
+     * no chunks
+     * @throws SolrServerException
      */
     private int queryNumFileChunks(long contentID) throws SolrServerException {
         final SolrQuery q =
                 new SolrQuery(Server.Schema.ID + ":" + Long.toString(contentID) + Server.ID_CHUNK_SEP + "*");
         q.setRows(0);
         return (int) query(q).getResults().getNumFound();
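
The chunk-count query above uses the same ID convention. Written out against a plain SolrServer handle (solrServer is a hypothetical variable, and the "_" separator and "id" field name are assumptions), it amounts to:

SolrQuery q = new SolrQuery("id:42_*");                           // every chunk of content 42
q.setRows(0);                                                     // no documents needed, just the hit count
long numChunks = solrServer.query(q).getResults().getNumFound();
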
@@ -1,5 +1,3 @@
 file.reference.commons-lang-2.4.jar=release/modules/ext/commons-lang-2.4.jar
-file.reference.tika-core-1.1.jar=release/modules/ext/tika-core-1.1.jar
-file.reference.tika-parsers-1.1.jar=release/modules/ext/tika-parsers-1.1.jar
 javac.source=1.6
 javac.compilerargs=-Xlint -Xlint:-serial

@@ -53,17 +53,13 @@
         </module-dependencies>
         <public-packages/>
         <class-path-extension>
-            <runtime-relative-path>ext/tika-core-1.1.jar</runtime-relative-path>
-            <binary-origin>release/modules/ext/tika-core-1.1.jar</binary-origin>
+            <runtime-relative-path>ext/tika-core-0.10.jar</runtime-relative-path>
+            <binary-origin>release/modules/ext/tika-core-0.10.jar</binary-origin>
         </class-path-extension>
         <class-path-extension>
             <runtime-relative-path>ext/commons-lang-2.4.jar</runtime-relative-path>
             <binary-origin>release/modules/ext/commons-lang-2.4.jar</binary-origin>
         </class-path-extension>
-        <class-path-extension>
-            <runtime-relative-path>ext/tika-parsers-1.1.jar</runtime-relative-path>
-            <binary-origin>release/modules/ext/tika-parsers-1.1.jar</binary-origin>
-        </class-path-extension>
         </data>
     </configuration>
</project>
BIN  thunderbirdparser/release/modules/ext/tika-core-0.10.jar (new file)
Binary file not shown.
@@ -17,7 +17,7 @@ import org.apache.tika.metadata.*;
  * @author arivera
  */
 public class ThunderbirdMetadata implements CreativeCommons, DublinCore, Geographic, HttpHeaders,
-        IPTC, Message, MSOffice, ClimateForcast, TIFF, TikaMetadataKeys, TikaMimeKeys,
+        Message, MSOffice, ClimateForcast, TIFF, TikaMetadataKeys, TikaMimeKeys,
         Serializable {
 
     private int strArrCount = 0;