diff --git a/KeywordSearch/ivy.xml b/KeywordSearch/ivy.xml
index 3d0a84af01..fe66fa16ed 100644
--- a/KeywordSearch/ivy.xml
+++ b/KeywordSearch/ivy.xml
@@ -16,6 +16,7 @@
+
diff --git a/KeywordSearch/nbproject/project.xml b/KeywordSearch/nbproject/project.xml
index f32685fca2..2d4f8391e4 100644
--- a/KeywordSearch/nbproject/project.xml
+++ b/KeywordSearch/nbproject/project.xml
@@ -160,6 +160,10 @@
<runtime-relative-path>ext/commons-httpclient-3.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/commons-httpclient-3.1.jar</binary-origin>
+ <class-path-extension>
+ <runtime-relative-path>ext/tika-core-0.10.jar</runtime-relative-path>
+ <binary-origin>release/modules/ext/tika-core-0.10.jar</binary-origin>
+ </class-path-extension>
<runtime-relative-path>ext/commons-codec-1.5.jar</runtime-relative-path>
<binary-origin>release/modules/ext/commons-codec-1.5.jar</binary-origin>
@@ -168,6 +172,10 @@
<runtime-relative-path>ext/commons-lang-2.4.jar</runtime-relative-path>
<binary-origin>release/modules/ext/commons-lang-2.4.jar</binary-origin>
+ <class-path-extension>
+ <runtime-relative-path>ext/tika-parsers-0.10.jar</runtime-relative-path>
+ <binary-origin>release/modules/ext/tika-parsers-0.10.jar</binary-origin>
+ </class-path-extension>
<runtime-relative-path>ext/jcl-over-slf4j-1.6.1.jar</runtime-relative-path>
<binary-origin>release/modules/ext/jcl-over-slf4j-1.6.1.jar</binary-origin>
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java
new file mode 100644
index 0000000000..a52be7e39d
--- /dev/null
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java
@@ -0,0 +1,66 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2012 Basis Technology Corp.
+ * Contact: carrier sleuthkit org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.sleuthkit.autopsy.keywordsearch;
+
+import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
+
+/**
+ * Represents a single chunk of extracted text to be indexed, associated with its parent AbstractFileExtract source file
+ */
+class AbstractFileChunk {
+ private int chunkID;
+ private AbstractFileExtract parent;
+
+ AbstractFileChunk(AbstractFileExtract parent, int chunkID) {
+ this.parent = parent;
+ this.chunkID = chunkID;
+ }
+
+ public AbstractFileExtract getParent() {
+ return parent;
+ }
+
+ public int getChunkId() {
+ return chunkID;
+ }
+
+ /**
+ * Return the string representation of the absolute chunk id (parent file id and chunk id)
+ *
+ * @return the chunk id string as stored in Solr
+ */
+ public String getIdString() {
+ return Server.getChunkIdString(this.parent.getSourceFile().getId(), this.chunkID);
+ }
+
+ public boolean index(Ingester ingester, byte[] content, long contentSize, ByteContentStream.Encoding encoding) throws IngesterException {
+ boolean success = true;
+ ByteContentStream bcs = new ByteContentStream(content, contentSize, parent.getSourceFile(), encoding);
+ try {
+ ingester.ingest(this, bcs);
+ //logger.log(Level.INFO, "Ingesting string chunk: " + this.getName() + ": " + chunkID);
+ } catch (Exception ingEx) {
+ success = false;
+ throw new IngesterException("Problem ingesting file string chunk: " + parent.getSourceFile().getId() + ", chunk: " + chunkID, ingEx);
+ }
+ return success;
+ }
+
+}
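
For orientation, AbstractFileChunk is driven by the extractor that owns it. A minimal sketch of one iteration of an extractor's chunking loop (a fragment, not runnable on its own: extractor, ingester, buffer, and bytesRead are assumed to exist in the caller):

    // index one buffer of extracted text as chunk N+1
    AbstractFileChunk chunk = new AbstractFileChunk(extractor, numChunks + 1);
    // index() throws IngesterException on failure, so the count only
    // advances for chunks that were actually accepted
    chunk.index(ingester, buffer, bytesRead, ByteContentStream.Encoding.UTF8);
    ++numChunks;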
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileExtract.java
new file mode 100644
index 0000000000..e1501a8d34
--- /dev/null
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileExtract.java
@@ -0,0 +1,48 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2012 Basis Technology Corp.
+ * Contact: carrier sleuthkit org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.sleuthkit.autopsy.keywordsearch;
+
+import org.sleuthkit.datamodel.AbstractFile;
+
+/**
+ * Common interface for utilities that extract text content from an AbstractFile
+ * and divide it into chunks
+ */
+interface AbstractFileExtract {
+
+ /**
+ * Get the number of chunks produced by extracting this AbstractFile
+ * @return the number of chunks produced
+ */
+ int getNumChunks();
+
+ /**
+ * Get the source file associated with this extraction
+ * @return the source AbstractFile
+ */
+ AbstractFile getSourceFile();
+
+ /**
+ * Index the AbstractFile
+ * @return true if indexed successfully, false otherwise
+ * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
+ */
+ boolean index() throws Ingester.IngesterException;
+}
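
The interface keeps extractors interchangeable; the two implementers added by this patch are AbstractFileStringExtract and AbstractFileTikaTextExtract. A skeletal third implementer, to show the contract (the name MyExtract and its body are illustrative only):

    import org.sleuthkit.datamodel.AbstractFile;

    class MyExtract implements AbstractFileExtract {

        private final AbstractFile sourceFile;
        private int numChunks = 0;

        MyExtract(AbstractFile sourceFile) {
            this.sourceFile = sourceFile;
        }

        @Override
        public int getNumChunks() {
            return numChunks; //meaningful only after index() has run
        }

        @Override
        public AbstractFile getSourceFile() {
            return sourceFile;
        }

        @Override
        public boolean index() throws Ingester.IngesterException {
            //extract content, divide it into chunks, index each chunk,
            //then index the parent document carrying the final chunk count
            return true;
        }
    }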
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringContentStream.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringContentStream.java
index 5f8e1f7cd5..d67543ecb0 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringContentStream.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringContentStream.java
@@ -24,8 +24,7 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.util.logging.Logger;
import org.apache.solr.common.util.ContentStream;
-import org.sleuthkit.autopsy.datamodel.AbstractFileStringStream;
-import org.sleuthkit.autopsy.datamodel.AbstractFileStringStream.Encoding;
+import org.sleuthkit.autopsy.keywordsearch.ByteContentStream.Encoding;
import org.sleuthkit.datamodel.AbstractContent;
import org.sleuthkit.datamodel.AbstractFile;
@@ -43,7 +42,7 @@ public class AbstractFileStringContentStream implements ContentStream {
private AbstractFileStringStream stream;
private static Logger logger = Logger.getLogger(AbstractFileStringContentStream.class.getName());
- public AbstractFileStringContentStream(AbstractFile content, Encoding encoding) {
+ public AbstractFileStringContentStream(AbstractFile content, ByteContentStream.Encoding encoding) {
this.content = content;
this.encoding = encoding;
this.stream = new AbstractFileStringStream(content, encoding);
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java
new file mode 100644
index 0000000000..72b30e49d7
--- /dev/null
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringExtract.java
@@ -0,0 +1,128 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2011 Basis Technology Corp.
+ * Contact: carrier sleuthkit org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
+import org.sleuthkit.datamodel.AbstractFile;
+
+
+/**
+ * Takes an AbstractFile, extracts strings from it, divides them into chunks of up to 1MB
+ * (associated with the original source file), and indexes each chunk as text with Solr
+ */
+class AbstractFileStringExtract implements AbstractFileExtract {
+
+ private KeywordSearchIngestService service;
+ private Ingester ingester;
+ private int numChunks;
+ private static final Logger logger = Logger.getLogger(AbstractFileStringExtract.class.getName());
+ static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
+ private AbstractFile aFile;
+ //single static buffer for all extractions. Safe, indexing can only happen in one thread
+ private static final byte[] STRING_CHUNK_BUF = new byte[(int) MAX_STRING_CHUNK_SIZE];
+ private static final int BOM_LEN = 3;
+
+ static {
+ //prepend UTF-8 BOM to start of the buffer
+ STRING_CHUNK_BUF[0] = (byte) 0xEF;
+ STRING_CHUNK_BUF[1] = (byte) 0xBB;
+ STRING_CHUNK_BUF[2] = (byte) 0xBF;
+ }
+
+ public AbstractFileStringExtract(AbstractFile aFile) {
+ this.aFile = aFile;
+ numChunks = 0; //unknown until indexing is done
+ this.service = KeywordSearchIngestService.getDefault();
+ Server solrServer = KeywordSearch.getServer();
+ ingester = solrServer.getIngester();
+ }
+
+ @Override
+ public int getNumChunks() {
+ return this.numChunks;
+ }
+
+ @Override
+ public AbstractFile getSourceFile() {
+ return aFile;
+ }
+
+ @Override
+ public boolean index() throws IngesterException {
+ boolean success = false;
+
+ //construct stream that extracts text as we read it
+ final InputStream stringStream = new AbstractFileStringStream(aFile, ByteContentStream.Encoding.UTF8);
+
+ try {
+ success = true;
+ //break input stream into chunks
+
+ long readSize = 0;
+ while ((readSize = stringStream.read(STRING_CHUNK_BUF, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
+ //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
+ //debug.write(STRING_CHUNK_BUF, 0, (int)readSize);
+
+ AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
+
+ try {
+ chunk.index(ingester, STRING_CHUNK_BUF, readSize + BOM_LEN, ByteContentStream.Encoding.UTF8);
+ ++this.numChunks;
+ } catch (IngesterException ingEx) {
+ success = false;
+ logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ingEx);
+ throw ingEx; //need to rethrow/return to signal error and move on
+ }
+
+ //check if we need to invoke commit/search between chunks,
+ //so as not to delay commit if the timer has gone off
+ service.checkRunCommitSearch();
+
+ //debug.close();
+ }
+
+
+ //after all chunks, ingest the parent file without content itself, and store numChunks
+ ingester.ingest(this);
+
+ } catch (IOException ex) {
+ logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + aFile.getName(), ex);
+ success = false;
+ } finally {
+ try {
+ stringStream.close();
+ } catch (IOException ex) {
+ logger.log(Level.WARNING, "Error closing input stream stream, file: " + aFile.getName(), ex);
+ }
+ }
+
+
+ return success;
+ }
+
+
+}
+
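
The subtle part above is the BOM bookkeeping: the UTF-8 BOM is written into the shared buffer once, every read lands at offset BOM_LEN so the BOM survives each chunk, and the size handed to the indexer is readSize + BOM_LEN. The same pattern in a self-contained toy (the input bytes and buffer size are illustrative):

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    public class BomChunkDemo {

        public static void main(String[] args) throws IOException {
            final int bomLen = 3;
            byte[] buf = new byte[16];
            //write the UTF-8 BOM once; it is never overwritten
            buf[0] = (byte) 0xEF;
            buf[1] = (byte) 0xBB;
            buf[2] = (byte) 0xBF;

            InputStream in = new ByteArrayInputStream("hello".getBytes("UTF-8"));
            //payload goes after the BOM
            int read = in.read(buf, bomLen, buf.length - bomLen);
            if (read != -1) {
                //total chunk size passed on to the indexer
                System.out.println("chunk size: " + (read + bomLen)); //prints 8
            }
        }
    }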
diff --git a/DataModel/src/org/sleuthkit/autopsy/datamodel/AbstractFileStringStream.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringStream.java
similarity index 96%
rename from DataModel/src/org/sleuthkit/autopsy/datamodel/AbstractFileStringStream.java
rename to KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringStream.java
index ec0a5330cc..a12ee984e5 100644
--- a/DataModel/src/org/sleuthkit/autopsy/datamodel/AbstractFileStringStream.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileStringStream.java
@@ -16,13 +16,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.sleuthkit.autopsy.datamodel;
+package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.logging.Level;
import java.util.logging.Logger;
+import org.sleuthkit.autopsy.datamodel.DataConversion;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskException;
@@ -33,16 +34,6 @@ import org.sleuthkit.datamodel.TskException;
*/
public class AbstractFileStringStream extends InputStream {
- public static enum Encoding {
-
- UTF8 {
-
- @Override
- public String toString() {
- return "UTF-8";
- }
- },
- };
//args
private AbstractFile content;
@@ -73,7 +64,7 @@ public class AbstractFileStringStream extends InputStream {
* @param encoding target encoding, currently UTF-8
* @param preserveOnBuffBoundary whether to preserve or split string on a buffer boundary. If false, will pack into read buffer up to max. possible, potentially splitting a string. If true, the string will be preserved for next read.
*/
- public AbstractFileStringStream(AbstractFile content, Encoding encoding, boolean preserveOnBuffBoundary) {
+ public AbstractFileStringStream(AbstractFile content, ByteContentStream.Encoding encoding, boolean preserveOnBuffBoundary) {
this.content = content;
this.encoding = encoding.toString();
//this.preserveOnBuffBoundary = preserveOnBuffBoundary;
@@ -87,7 +78,7 @@ public class AbstractFileStringStream extends InputStream {
* @param content to extract strings from
* @param encoding target encoding, currently UTF-8
*/
- public AbstractFileStringStream(AbstractFile content, Encoding encoding) {
+ public AbstractFileStringStream(AbstractFile content, ByteContentStream.Encoding encoding) {
this(content, encoding, false);
}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java
new file mode 100644
index 0000000000..df3ba7cb1a
--- /dev/null
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java
@@ -0,0 +1,149 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2012 Basis Technology Corp.
+ * Contact: carrier sleuthkit org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.sleuthkit.autopsy.ingest.IngestServiceAbstractFile;
+import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.ReadContentInputStream;
+import org.apache.tika.Tika;
+import org.sleuthkit.autopsy.keywordsearch.ByteContentStream.Encoding;
+
+/**
+ * Extractor of text from Tika-supported AbstractFile content. Extracted text is
+ * divided into chunks and indexed with Solr.
+ *
+ * This is especially useful for large content of a supported type that is to be
+ * divided into text chunks and indexed as such.
+ *
+ */
+public class AbstractFileTikaTextExtract implements AbstractFileExtract {
+
+ private static final Logger logger = Logger.getLogger(AbstractFileTikaTextExtract.class.getName());
+ private static final Encoding ENCODING = Encoding.UTF8;
+ static final Charset charset = Charset.forName(ENCODING.toString());
+ static final int MAX_EXTR_TEXT_CHUNK_SIZE = 1 * 1024 * 1024;
+ private static final char[] TEXT_CHUNK_BUF = new char[MAX_EXTR_TEXT_CHUNK_SIZE];
+ private static final Tika tika = new Tika();
+ private KeywordSearchIngestService service;
+ private Ingester ingester;
+ private AbstractFile sourceFile;
+ private int numChunks = 0;
+ private static final String UTF16BOM = "\uFEFF";
+
+ AbstractFileTikaTextExtract(AbstractFile sourceFile) {
+ this.sourceFile = sourceFile;
+ this.service = KeywordSearchIngestService.getDefault();
+ Server solrServer = KeywordSearch.getServer();
+ ingester = solrServer.getIngester();
+ }
+
+ @Override
+ public int getNumChunks() {
+ return numChunks;
+ }
+
+ @Override
+ public AbstractFile getSourceFile() {
+ return sourceFile;
+ }
+
+ @Override
+ public boolean index() throws Ingester.IngesterException {
+ boolean success = false;
+ Reader reader = null;
+ try {
+ success = true;
+ reader = tika.parse(new ReadContentInputStream(sourceFile));
+ long readSize;
+ while ((readSize = reader.read(TEXT_CHUNK_BUF, 0, MAX_EXTR_TEXT_CHUNK_SIZE)) != -1) {
+
+ //encode to bytes to index as byte stream
+ String extracted;
+ if (readSize < MAX_EXTR_TEXT_CHUNK_SIZE) {
+ //use only the portion of the buffer that was actually filled
+ StringBuilder sb = new StringBuilder((int) readSize + 5);
+ //inject BOM here (saves byte buffer realloc), will be converted to specific encoding BOM
+ sb.append(UTF16BOM);
+ sb.append(TEXT_CHUNK_BUF, 0, (int) readSize);
+ extracted = sb.toString();
+
+ } else {
+ StringBuilder sb = new StringBuilder((int) readSize + 5);
+ //inject BOM here (saves byte buffer realloc), will be converted to specific encoding BOM
+ sb.append(UTF16BOM);
+ sb.append(TEXT_CHUNK_BUF);
+ extracted = sb.toString();
+ }
+ //converts BOM automatically to charSet encoding
+ byte[] encodedBytes = extracted.getBytes(charset);
+
+
+ //PrintStream s = new PrintStream("c:\\temp\\ps.txt");
+ //for (byte b : encodedBytes) {
+ // s.format("%02x ", b);
+ //}
+ //s.close();
+
+ //debug
+ //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks + 1));
+ //debug.write(encodedBytes, 0, encodedBytes.length);
+ //debug.close();
+
+ AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
+
+ try {
+ chunk.index(ingester, encodedBytes, encodedBytes.length, ENCODING);
+ ++this.numChunks;
+ } catch (Ingester.IngesterException ingEx) {
+ success = false;
+ logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '"
+ + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);
+ throw ingEx; //need to rethrow/return to signal error and move on
+ }
+
+ //check if we need to invoke commit/search between chunks,
+ //so as not to delay commit if the timer has gone off
+ service.checkRunCommitSearch();
+
+ }
+
+ } catch (IOException ex) {
+ logger.log(Level.WARNING, "Unable to read content stream from " + sourceFile.getId(), ex);
+ } finally {
+ try {
+ reader.close();
+ } catch (IOException ex) {
+ logger.log(Level.WARNING, "Unable to close content stream from " + sourceFile.getId(), ex);
+ }
+ }
+
+ //after all chunks, ingest the parent file without content itself, and store numChunks
+ ingester.ingest(this);
+
+ return success;
+
+ }
+}
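
The Tika facade used here is deliberately small: Tika.parse(InputStream) auto-detects the document type and returns a Reader of extracted plain text, which is what makes the chunked read loop above possible. A minimal standalone use (the input file name is hypothetical; written in Java 6 style to match the module's javac.source):

    import java.io.FileInputStream;
    import java.io.Reader;
    import org.apache.tika.Tika;

    public class TikaTextDemo {

        public static void main(String[] args) throws Exception {
            Tika tika = new Tika();
            //parse() detects the type and streams back extracted plain text
            Reader reader = tika.parse(new FileInputStream("sample.docx"));
            try {
                char[] buf = new char[8192];
                int n;
                while ((n = reader.read(buf, 0, buf.length)) != -1) {
                    System.out.print(new String(buf, 0, n));
                }
            } finally {
                reader.close(); //also releases the underlying stream
            }
        }
    }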
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteContentStream.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteContentStream.java
index e0d23992cb..994ced04db 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteContentStream.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteContentStream.java
@@ -25,15 +25,32 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.util.logging.Logger;
import org.apache.solr.common.util.ContentStream;
-import org.sleuthkit.autopsy.datamodel.AbstractFileStringStream.Encoding;
import org.sleuthkit.datamodel.AbstractContent;
-import org.sleuthkit.datamodel.FsContent;
/**
* Stream of bytes representing string with specified encoding
* to feed into Solr as ContentStream
*/
public class ByteContentStream implements ContentStream {
+
+ public static enum Encoding {
+
+ UTF8 {
+
+ @Override
+ public String toString() {
+ return "UTF-8";
+ }
+ },
+ UTF16 {
+
+ @Override
+ public String toString() {
+ return "UTF-16";
+ }
+ },
+ };
+
//input
private byte[] content; //extracted subcontent
private long contentSize;
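
The toString() overrides return canonical charset names, so an Encoding value can be fed straight to Charset.forName, as AbstractFileTikaTextExtract does with its ENCODING constant. A fragment (assumes the patched ByteContentStream is on the classpath):

    //"UTF-8" and "UTF-16" are canonical Java charset names
    Charset cs = Charset.forName(ByteContentStream.Encoding.UTF8.toString());
    byte[] bytes = "abc".getBytes(cs); //3 bytes in UTF-8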
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileExtract.java
deleted file mode 100644
index ecb86aa97c..0000000000
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileExtract.java
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2011 Basis Technology Corp.
- * Contact: carrier sleuthkit org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-package org.sleuthkit.autopsy.keywordsearch;
-
-import java.io.IOException;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import org.sleuthkit.autopsy.datamodel.AbstractFileStringStream;
-import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
-import org.sleuthkit.datamodel.AbstractFile;
-
-
-/**
- * Utility to extract strings and index a file with string content as chunks
- * associated with the original parent file
- */
-class FileExtract {
-
- KeywordSearchIngestService service;
- private int numChunks;
- private static final Logger logger = Logger.getLogger(FileExtract.class.getName());
- static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
- private AbstractFile sourceFile;
-
- //single static buffer for all extractions. Safe, indexing can only happen in one thread
- private static final byte[] STRING_CHUNK_BUF = new byte[(int) MAX_STRING_CHUNK_SIZE];
- private static final int BOM_LEN = 3;
- static {
- //prepend UTF-8 BOM to start of the buffer
- STRING_CHUNK_BUF[0] = (byte)0xEF;
- STRING_CHUNK_BUF[1] = (byte)0xBB;
- STRING_CHUNK_BUF[2] = (byte)0xBF;
- }
-
- public FileExtract(KeywordSearchIngestService service, AbstractFile sourceFile) {
- this.service = service;
- this.sourceFile = sourceFile;
- numChunks = 0; //unknown until indexing is done
- }
-
- public int getNumChunks() {
- return this.numChunks;
- }
-
- public AbstractFile getSourceFile() {
- return sourceFile;
- }
-
-
- public boolean index(Ingester ingester) throws IngesterException {
- boolean success = false;
-
- AbstractFileStringStream stringStream = null;
- try {
- success = true;
- //break string into chunks
- //Note: could use DataConversion.toString() since we are operating on fixed chunks
- //but FsContentStringStream handles string boundary case better
- stringStream = new AbstractFileStringStream(sourceFile, AbstractFileStringStream.Encoding.UTF8);
- long readSize = 0;
-
- while ((readSize = stringStream.read(STRING_CHUNK_BUF, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
- //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
- //debug.write(STRING_CHUNK_BUF, 0, (int)readSize);
-
- FileExtractedChild chunk = new FileExtractedChild(this, this.numChunks + 1);
-
- try {
- chunk.index(ingester, STRING_CHUNK_BUF, readSize + BOM_LEN);
- ++this.numChunks;
- } catch (IngesterException ingEx) {
- success = false;
- logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);
- throw ingEx; //need to rethrow/return to signal error and move on
- }
-
- //check if need invoke commit/search between chunks
- //not to delay commit if timer has gone off
- service.checkRunCommitSearch();
-
- //debug.close();
- }
-
-
- //after all chunks, ingest the parent file without content itself, and store numChunks
- ingester.ingest(this);
-
- } catch (IOException ex) {
- logger.log(Level.WARNING, "Unable to read string stream and send to Solr, file: " + sourceFile.getName(), ex);
- success = false;
- } finally {
- if (stringStream != null) {
- try {
- stringStream.close();
- } catch (IOException ex) {
- logger.log(Level.WARNING, "Error closing string stream, file: " + sourceFile.getName(), ex);
- }
- }
- }
-
-
- return success;
- }
-}
-/**
- * Represents each string chunk to be indexed, a child of FileExtracted file
- */
-class FileExtractedChild {
-
- private int chunkID;
- private FileExtract parent;
-
- FileExtractedChild(FileExtract parent, int chunkID) {
- this.parent = parent;
- this.chunkID = chunkID;
- }
-
- public FileExtract getParentFile() {
- return parent;
- }
-
- public int getChunkId() {
- return chunkID;
- }
-
- /**
- * return String representation of the absolute id (parent and child)
- * @return
- */
- public String getIdString() {
- return getFileExtractChildId(this.parent.getSourceFile().getId(), this.chunkID);
- }
-
-
- public boolean index(Ingester ingester, byte[] content, long contentSize) throws IngesterException {
- boolean success = true;
- ByteContentStream bcs = new ByteContentStream(content, contentSize, parent.getSourceFile(), AbstractFileStringStream.Encoding.UTF8);
- try {
- ingester.ingest(this, bcs);
- //logger.log(Level.INFO, "Ingesting string chunk: " + this.getName() + ": " + chunkID);
-
- } catch (Exception ingEx) {
- success = false;
- throw new IngesterException("Problem ingesting file string chunk: " + parent.getSourceFile().getId() + ", chunk: " + chunkID, ingEx);
- }
- return success;
- }
-
- public static String getFileExtractChildId(long parentID, int childID) {
- return Long.toString(parentID) + Server.ID_CHUNK_SEP + Integer.toString(childID);
- }
-}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index d95a8f6649..2492288779 100755
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -73,6 +73,17 @@ public class Ingester {
"pst", "xml", "class", "dwg", "eml", "emlx", "mbox", "mht"};
+ private static Ingester instance;
+
+ private Ingester() {
+
+ }
+
+ public static synchronized Ingester getDefault() {
+ if (instance == null) {
+ instance = new Ingester();
+ }
+ return instance;
+ }
@Override
@SuppressWarnings("FinalizeDeclaration")
@@ -99,16 +110,16 @@ public class Ingester {
}
/**
- * Sends a FileExtract to Solr to have its content extracted and added to the
+ * Sends an AbstractFileExtract to Solr to have its content extracted and added to the
* index. commit() should be called once you're done ingesting files.
* FileExtract represents a parent of extracted file with actual content.
- * The parent itself has no content, only meta data and is used to associate the extracted FileExtractedChild
+ * The parent itself has no content, only metadata, and is used to associate the extracted AbstractFileChunk
*
- * @param fe FileExtract to ingest
+ * @param fe AbstractFileExtract to ingest
* @throws IngesterException if there was an error processing a specific
* file, but the Solr server is probably fine.
*/
- void ingest(FileExtract fe) throws IngesterException {
+ void ingest(AbstractFileExtract fe) throws IngesterException {
Map params = getContentFields(fe.getSourceFile());
params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(fe.getNumChunks()));
@@ -117,23 +128,23 @@ public class Ingester {
}
/**
- * Sends a FileExtractedChild to Solr and its extracted content stream to be added to the
+ * Sends an AbstractFileChunk and its extracted content stream to Solr to be added to the
* index. commit() should be called once you're done ingesting files.
- * FileExtractedChild represents a file chunk and its chunk content.
+ * AbstractFileChunk represents a file chunk and its chunk content.
*
- * @param fec FileExtractedChild to ingest
+ * @param fec AbstractFileChunk to ingest
* @throws IngesterException if there was an error processing a specific
* file, but the Solr server is probably fine.
*/
- void ingest(FileExtractedChild fec, ByteContentStream bcs) throws IngesterException {
+ void ingest(AbstractFileChunk fec, ByteContentStream bcs) throws IngesterException {
AbstractContent sourceContent = bcs.getSourceContent();
Map params = getContentFields(sourceContent);
//overwrite id with the chunk id
params.put(Server.Schema.ID.toString(),
- FileExtractedChild.getFileExtractChildId(sourceContent.getId(), fec.getChunkId()));
+ Server.getChunkIdString(sourceContent.getId(), fec.getChunkId()));
- ingest(bcs, params, FileExtract.MAX_STRING_CHUNK_SIZE);
+ ingest(bcs, params, AbstractFileStringExtract.MAX_STRING_CHUNK_SIZE);
}
/**
@@ -448,8 +459,9 @@ public class Ingester {
*/
static boolean isIngestible(AbstractFile aFile) {
TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
- if (! aType.equals(TSK_DB_FILES_TYPE_ENUM.FS) )
+ if (! aType.equals(TSK_DB_FILES_TYPE_ENUM.FS) ) {
return false;
+ }
FsContent fsContent = (FsContent) aFile;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java
index d3ff1b367b..5ad5c234bf 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestService.java
@@ -20,6 +20,7 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
+import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
@@ -40,7 +41,6 @@ import org.netbeans.api.progress.ProgressHandleFactory;
import org.openide.util.Cancellable;
import org.openide.util.Exceptions;
import org.sleuthkit.autopsy.casemodule.Case;
-import org.sleuthkit.autopsy.ingest.IngestManager;
import org.sleuthkit.autopsy.ingest.IngestManagerProxy;
import org.sleuthkit.autopsy.ingest.IngestMessage;
import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
@@ -95,7 +95,7 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
private final String hashDBServiceName = "Hash Lookup"; //NOTE this needs to match the HashDB service getName()
private SleuthkitCase caseHandle = null;
private boolean skipKnown = true;
- boolean initialized = false;
+ private boolean initialized = false;
private enum IngestStatus {
@@ -200,6 +200,7 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
managerProxy.postMessage(IngestMessage.createMessage(++messageID, MessageType.INFO, this, "Completed"));
}
+
//postSummary();
}
@@ -224,6 +225,7 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
runSearcher = false;
finalSearcherDone = true;
+
//commit uncommited files, don't search again
commit();
@@ -498,16 +500,27 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
private final Logger logger = Logger.getLogger(Indexer.class.getName());
- private boolean extractAndIngest(AbstractFile aFile) {
- boolean indexed = false;
- final FileExtract fe = new FileExtract(KeywordSearchIngestService.this, aFile);
- try {
- indexed = fe.index(ingester);
- } catch (IngesterException ex) {
- logger.log(Level.WARNING, "Error extracting strings and indexing file: " + aFile.getName(), ex);
- indexed = false;
+ /**
+ * Extract strings, or text with Tika (by streaming), from the file.
+ * Divide the file into chunks and index the chunks.
+ *
+ * @param aFile file to extract strings from, divide into chunks and
+ * index
+ * @param stringsOnly true to use string extraction, false to use the
+ * Tika text extractor
+ * @return true if the file was indexed, false otherwise
+ */
+ private boolean extractIndex(AbstractFile aFile, boolean stringsOnly) throws IngesterException {
+ AbstractFileExtract fileExtract;
+
+ if (stringsOnly) {
+ fileExtract = new AbstractFileStringExtract(aFile);
+ } else {
+ fileExtract = new AbstractFileTikaTextExtract(aFile);
}
- return indexed;
+
+ //divide into chunks and index
+ return fileExtract.index();
}
private void indexFile(AbstractFile aFile, boolean indexContent) {
@@ -537,9 +550,8 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
boolean ingestibleFile = Ingester.isIngestible(aFile);
final long size = aFile.getSize();
- //if fs file, limit size of entire file, do not limit strings
- if (fsContent != null && (size == 0 || (ingestibleFile && size > MAX_INDEX_SIZE))) {
- //if fs file, index meta only, otherwise if unalloc, skip
+ //if fs file with no content (size is 0), index meta-data only
+ if (fsContent != null && size == 0) {
try {
ingester.ingest(fsContent, false); //meta-data only
ingestStatus.put(aFile.getId(), IngestStatus.INGESTED_META);
@@ -548,15 +560,21 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
logger.log(Level.WARNING, "Unable to index meta-data for fsContent: " + fsContent.getId(), ex);
}
- return;
- }
-
- if (fsContent != null && ingestibleFile == true) {
- //we know it's an allocated fs file (FsContent) with supported content
+ } else if (fsContent != null && ingestibleFile == true) {
+ //we know it's an allocated fs file (FsContent) with supported content
+ //extract text with Tika, divide into chunks and index with Solr
try {
//logger.log(Level.INFO, "indexing: " + fsContent.getName());
- ingester.ingest(fsContent, true);
- ingestStatus.put(fsContent.getId(), IngestStatus.INGESTED);
+ //ingester.ingest(fsContent, true);
+ if (!extractIndex(aFile, false)) {
+ logger.log(Level.WARNING, "Failed to extract Tika text and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").");
+ ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
+
+ } else {
+ ingestStatus.put(aFile.getId(), IngestStatus.INGESTED);
+
+ }
+
} catch (IngesterException e) {
ingestStatus.put(fsContent.getId(), IngestStatus.SKIPPED);
//try to extract strings, if a file
@@ -578,13 +596,19 @@ public final class KeywordSearchIngestService implements IngestServiceAbstractFi
}
private boolean processNonIngestible(AbstractFile aFile) {
- if (!extractAndIngest(aFile)) {
- logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").");
+ try {
+ if (!extractIndex(aFile, true)) {
+ logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").");
+ ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
+ return false;
+ } else {
+ ingestStatus.put(aFile.getId(), IngestStatus.EXTRACTED_INGESTED);
+ return true;
+ }
+ } catch (IngesterException ex) {
+ logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex);
ingestStatus.put(aFile.getId(), IngestStatus.SKIPPED);
return false;
- } else {
- ingestStatus.put(aFile.getId(), IngestStatus.EXTRACTED_INGESTED);
- return true;
}
}
}
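
Taken together, the hunks above leave indexFile() with roughly this decision flow (a condensed, compilable sketch; every method below is a stub standing in for the service's real code, and the strings-on-Tika-failure fallback is inferred from the "//try to extract strings, if a file" comment in the catch path):

    public class IndexFlowSketch {

        static final int FS_EMPTY = 0, FS_SUPPORTED = 1, OTHER = 2;

        static void indexOneFile(int kind) throws Exception {
            if (kind == FS_EMPTY) {
                ingestMetadataOnly();        //fs file with size == 0
            } else if (kind == FS_SUPPORTED) {
                try {
                    extractIndex(false);     //Tika text extraction first
                } catch (Exception ingEx) {
                    extractIndex(true);      //fall back to raw string extraction
                }
            } else {
                extractIndex(true);          //non-ingestible: strings only
            }
        }

        static void ingestMetadataOnly() {}

        static void extractIndex(boolean stringsOnly) throws Exception {}

        public static void main(String[] args) throws Exception {
            indexOneFile(FS_SUPPORTED);
        }
    }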
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java
index c22ea8a945..f29f5a8150 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/LuceneQuery.java
@@ -326,7 +326,7 @@ public class LuceneQuery implements KeywordSearchQuery {
if (chunkID == 0) {
contentIDStr = Long.toString(contentID);
} else {
- contentIDStr = FileExtractedChild.getFileExtractChildId(contentID, chunkID);
+ contentIDStr = Server.getChunkIdString(contentID, chunkID);
}
String idQuery = Server.Schema.ID.toString() + ":" + contentIDStr;
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java
index 4b72868166..b107d682fb 100755
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java
@@ -60,74 +60,63 @@ class Server {
public static enum Schema {
ID {
-
@Override
public String toString() {
return "id";
}
},
CONTENT {
-
@Override
public String toString() {
return "content";
}
},
CONTENT_WS {
-
@Override
public String toString() {
return "content_ws";
}
},
FILE_NAME {
-
@Override
public String toString() {
return "file_name";
}
},
CTIME {
-
@Override
public String toString() {
return "ctime";
}
},
ATIME {
-
@Override
public String toString() {
return "atime";
}
},
MTIME {
-
@Override
public String toString() {
return "mtime";
}
},
CRTIME {
-
@Override
public String toString() {
return "crtime";
}
},
NUM_CHUNKS {
-
@Override
public String toString() {
return "num_chunks";
}
- },};
-
+ },
+ };
public static final String HL_ANALYZE_CHARS_UNLIMITED = "-1";
-
//max content size we can send to Solr
public static final long MAX_CONTENT_SIZE = 1L * 1024 * 1024 * 1024;
-
private static final Logger logger = Logger.getLogger(Server.class.getName());
private static final String DEFAULT_CORE_NAME = "coreCase";
// TODO: DEFAULT_CORE_NAME needs to be replaced with unique names to support multiple open cases
@@ -136,6 +125,8 @@ class Server {
private String javaPath = "java";
private static final int MAX_SOLR_MEM_MB = 512; //TODO set dynamically based on avail. system resources
private Process curSolrProcess = null;
+
+ private static Ingester ingester = null;
public enum CORE_EVT_STATES {
@@ -148,6 +139,7 @@ class Server {
/**
* New instance for the server at the given URL
+ *
* @param url should be something like "http://localhost:8983/solr/"
*/
Server(String url) {
@@ -220,7 +212,7 @@ class Server {
bw.newLine();
if (Version.getBuildType() == Version.Type.DEVELOPMENT) {
//flush buffers if dev version for debugging
- bw.flush();
+ bw.flush();
}
}
} catch (IOException ex) {
@@ -237,7 +229,7 @@ class Server {
void start() {
logger.log(Level.INFO, "Starting Solr server from: " + solrFolder.getAbsolutePath());
try {
- final String MAX_SOLR_MEM_MB_PAR = " -Xmx" + Integer.toString(MAX_SOLR_MEM_MB) + "m";
+ final String MAX_SOLR_MEM_MB_PAR = " -Xmx" + Integer.toString(MAX_SOLR_MEM_MB) + "m";
final String SOLR_START_CMD = javaPath + MAX_SOLR_MEM_MB_PAR + " -DSTOP.PORT=8079 -DSTOP.KEY=mysecret -jar start.jar";
logger.log(Level.INFO, "Starting Solr using: " + SOLR_START_CMD);
curSolrProcess = Runtime.getRuntime().exec(SOLR_START_CMD, null, solrFolder);
@@ -259,9 +251,8 @@ class Server {
/**
* Tries to stop a Solr instance.
- *
- * Waits for the stop command to finish
- * before returning.
+ *
+ * Waits for the stop command to finish before returning.
*/
synchronized void stop() {
try {
@@ -283,8 +274,11 @@ class Server {
}
/**
- * Tests if there's a Solr server running by sending it a core-status request.
- * @return false if the request failed with a connection error, otherwise true
+ * Tests if there's a Solr server running by sending it a core-status
+ * request.
+ *
+ * @return false if the request failed with a connection error, otherwise
+ * true
*/
synchronized boolean isRunning() {
@@ -311,7 +305,9 @@ class Server {
return true;
}
- /**** Convenience methods for use while we only open one case at a time ****/
+ /*
+ * Convenience methods for use while we only open one case at a time
+ */
private volatile Core currentCore = null;
synchronized void openCore() {
@@ -331,11 +327,14 @@ class Server {
serverAction.putValue(CORE_EVT, CORE_EVT_STATES.STOPPED);
}
- /**** end single-case specific methods ****/
+ /*
+ * end single-case specific methods
+ */
/**
* Open a core for the given case
+ *
* @param c
- * @return
+ * @return
*/
synchronized Core openCore(Case c) {
String sep = File.separator;
@@ -345,6 +344,7 @@ class Server {
/**
* commit current core if it exists
+ *
* @throws SolrServerException, NoOpenCoreException
*/
synchronized void commit() throws SolrServerException, NoOpenCoreException {
@@ -362,10 +362,12 @@ class Server {
}
/**
- * Execute query that gets only number of all Solr files indexed
- * without actually returning the files. The result does not include chunks, only number of actual files.
+ * Execute query that gets only number of all Solr files indexed without
+ * actually returning the files. The result does not include chunks, only
+ * number of actual files.
+ *
* @return int representing number of indexed files
- * @throws SolrServerException
+ * @throws SolrServerException
*/
public int queryNumIndexedFiles() throws SolrServerException, NoOpenCoreException {
if (currentCore == null) {
@@ -374,12 +376,13 @@ class Server {
return currentCore.queryNumIndexedFiles();
}
-
- /**
- * Execute query that gets only number of all Solr documents indexed (files and chunks)
- * without actually returning the documents
+
+ /**
+ * Execute query that gets only number of all Solr documents indexed (files
+ * and chunks) without actually returning the documents
+ *
* @return int representing number of indexed files (files and chunks)
- * @throws SolrServerException
+ * @throws SolrServerException
*/
public int queryNumIndexedDocuments() throws SolrServerException, NoOpenCoreException {
if (currentCore == null) {
@@ -391,6 +394,7 @@ class Server {
/**
* Return true if the file is indexed (either as a whole or as a chunk)
+ *
* @param contentID
* @return true if it is indexed
* @throws SolrServerException, NoOpenCoreException
@@ -405,9 +409,11 @@ class Server {
/**
* Execute query that gets number of indexed file chunks for a file
+ *
* @param fileID file id of the original file broken into chunks and indexed
- * @return int representing number of indexed file chunks, 0 if there is no chunks
- * @throws SolrServerException
+ * @return int representing number of indexed file chunks, 0 if there are
+ * no chunks
+ * @throws SolrServerException
*/
public int queryNumFileChunks(long fileID) throws SolrServerException, NoOpenCoreException {
if (currentCore == null) {
@@ -419,10 +425,11 @@ class Server {
/**
* Execute solr query
+ *
* @param sq query
* @return query response
* @throws SolrServerException
- * @throws NoOpenCoreException
+ * @throws NoOpenCoreException
*/
public QueryResponse query(SolrQuery sq) throws SolrServerException, NoOpenCoreException {
if (currentCore == null) {
@@ -433,11 +440,12 @@ class Server {
/**
* Execute solr query
+ *
* @param sq the query
* @param method http method to use
* @return query response
* @throws SolrServerException
- * @throws NoOpenCoreException
+ * @throws NoOpenCoreException
*/
public QueryResponse query(SolrQuery sq, SolrRequest.METHOD method) throws SolrServerException, NoOpenCoreException {
if (currentCore == null) {
@@ -448,10 +456,11 @@ class Server {
/**
* Execute Solr terms query
+ *
* @param sq the query
* @return terms response
* @throws SolrServerException
- * @throws NoOpenCoreException
+ * @throws NoOpenCoreException
*/
public TermsResponse queryTerms(SolrQuery sq) throws SolrServerException, NoOpenCoreException {
if (currentCore == null) {
@@ -462,10 +471,11 @@ class Server {
/**
* Execute Solr query to get content text
+ *
* @param content to get the text for
* @return content text string
* @throws SolrServerException
- * @throws NoOpenCoreException
+ * @throws NoOpenCoreException
*/
public String getSolrContent(final Content content) throws SolrServerException, NoOpenCoreException {
if (currentCore == null) {
@@ -473,14 +483,16 @@ class Server {
}
return currentCore.getSolrContent(content.getId(), 0);
}
-
+
/**
* Execute Solr query to get content text from content chunk
+ *
* @param content to get the text for
- * @param chunkID chunk number to query (starting at 1), or 0 if there is no chunks for that content
+ * @param chunkID chunk number to query (starting at 1), or 0 if there are
+ * no chunks for that content
* @return content text string
* @throws SolrServerException
- * @throws NoOpenCoreException
+ * @throws NoOpenCoreException
*/
public String getSolrContent(final Content content, int chunkID) throws SolrServerException, NoOpenCoreException {
if (currentCore == null) {
@@ -490,15 +502,28 @@ class Server {
}
/**
- * factory method to create ingester
- * @return ingester
+ * Get the shared ingester instance
+ *
+ * @return ingester instance
*/
- public Ingester getIngester() {
- return new Ingester();
+ public static Ingester getIngester() {
+ return Ingester.getDefault();
+ }
+
+ /**
+ * Given file parent id and child chunk ID, return the ID string of the chunk
+ * as stored in Solr, e.g. FILEID_CHUNKID
+ * @param parentID the parent file id (id of the source content)
+ * @param childID the child chunk id
+ * @return formatted string id
+ */
+ public static String getChunkIdString(long parentID, int childID) {
+ return Long.toString(parentID) + Server.ID_CHUNK_SEP + Integer.toString(childID);
}
/**
* Open a new core
+ *
* @param coreName name to refer to the core by in Solr
* @param dataDir directory to load/store the core data from/to
* @return new core
@@ -574,13 +599,13 @@ class Server {
}
}
-
- private String getSolrContent(long contentID, int chunkID) {
+ private String getSolrContent(long contentID, int chunkID) {
final SolrQuery q = new SolrQuery();
q.setQuery("*:*");
String filterQuery = Schema.ID.toString() + ":" + contentID;
- if (chunkID != 0)
+ if (chunkID != 0) {
filterQuery = filterQuery + Server.ID_CHUNK_SEP + chunkID;
+ }
q.addFilterQuery(filterQuery);
q.setFields(Schema.CONTENT.toString());
try {
@@ -602,11 +627,12 @@ class Server {
}
/**
- * Execute query that gets only number of all Solr files (not chunks) indexed
- * without actually returning the files
- *
- * @return int representing number of indexed files (entire files, not chunks)
- * @throws SolrServerException
+ * Execute query that gets only number of all Solr files (not chunks)
+ * indexed without actually returning the files
+ *
+ * @return int representing number of indexed files (entire files, not
+ * chunks)
+ * @throws SolrServerException
*/
private int queryNumIndexedFiles() throws SolrServerException {
SolrQuery q = new SolrQuery(Server.Schema.ID + ":*" + Server.ID_CHUNK_SEP + "*");
@@ -614,14 +640,15 @@ class Server {
int numChunks = (int) query(q).getResults().getNumFound();
return queryNumIndexedDocuments() - numChunks;
}
-
+
/**
* Execute query that gets only number of all Solr documents indexed
- * without actually returning the documents. Documents include entire indexed files
- * as well as chunks, which are treated as documents.
- *
- * @return int representing number of indexed documents (entire files and chunks)
- * @throws SolrServerException
+ * without actually returning the documents. Documents include entire
+ * indexed files as well as chunks, which are treated as documents.
+ *
+ * @return int representing number of indexed documents (entire files
+ * and chunks)
+ * @throws SolrServerException
*/
private int queryNumIndexedDocuments() throws SolrServerException {
SolrQuery q = new SolrQuery("*:*");
@@ -631,9 +658,10 @@ class Server {
/**
* Return true if the file is indexed (either as a whole or as a chunk)
+ *
* @param contentID
* @return true if it is indexed
- * @throws SolrServerException
+ * @throws SolrServerException
*/
private boolean queryIsIndexed(long contentID) throws SolrServerException {
SolrQuery q = new SolrQuery("*:*");
@@ -645,12 +673,15 @@ class Server {
/**
* Execute query that gets number of indexed file chunks for a file
- * @param contentID file id of the original file broken into chunks and indexed
- * @return int representing number of indexed file chunks, 0 if there is no chunks
- * @throws SolrServerException
+ *
+ * @param contentID file id of the original file broken into chunks and
+ * indexed
+ * @return int representing number of indexed file chunks, 0 if there are
+ * no chunks
+ * @throws SolrServerException
*/
private int queryNumFileChunks(long contentID) throws SolrServerException {
- final SolrQuery q =
+ final SolrQuery q =
new SolrQuery(Server.Schema.ID + ":" + Long.toString(contentID) + Server.ID_CHUNK_SEP + "*");
q.setRows(0);
return (int) query(q).getResults().getNumFound();
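
Chunk documents share the index with whole-file documents and are distinguished purely by id shape: parent content id, the ID_CHUNK_SEP separator (defined elsewhere in Server.java; assumed here to be "_"), then the 1-based chunk number. A sketch of the resulting format:

    public class ChunkIdDemo {

        public static void main(String[] args) {
            //mirrors Server.getChunkIdString(), assuming ID_CHUNK_SEP == "_"
            long parentID = 5;
            int childID = 12;
            String chunkId = Long.toString(parentID) + "_" + Integer.toString(childID);
            System.out.println(chunkId); //"5_12"
            //queryNumFileChunks() counts documents matching "5_*" on this format;
            //queryNumIndexedFiles() subtracts those matches from the total
        }
    }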
diff --git a/thunderbirdparser/nbproject/project.properties b/thunderbirdparser/nbproject/project.properties
index ba51ec0265..83174ac5f0 100644
--- a/thunderbirdparser/nbproject/project.properties
+++ b/thunderbirdparser/nbproject/project.properties
@@ -1,5 +1,3 @@
file.reference.commons-lang-2.4.jar=release/modules/ext/commons-lang-2.4.jar
-file.reference.tika-core-1.1.jar=release/modules/ext/tika-core-1.1.jar
-file.reference.tika-parsers-1.1.jar=release/modules/ext/tika-parsers-1.1.jar
javac.source=1.6
javac.compilerargs=-Xlint -Xlint:-serial
diff --git a/thunderbirdparser/nbproject/project.xml b/thunderbirdparser/nbproject/project.xml
index ea2c04cf47..63116d7987 100644
--- a/thunderbirdparser/nbproject/project.xml
+++ b/thunderbirdparser/nbproject/project.xml
@@ -53,17 +53,13 @@
- <runtime-relative-path>ext/tika-core-1.1.jar</runtime-relative-path>
- <binary-origin>release/modules/ext/tika-core-1.1.jar</binary-origin>
+ <runtime-relative-path>ext/tika-core-0.10.jar</runtime-relative-path>
+ <binary-origin>release/modules/ext/tika-core-0.10.jar</binary-origin>
<runtime-relative-path>ext/commons-lang-2.4.jar</runtime-relative-path>
<binary-origin>release/modules/ext/commons-lang-2.4.jar</binary-origin>
- <class-path-extension>
- <runtime-relative-path>ext/tika-parsers-1.1.jar</runtime-relative-path>
- <binary-origin>release/modules/ext/tika-parsers-1.1.jar</binary-origin>
- </class-path-extension>
diff --git a/thunderbirdparser/release/modules/ext/tika-core-0.10.jar b/thunderbirdparser/release/modules/ext/tika-core-0.10.jar
new file mode 100644
index 0000000000..78087e1381
Binary files /dev/null and b/thunderbirdparser/release/modules/ext/tika-core-0.10.jar differ
diff --git a/thunderbirdparser/release/modules/ext/tika-core-1.1.jar b/thunderbirdparser/release/modules/ext/tika-core-1.1.jar
deleted file mode 100644
index 7ad2be62be..0000000000
Binary files a/thunderbirdparser/release/modules/ext/tika-core-1.1.jar and /dev/null differ
diff --git a/thunderbirdparser/release/modules/ext/tika-parsers-1.1.jar b/thunderbirdparser/release/modules/ext/tika-parsers-1.1.jar
deleted file mode 100644
index ad82942ae3..0000000000
Binary files a/thunderbirdparser/release/modules/ext/tika-parsers-1.1.jar and /dev/null differ
diff --git a/thunderbirdparser/src/org/sleuthkit/autopsy/thunderbirdparser/ThunderbirdMetadata.java b/thunderbirdparser/src/org/sleuthkit/autopsy/thunderbirdparser/ThunderbirdMetadata.java
index 0ecb09215e..133e333140 100644
--- a/thunderbirdparser/src/org/sleuthkit/autopsy/thunderbirdparser/ThunderbirdMetadata.java
+++ b/thunderbirdparser/src/org/sleuthkit/autopsy/thunderbirdparser/ThunderbirdMetadata.java
@@ -17,7 +17,7 @@ import org.apache.tika.metadata.*;
* @author arivera
*/
public class ThunderbirdMetadata implements CreativeCommons, DublinCore, Geographic, HttpHeaders,
- IPTC, Message, MSOffice, ClimateForcast, TIFF, TikaMetadataKeys, TikaMimeKeys,
+ Message, MSOffice, ClimateForcast, TIFF, TikaMetadataKeys, TikaMimeKeys,
Serializable {
private int strArrCount = 0;