Merge in develop branch with text extraction refactoring

Richard Cordovano 2017-01-08 10:48:17 -05:00
commit be7bdced90
18 changed files with 1214 additions and 1892 deletions


@@ -1,91 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2016 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.nio.charset.Charset;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
/**
* A representation of a chunk of text from a file that can be used, when
* supplied with an Ingester, to index the chunk for search.
*/
final class AbstractFileChunk {
private final int chunkNumber;
private final TextExtractor textExtractor;
/**
* Constructs a representation of a chunk of text from a file that can be
* used, when supplied with an Ingester, to index the chunk for search.
*
* @param textExtractor A TextExtractor for the file.
* @param chunkNumber A sequence number for the chunk.
*/
AbstractFileChunk(TextExtractor textExtractor, int chunkNumber) {
this.textExtractor = textExtractor;
this.chunkNumber = chunkNumber;
}
/**
* Gets the TextExtractor for the source file of the text chunk.
*
* @return A reference to the TextExtractor.
*/
TextExtractor getTextExtractor() {
return textExtractor;
}
/**
* Gets the sequence number of the text chunk.
*
* @return The chunk number.
*/
int getChunkNumber() {
return chunkNumber;
}
/**
* Gets the id of the text chunk.
*
* @return An id of the form [source file object id]_[chunk number]
*/
String getChunkId() {
return Server.getChunkIdString(this.textExtractor.getSourceFile().getId(), this.chunkNumber);
}
/**
* Indexes the text chunk.
*
* @param ingester An Ingester to do the indexing.
* @param chunkBytes The raw bytes of the text chunk.
* @param chunkSize The size of the text chunk in bytes.
* @param charSet The char set to use during indexing.
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
void index(Ingester ingester, byte[] chunkBytes, long chunkSize, Charset charSet) throws IngesterException {
ByteContentStream bcs = new ByteContentStream(chunkBytes, chunkSize, textExtractor.getSourceFile(), charSet);
try {
ingester.ingest(this, bcs, chunkBytes.length);
} catch (Exception ex) {
throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", getChunkId()), ex);
}
}
}
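For context, chunk indexing was driven by the extractors themselves before this refactoring; the deleted HtmlTextExtractor later in this commit shows the pattern. A minimal sketch of that old call sequence (extractedText is a placeholder; this, ingester, numChunks, and outCharset come from the removed extractor code):
// Old pattern (pre-refactoring): the extractor numbered its own chunks and
// handed the encoded bytes to the Ingester via an AbstractFileChunk.
byte[] encodedBytes = extractedText.getBytes(outCharset);
AbstractFileChunk chunk = new AbstractFileChunk(this, numChunks + 1);
chunk.index(ingester, encodedBytes, encodedBytes.length, outCharset);
++numChunks;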


@@ -1,92 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2016 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import org.openide.util.NbBundle;
import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.datamodel.AbstractContent;
import org.sleuthkit.datamodel.AbstractFile;
/**
* Wrapper over InputStream that implements ContentStream to feed to Solr.
*/
class AbstractFileStringContentStream implements ContentStream {
//input
private final AbstractFile content;
private final Charset charset;
//converted
private final InputStream stream;
public AbstractFileStringContentStream(AbstractFile content, Charset charset, InputStream inputStream) {
this.content = content;
this.charset = charset;
this.stream = inputStream;
}
public AbstractContent getSourceContent() {
return content;
}
@Override
public String getContentType() {
return "text/plain;charset=" + charset.name(); //NON-NLS
}
@Override
public String getName() {
return content.getName();
}
@Override
public Reader getReader() throws IOException {
return new InputStreamReader(stream);
}
@Override
public Long getSize() {
//return convertedLength;
throw new UnsupportedOperationException(
NbBundle.getMessage(this.getClass(), "AbstractFileStringContentStream.getSize.exception.msg"));
}
@Override
public String getSourceInfo() {
return NbBundle.getMessage(this.getClass(), "AbstractFileStringContentStream.getSrcInfo.text", content.getId());
}
@Override
public InputStream getStream() throws IOException {
return stream;
}
@Override
protected void finalize() throws Throwable {
super.finalize();
stream.close();
}
}


@@ -1,213 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2012 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.List;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractResult;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskCoreException;
/**
* Wrapper over StringExtract to provide streaming API Given AbstractFile
* object, extract international strings from the file and read output as a
* stream of UTF-8 strings as encoded bytes.
*
*/
class AbstractFileStringIntStream extends InputStream {
private static final Logger logger = Logger.getLogger(AbstractFileStringIntStream.class.getName());
private static final int FILE_BUF_SIZE = 1024 * 1024;
private AbstractFile content;
private final byte[] oneCharBuf = new byte[1];
private final StringExtract stringExtractor;
private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
private long fileReadOffset = 0L;
private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user
private int convertBuffOffset = 0; //offset to start returning data to user on next read()
private int bytesInConvertBuff = 0; //amount of data currently in the buffer
private boolean fileEOF = false; //if file has more bytes to read
private boolean extractUTF8;
private boolean extractUTF16;
private Charset outCharset;
private StringExtractResult lastExtractResult;
/**
* Constructs new stream object that does conversion from file, to extracted
* strings, then to byte stream, for specified script, auto-detected
* encoding (UTF8, UTF16LE, UTF16BE), and specified output byte stream
* encoding
*
* @param content input content to process and turn into a stream to
* convert into strings
* @param scripts a list of scripts to consider
* @param extractUTF8 whether to extract utf8 encoding
* @param extractUTF16 whether to extract utf16 encoding
* @param outCharset encoding to use in the output byte stream
*/
public AbstractFileStringIntStream(AbstractFile content, List<SCRIPT> scripts, boolean extractUTF8,
boolean extractUTF16, Charset outCharset) {
this.content = content;
this.stringExtractor = new StringExtract();
this.stringExtractor.setEnabledScripts(scripts);
this.extractUTF8 = extractUTF8;
this.extractUTF16 = extractUTF16;
this.outCharset = outCharset;
this.stringExtractor.setEnableUTF8(extractUTF8);
this.stringExtractor.setEnableUTF16(extractUTF16);
}
@Override
public int read() throws IOException {
if (extractUTF8 == false && extractUTF16 == false) {
return -1;
}
final int read = read(oneCharBuf, 0, 1);
if (read == 1) {
return oneCharBuf[0];
} else {
return -1;
}
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
if (b == null) {
throw new NullPointerException();
} else if (off < 0 || len < 0 || len > b.length - off) {
throw new IndexOutOfBoundsException();
} else if (len == 0) {
return 0;
}
if (extractUTF8 == false && extractUTF16 == false) {
return -1;
}
long fileSize = content.getSize();
if (fileSize == 0) {
return -1;
}
//read and convert until user buffer full
//we have data if file can be read or when byteBuff has converted strings to return
int bytesToUser = 0; //returned to user so far
int offsetUser = off;
while (bytesToUser < len && offsetUser < len) {
//check if we have enough converted strings
int convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
try {
//convert more strings, store in buffer
long toRead = 0;
//int shiftSize = 0;
//if (lastExtractResult != null && lastExtractResult.getTextLength() != 0
// && (shiftSize = FILE_BUF_SIZE - lastExtractResult.getFirstUnprocessedOff()) > 0) {
////a string previously extracted
////shift the fileReadBuff past last bytes extracted
////read only what's needed to fill the buffer
////to avoid losing chars and breaking or corrupting potential strings - preserve byte stream continuity
//byte[] temp = new byte[shiftSize];
//System.arraycopy(fileReadBuff, lastExtractResult.getFirstUnprocessedOff(),
// temp, 0, shiftSize);
//System.arraycopy(temp, 0, fileReadBuff, 0, shiftSize);
//toRead = Math.min(lastExtractResult.getFirstUnprocessedOff(), fileSize - fileReadOffset);
//lastExtractResult = null;
//} else {
//fill up entire fileReadBuff fresh
toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
//}
int read = content.read(fileReadBuff, fileReadOffset, toRead);
if (read == -1 || read == 0) {
fileEOF = true;
} else {
fileReadOffset += read;
if (fileReadOffset >= fileSize) {
fileEOF = true;
}
//put converted string in convertBuff
convert(read);
convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
}
} catch (TskCoreException ex) {
//Exceptions.printStackTrace(ex);
fileEOF = true;
}
}
//nothing more to read, and no more bytes in convertBuff
if (convertBuff == null || convertBuffRemain == 0) {
if (fileEOF) {
return bytesToUser > 0 ? bytesToUser : -1;
} else {
//no strings extracted, try another read
continue;
}
}
//return part or all of convert buff to user
final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);
//DEBUG
/*
* if (toCopy > 0) { FileOutputStream debug = new
* FileOutputStream("c:\\temp\\" + content.getName(), true);
* debug.write(b, offsetUser, toCopy); debug.close(); }
*/
convertBuffOffset += toCopy;
offsetUser += toCopy;
bytesToUser += toCopy;
}
//if more string data in convertBuff, will be consumed on next read()
return bytesToUser;
}
/**
* convert bytes in file buffer to string, and encode string in
* convertBuffer
*
* @param numBytes num bytes in the fileReadBuff
*/
private void convert(int numBytes) {
lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
convertBuff = lastExtractResult.getText().getBytes(outCharset);
//reset tracking vars
if (lastExtractResult.getNumBytes() == 0) {
bytesInConvertBuff = 0;
} else {
bytesInConvertBuff = convertBuff.length;
}
convertBuffOffset = 0;
}
}
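The stream above was consumed like any other InputStream; a brief sketch of a caller reading the extracted strings as encoded bytes (the script list, buffer size, and file variable are illustrative assumptions, usual imports omitted):
// Illustrative use of the removed stream: read extracted strings as UTF-8 bytes.
List<SCRIPT> scripts = Arrays.asList(SCRIPT.LATIN_1); // script choice is an assumption
try (InputStream stringStream = new AbstractFileStringIntStream(
        file, scripts, true, true, StandardCharsets.UTF_8)) {
    byte[] buf = new byte[8192];
    int read;
    while ((read = stringStream.read(buf, 0, buf.length)) != -1) {
        // hand the encoded string bytes to the indexer
    }
}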


@@ -1,296 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2012 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskException;
/**
* AbstractFile input string stream reader/converter - given AbstractFile,
* extract strings from it and return encoded bytes via read()
*
* Note: the utility supports extraction of only LATIN script and UTF8, UTF16LE,
* UTF16BE encodings and uses a brute force encoding detection - it's fast but
* could apply multiple encodings on the same string.
*
* For other script/languages support and better encoding detection use
* AbstractFileStringIntStream streaming class, which wraps around StringExtract
* extractor.
*/
class AbstractFileStringStream extends InputStream {
//args
private AbstractFile content;
private Charset outputCharset;
//internal data
private static final Logger logger = Logger.getLogger(AbstractFileStringStream.class.getName());
private static final String NLS = Character.toString((char) 10); //new line
private static final int READ_BUF_SIZE = 256;
private long contentOffset = 0; //offset in fscontent read into curReadBuf
private final byte[] curReadBuf = new byte[READ_BUF_SIZE];
private int bytesInReadBuf = 0;
private int readBufOffset = 0; //offset in read buf processed
private StringBuilder curString = new StringBuilder();
private int curStringLen = 0;
private StringBuilder tempString = new StringBuilder();
private int tempStringLen = 0;
private boolean isEOF = false;
private boolean stringAtTempBoundary = false; //if temp has part of string that didn't make it in previous read()
private boolean stringAtBufBoundary = false; //if read buffer has string being processed, continue as string from prev read() in next read()
private boolean inString = false; //if current temp has min chars required
private final byte[] oneCharBuf = new byte[1];
private final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
/**
* Construct new string stream from FsContent
*
* @param content to extract strings from
* @param outputCharset target encoding to index as
* @param preserveOnBuffBoundary whether to preserve or split a string on a
* buffer boundary. If false, the stream packs the
* read buffer as full as possible, potentially
* splitting a string. If true, the string is
* preserved for the next read.
*/
public AbstractFileStringStream(AbstractFile content, Charset outputCharset, boolean preserveOnBuffBoundary) {
this.content = content;
this.outputCharset = outputCharset;
//this.preserveOnBuffBoundary = preserveOnBuffBoundary;
//logger.log(Level.INFO, "FILE: " + content.getParentPath() + "/" + content.getName());
}
/**
* Construct new string stream from FsContent. Does not attempt to fill the
* entire read buffer if that would break a string.
*
* @param content to extract strings from
* @param outCharset target charset to encode into bytes and index as, e.g.
* UTF-8
*/
public AbstractFileStringStream(AbstractFile content, Charset outCharset) {
this(content, outCharset, false);
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
if (b == null) {
throw new NullPointerException();
} else if (off < 0 || len < 0 || len > b.length - off) {
throw new IndexOutOfBoundsException();
} else if (len == 0) {
return 0;
}
long fileSize = content.getSize();
if (fileSize == 0) {
return -1;
}
if (isEOF) {
return -1;
}
if (stringAtTempBoundary) {
//append entire temp string residual from previous read()
//because qualified string was broken down into 2 parts
appendResetTemp();
stringAtTempBoundary = false;
//there could be more to this string in fscontent/buffer
}
boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
int newCurLen = curStringLen + tempStringLen;
while (newCurLen < len) {
//need to extract more strings
if (readBufOffset > bytesInReadBuf - 1) {
//no more bytes to process into strings, read them
try {
bytesInReadBuf = 0;
bytesInReadBuf = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);
} catch (TskException ex) {
if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
appendResetTemp();
//have some extracted string, return that, and fail next time
isEOF = true;
int copied = copyToReturn(b, off, len);
return copied;
} else {
return -1; //EOF
}
}
if (bytesInReadBuf < 1) {
if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
appendResetTemp();
//have some extracted string, return that, and fail next time
isEOF = true;
int copied = copyToReturn(b, off, len);
return copied;
} else {
return -1; //EOF
}
}
//increment content offset for next read
contentOffset += bytesInReadBuf;
//reset read buf position
readBufOffset = 0;
}
//get char from cur read buf
char c = (char) curReadBuf[readBufOffset++];
if (c == 0 && singleConsecZero == false) {
//preserve the current sequence if max consec. 1 zero char
singleConsecZero = true;
} else {
singleConsecZero = false;
}
if (StringExtract.isPrintableAscii(c)) {
tempString.append(c);
++tempStringLen;
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
inString = true;
}
//boundary case when temp has still chars - handled after the loop
} else if (!singleConsecZero) {
//break the string, clear temp
if (tempStringLen >= MIN_PRINTABLE_CHARS
|| stringAtBufBoundary) {
//append entire temp string with new line
tempString.append(NLS);
++tempStringLen;
curString.append(tempString);
curStringLen += tempStringLen;
stringAtBufBoundary = false;
}
//reset temp
tempString = new StringBuilder();
tempStringLen = 0;
}
newCurLen = curStringLen + tempStringLen;
}
//check if still in string state, so that next chars in read buf bypass min chars check
//and qualify as string even if less < min chars required
if (inString) {
inString = false; //reset
stringAtBufBoundary = true; //will bypass the check
}
//check if temp still has chars to qualify as a string
//we might need to break up temp into 2 parts for next read() call
//consume as many as possible to fill entire user buffer
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
if (newCurLen > len) {
int appendChars = len - curStringLen;
//save part for next user read(), need to break up temp string
//do not append new line
String toAppend = tempString.substring(0, appendChars);
String newTemp = tempString.substring(appendChars);
curString.append(toAppend);
curStringLen += appendChars;
tempString = new StringBuilder(newTemp);
tempStringLen = newTemp.length();
stringAtTempBoundary = true;
} else {
//append entire temp
curString.append(tempString);
curStringLen += tempStringLen;
//reset temp
tempString = new StringBuilder();
tempStringLen = 0;
}
} else {
//if temp has a few chars, not qualified as string for now,
//will be processed during next read() call
}
//copy current strings to user
final int copied = copyToReturn(b, off, len);
//there may be still chars in read buffer or tempString, for next read()
return copied;
}
//append temp buffer to cur string buffer and reset temp, if enough chars
//does not append new line
private void appendResetTemp() {
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
curString.append(tempString);
curStringLen += tempStringLen;
tempString = new StringBuilder();
tempStringLen = 0;
}
}
//copy currently extracted string to user buffer
//and reset for next read() call
private int copyToReturn(byte[] b, int off, long len) {
final String curStringS = curString.toString();
//logger.log(Level.INFO, curStringS);
byte[] stringBytes = curStringS.getBytes(outputCharset);
System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
//logger.log(Level.INFO, curStringS);
//copied all string, reset
curString = new StringBuilder();
int ret = curStringLen;
curStringLen = 0;
return ret;
}
@Override
public int read() throws IOException {
final int read = read(oneCharBuf, 0, 1);
if (read == 1) {
return oneCharBuf[0];
} else {
return -1;
}
}
@Override
public int available() throws IOException {
//we don't know how many bytes in curReadBuf may end up as strings
return 0;
}
@Override
public long skip(long n) throws IOException {
//use default implementation that reads into skip buffer
//but it could be more efficient
return super.skip(n);
}
}
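Like the class above, this removed reader was used as a plain InputStream; a minimal sketch using the two-argument constructor (the file variable and charset choice are illustrative):
// Illustrative: printable-ASCII string extraction with the removed stream.
try (InputStream asciiStrings = new AbstractFileStringStream(file, StandardCharsets.UTF_8)) {
    byte[] buf = new byte[4096];
    int read;
    while ((read = asciiStrings.read(buf, 0, buf.length)) != -1) {
        // newline-separated, encoded string bytes ready for indexing
    }
}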


@@ -0,0 +1,143 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2016 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.logging.Level;
import org.apache.commons.io.IOUtils;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;
/**
* Extracts text from artifacts by concatenating the values of all of the
* artifact's attributes.
*/
public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {
static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());
/**
* Get the Content that is the data source for the given artifact. //JMTODO:
* is there a preexisting method to do this?
*
* @param artifact
*
* @return The data source for the given artifact as a Content object, or
* null if it could not be found.
*
* @throws TskCoreException if there is a problem accessing the case db.
*/
static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {
Case currentCase;
try {
currentCase = Case.getCurrentCase();
} catch (IllegalStateException ignore) {
// thrown by Case.getCurrentCase() if currentCase is null
return null;
}
SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
if (sleuthkitCase == null) {
return null;
}
Content dataSource;
AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
if (abstractFile != null) {
dataSource = abstractFile.getDataSource();
} else {
dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
}
if (dataSource == null) {
return null;
}
return dataSource;
}
@Override
boolean isDisabled() {
return false;
}
@Override
InputStream getInputStream(BlackboardArtifact artifact) {
// Concatenate the string values of all attributes into a single
// "content" string to be indexed.
StringBuilder artifactContents = new StringBuilder();
try {
Content dataSource = getDataSource(artifact);
if (dataSource == null) {
return null;
}
for (BlackboardAttribute attribute : artifact.getAttributes()) {
artifactContents.append(attribute.getAttributeType().getDisplayName());
artifactContents.append(" : ");
// We have also discussed modifying BlackboardAttribute.getDisplayString()
// to magically format datetime attributes but that is complicated by
// the fact that BlackboardAttribute exists in Sleuthkit data model
// while the utility to determine the timezone to use is in ContentUtils
// in the Autopsy datamodel.
switch (attribute.getValueType()) {
case DATETIME:
artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
break;
default:
artifactContents.append(attribute.getDisplayString());
}
artifactContents.append(System.lineSeparator());
}
} catch (TskCoreException ex) {
logger.log(Level.SEVERE, "There was a problem getting the atributes for artifact " + artifact.getArtifactID(), ex);
return null;
}
if (artifactContents.length() == 0) {
return null;
}
return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
}
@Override
Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
return new InputStreamReader(stream, StandardCharsets.UTF_8);
}
@Override
long getID(BlackboardArtifact source) {
return source.getArtifactID();
}
@Override
String getName(BlackboardArtifact source) {
return source.getDisplayName() + "_" + source.getArtifactID();
}
}
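In the refactored design an extractor no longer pushes chunks to Solr itself; the Ingester drives it through the new indexText() method added later in this commit. A hedged sketch of how an ingest module might use this class (the artifact, context, and logger variables are assumed):
// Sketch: index the concatenated attribute text of a blackboard artifact.
ArtifactTextExtractor extractor = new ArtifactTextExtractor();
try {
    Ingester.getDefault().indexText(extractor, artifact, context);
} catch (Ingester.IngesterException ex) {
    logger.log(Level.WARNING, "Failed to index artifact " + artifact.getArtifactID(), ex);
}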


@@ -1,102 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.datamodel.AbstractContent;
/**
* Stream of bytes representing string with specified encoding to feed into Solr
* as ContentStream
*/
class ByteContentStream implements ContentStream {
//input
private byte[] content; //extracted subcontent
private long contentSize;
private AbstractContent aContent; //origin
private Charset charset; //output byte stream charset of encoded strings
private InputStream stream;
private static Logger logger = Logger.getLogger(ByteContentStream.class.getName());
public ByteContentStream(byte[] content, long contentSize, AbstractContent aContent, Charset charset) {
this.content = content;
this.aContent = aContent;
this.charset = charset;
stream = new ByteArrayInputStream(content, 0, (int) contentSize);
}
public byte[] getByteContent() {
return content;
}
public AbstractContent getSourceContent() {
return aContent;
}
@Override
public String getContentType() {
return "text/plain;charset=" + charset.name(); //NON-NLS
}
@Override
public String getName() {
return aContent.getName();
}
@Override
public Reader getReader() throws IOException {
return new InputStreamReader(stream);
}
@Override
public Long getSize() {
return contentSize;
}
@Override
public String getSourceInfo() {
return NbBundle.getMessage(this.getClass(), "ByteContentStream.getSrcInfo.text", aContent.getId());
}
@Override
public InputStream getStream() throws IOException {
return stream;
}
@Override
protected void finalize() throws Throwable {
super.finalize();
stream.close();
}
}


@@ -0,0 +1,112 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2016 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.InputStream;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;
import org.sleuthkit.datamodel.AbstractFile;
/**
* Common methods for utilities that extract text and content and divide into
* chunks
*/
abstract class FileTextExtractor extends TextExtractor< AbstractFile> {
static final List<String> BLOB_MIME_TYPES
= Arrays.asList(
//ignore binary blob data, for which string extraction will be used
"application/octet-stream", //NON-NLS
"application/x-msdownload"); //NON-NLS
/** generally text extractors should ignore archives and let unpacking
* modules take care of them */
static final List<String> ARCHIVE_MIME_TYPES
= Arrays.asList(
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
"application/x-7z-compressed", //NON-NLS
"application/x-ace-compressed", //NON-NLS
"application/x-alz-compressed", //NON-NLS
"application/x-arj", //NON-NLS
"application/vnd.ms-cab-compressed", //NON-NLS
"application/x-cfs-compressed", //NON-NLS
"application/x-dgc-compressed", //NON-NLS
"application/x-apple-diskimage", //NON-NLS
"application/x-gca-compressed", //NON-NLS
"application/x-dar", //NON-NLS
"application/x-lzx", //NON-NLS
"application/x-lzh", //NON-NLS
"application/x-rar-compressed", //NON-NLS
"application/x-stuffit", //NON-NLS
"application/x-stuffitx", //NON-NLS
"application/x-gtar", //NON-NLS
"application/x-archive", //NON-NLS
"application/x-executable", //NON-NLS
"application/x-gzip", //NON-NLS
"application/zip", //NON-NLS
"application/x-zoo", //NON-NLS
"application/x-cpio", //NON-NLS
"application/x-shar", //NON-NLS
"application/x-tar", //NON-NLS
"application/x-bzip", //NON-NLS
"application/x-bzip2", //NON-NLS
"application/x-lzip", //NON-NLS
"application/x-lzma", //NON-NLS
"application/x-lzop", //NON-NLS
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
/**
* Determines whether the extractor works only for specific content types
* (see isSupported()) or is a generic content extractor (such as the
* strings extractor).
*
* @return True if the extractor is limited to specific content types, false otherwise.
*/
abstract boolean isContentTypeSpecific();
/**
* Determines if the file content is supported by the extractor if
* isContentTypeSpecific() returns true.
*
* @param file to test if its content should be supported
* @param detectedFormat mime-type with detected format (such as text/plain)
* or null if not detected
*
* @return true if the file content is supported, false otherwise
*/
abstract boolean isSupported(AbstractFile file, String detectedFormat);
@Override
abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;
@Override
long getID(AbstractFile source) {
return source.getId();
}
@Override
String getName(AbstractFile source) {
return source.getName();
}
}
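A hedged sketch of how a caller might dispatch among concrete FileTextExtractor subclasses (this commit's HtmlTextExtractor is one) before handing the file to the Ingester; the extractor list and the file, detectedFormat, context, and logger variables are illustrative assumptions:
// Sketch: use the first type-specific extractor that supports the detected MIME type.
List<FileTextExtractor> fileExtractors = Arrays.asList(new HtmlTextExtractor() /* , other subclasses */);
for (FileTextExtractor extractor : fileExtractors) {
    if (extractor.isContentTypeSpecific() && extractor.isSupported(file, detectedFormat)) {
        try {
            Ingester.getDefault().indexText(extractor, file, context);
        } catch (Ingester.IngesterException ex) {
            logger.log(Level.WARNING, "Failed to index " + file.getName(), ex);
        }
        break;
    }
}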


@@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2012-2013 Basis Technology Corp.
* Copyright 2011-2016 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,36 +21,23 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
/**
* Extractor of text from HTML supported AbstractFile content. Extracted text is
* divided into chunks and indexed with Solr. If HTML extraction succeeds,
* chunks are indexed with Solr.
* Extracts text from AbstractFile HTML content.
*/
class HtmlTextExtractor implements TextExtractor {
class HtmlTextExtractor extends FileTextExtractor {
private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
private static Ingester ingester;
static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET;
static final int MAX_EXTR_TEXT_CHARS = 31 * 1024;
private static final int SINGLE_READ_CHARS = 1024;
private static final int EXTRA_CHARS = 128; //for whitespace
private static final int MAX_SIZE = 50000000;
//private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
private AbstractFile sourceFile;
private int numChunks = 0;
private static final int MAX_SIZE = 50_000_000; //50MB
static final List<String> WEB_MIME_TYPES = Arrays.asList(
"application/javascript", //NON-NLS
@@ -59,170 +46,124 @@ class HtmlTextExtractor implements TextExtractor {
"text/css", //NON-NLS
"text/html", //NON-NLS NON-NLS
"text/javascript" //NON-NLS
//"application/xml",
//"application/xml-dtd",
);
HtmlTextExtractor() {
ingester = Ingester.getDefault();
}
@Override
public boolean setScripts(List<SCRIPT> extractScripts) {
return false;
}
@Override
public List<SCRIPT> getScripts() {
return null;
}
@Override
public Map<String, String> getOptions() {
return null;
}
@Override
public void setOptions(Map<String, String> options) {
}
@Override
public int getNumChunks() {
return numChunks;
}
@Override
public AbstractFile getSourceFile() {
return sourceFile;
}
@Override
public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
this.sourceFile = sourceFile;
numChunks = 0; //unknown until indexing is done
boolean success = false;
Reader reader = null;
final InputStream stream = new ReadContentInputStream(sourceFile);
try {
// Parse the stream with Jericho
JerichoParserWrapper jpw = new JerichoParserWrapper(stream);
jpw.parse();
reader = jpw.getReader();
// In case there is an exception or parse() isn't called
if (reader == null) {
logger.log(Level.WARNING, "No reader available from HTML parser"); //NON-NLS
return false;
}
success = true;
long readSize;
long totalRead = 0;
boolean eof = false;
//we read max 1024 chars at a time; this seems to be the max this Reader will return
while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
if (context.fileIngestIsCancelled()) {
ingester.ingest(this);
return true;
}
totalRead += readSize;
//consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
&& (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
totalRead += readSize;
}
if (readSize == -1) {
//this is the last chunk
eof = true;
} else {
//try to read until whitespace to not break words
while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
&& !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
&& (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
totalRead += readSize;
}
if (readSize == -1) {
//this is the last chunk
eof = true;
}
}
//logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());
//encode to bytes to index as byte stream
String extracted;
//add BOM and trim the 0 bytes
//set initial size to chars read + bom - try to prevent from resizing
StringBuilder sb = new StringBuilder((int) totalRead + 1000);
//inject BOM here (saves byte buffer realloc later), will be converted to specific encoding BOM
//sb.append(UTF16BOM); disabled BOM, not needing as bypassing Tika
if (totalRead < MAX_EXTR_TEXT_CHARS) {
sb.append(textChunkBuf, 0, (int) totalRead);
} else {
sb.append(textChunkBuf);
}
//reset for next chunk
totalRead = 0;
extracted = sb.toString();
//converts BOM automatically to charSet encoding
byte[] encodedBytes = extracted.getBytes(outCharset);
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
try {
chunk.index(ingester, encodedBytes, encodedBytes.length, outCharset);
++this.numChunks;
} catch (Ingester.IngesterException ingEx) {
success = false;
logger.log(Level.WARNING, "Ingester had a problem with extracted HTML from file '" //NON-NLS
+ sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
throw ingEx; //need to rethrow/return to signal error and move on
}
}
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
success = false;
} catch (Exception ex) {
logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
success = false;
} finally {
try {
stream.close();
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to close content stream from " + sourceFile.getId(), ex); //NON-NLS
}
try {
if (reader != null) {
reader.close();
}
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
}
}
//after all chunks, ingest the parent file without content itself, and store numChunks
ingester.ingest(this);
return success;
}
@Override
public boolean isContentTypeSpecific() {
boolean isContentTypeSpecific() {
return true;
}
@Override
public boolean isSupported(AbstractFile file, String detectedFormat) {
if (detectedFormat == null) {
return false;
} else if (WEB_MIME_TYPES.contains(detectedFormat) && file.getSize() <= MAX_SIZE) {
return true;
} else {
return false;
boolean isSupported(AbstractFile file, String detectedFormat) {
return detectedFormat != null
&& WEB_MIME_TYPES.contains(detectedFormat)
&& file.getSize() <= MAX_SIZE;
}
@Override
Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
//Parse the stream with Jericho and put the results in a Reader
try {
StringBuilder scripts = new StringBuilder();
StringBuilder links = new StringBuilder();
StringBuilder images = new StringBuilder();
StringBuilder comments = new StringBuilder();
StringBuilder others = new StringBuilder();
int numScripts = 0;
int numLinks = 0;
int numImages = 0;
int numComments = 0;
int numOthers = 0;
Source source = new Source(in);
source.fullSequentialParse();
Renderer renderer = source.getRenderer();
renderer.setNewLine("\n");
renderer.setIncludeHyperlinkURLs(false);
renderer.setDecorateFontStyles(false);
renderer.setIncludeAlternateText(false);
String text = renderer.toString();
// Get all the tags in the source
List<StartTag> tags = source.getAllStartTags();
StringBuilder stringBuilder = new StringBuilder();
for (StartTag tag : tags) {
if (tag.getName().equals("script")) { //NON-NLS
// If the <script> tag has attributes
numScripts++;
scripts.append(numScripts).append(") ");
if (tag.getTagContent().length() > 0) {
scripts.append(tag.getTagContent()).append(" ");
}
// Get whats between the <script> .. </script> tags
scripts.append(tag.getElement().getContent()).append("\n");
} else if (tag.getName().equals("a")) {
//NON-NLS
numLinks++;
links.append(numLinks).append(") ");
links.append(tag.getTagContent()).append("\n");
} else if (tag.getName().equals("img")) {
//NON-NLS
numImages++;
images.append(numImages).append(") ");
images.append(tag.getTagContent()).append("\n");
} else if (tag.getTagType().equals(StartTagType.COMMENT)) {
numComments++;
comments.append(numComments).append(") ");
comments.append(tag.getTagContent()).append("\n");
} else {
// Make sure it has an attribute
Attributes atts = tag.getAttributes();
if (atts != null && atts.length() > 0) {
numOthers++;
others.append(numOthers).append(") ");
others.append(tag.getName()).append(":");
others.append(tag.getTagContent()).append("\n");
}
}
}
stringBuilder.append(text).append("\n\n");
stringBuilder.append("----------NONVISIBLE TEXT----------\n\n"); //NON-NLS
if (numScripts > 0) {
stringBuilder.append("---Scripts---\n"); //NON-NLS
stringBuilder.append(scripts).append("\n");
}
if (numLinks > 0) {
stringBuilder.append("---Links---\n"); //NON-NLS
stringBuilder.append(links).append("\n");
}
if (numImages > 0) {
stringBuilder.append("---Images---\n"); //NON-NLS
stringBuilder.append(images).append("\n");
}
if (numComments > 0) {
stringBuilder.append("---Comments---\n"); //NON-NLS
stringBuilder.append(comments).append("\n");
}
if (numOthers > 0) {
stringBuilder.append("---Others---\n"); //NON-NLS
stringBuilder.append(others).append("\n");
}
// All done, now make it a reader
return new StringReader(stringBuilder.toString());
} catch (IOException ex) {
throw new Ingester.IngesterException("Error extracting HTML from content.", ex);
}
}
@Override
InputStream getInputStream(AbstractFile sourceFile1) {
return new ReadContentInputStream(sourceFile1);
}
@Override
boolean isDisabled() {
return false;
}
}


@@ -18,49 +18,45 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.SolrInputDocument;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.TextUtil;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractContent;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ContentVisitor;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.DerivedFile;
import org.sleuthkit.datamodel.Directory;
import org.sleuthkit.datamodel.File;
import org.sleuthkit.datamodel.LayoutFile;
import org.sleuthkit.datamodel.LocalFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.SlackFile;
import org.sleuthkit.datamodel.SleuthkitItemVisitor;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
import org.sleuthkit.datamodel.TskCoreException;
/**
* Handles indexing files on a Solr core.
*/
//JMTODO: Should this class really be a singleton?
class Ingester {
private static final Logger logger = Logger.getLogger(Ingester.class.getName());
private volatile boolean uncommitedIngests = false;
private final Server solrServer = KeywordSearch.getServer();
private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV();
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
private static Ingester instance;
//for ingesting chunk as SolrInputDocument (non-content-streaming, by-pass tika)
//TODO use a streaming way to add content to /update handler
private static final int MAX_DOC_CHUNK_SIZE = 32 * 1024;
private static final String ENCODING = "UTF-8"; //NON-NLS
private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
private static final int SINGLE_READ_CHARS = 1024;
private static final int EXTRA_CHARS = 128;
private Ingester() {
}
@@ -72,6 +68,7 @@ class Ingester {
return instance;
}
//JMTODO: this is probably useless
@Override
@SuppressWarnings("FinalizeDeclaration")
protected void finalize() throws Throwable {
@@ -84,123 +81,68 @@
}
/**
* Sends a stream to Solr to have its content extracted and added to the
* index. commit() should be called once you're done ingesting files.
* Sends the metadata (name, MAC times, image id, etc) for the given file to
* Solr to be added to the index. commit() should be called once you're done
* indexing.
*
* @param afscs File AbstractFileStringContentStream to ingest
* @param file File to index.
*
* @throws IngesterException if there was an error processing a specific
* file, but the Solr server is probably fine.
*/
void ingest(AbstractFileStringContentStream afscs) throws IngesterException {
Map<String, String> params = getContentFields(afscs.getSourceContent());
ingest(afscs, params, afscs.getSourceContent().getSize());
void indexMetaDataOnly(AbstractFile file) throws IngesterException {
indexChunk("", file.getName(), getContentFields(file));
}
/**
* Sends a TextExtractor to Solr to have its content extracted and added to
* the index. commit() should be called once you're done ingesting files.
* FileExtract represents a parent of extracted file with actual content.
* The parent itself has no content, only meta data and is used to associate
* the extracted AbstractFileChunk
* Sends the metadata (artifact id, image id, etc) for the given artifact to
* Solr to be added to the index. commit() should be called once you're done
* indexing.
*
* @param fe TextExtractor to ingest
* @param artifact The artifact to index.
*
* @throws IngesterException if there was an error processing a specific
* file, but the Solr server is probably fine.
* artifact, but the Solr server is probably fine.
*/
void ingest(TextExtractor fe) throws IngesterException {
Map<String, String> params = getContentFields(fe.getSourceFile());
params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(fe.getNumChunks()));
ingest(new NullContentStream(fe.getSourceFile()), params, 0);
void indexMetaDataOnly(BlackboardArtifact artifact) throws IngesterException {
indexChunk("", new ArtifactTextExtractor().getName(artifact), getContentFields(artifact));
}
/**
* Sends a AbstractFileChunk to Solr and its extracted content stream to be
* added to the index. commit() should be called once you're done ingesting
* files. AbstractFileChunk represents a file chunk and its chunk content.
* Creates a field map from a SleuthkitVisitableItem, that is later sent to
* Solr.
*
* @param fec AbstractFileChunk to ingest
* @param size approx. size of the stream in bytes, used for timeout
* estimation
* @param item SleuthkitVisitableItem to get fields from
*
* @throws IngesterException if there was an error processing a specific
* file, but the Solr server is probably fine.
* @return the map from field name to value (as a string)
*/
void ingest(AbstractFileChunk fec, ByteContentStream bcs, int size) throws IngesterException {
AbstractContent sourceContent = bcs.getSourceContent();
Map<String, String> params = getContentFields(sourceContent);
//overwrite id with the chunk id
params.put(Server.Schema.ID.toString(),
Server.getChunkIdString(sourceContent.getId(), fec.getChunkNumber()));
ingest(bcs, params, size);
private Map<String, String> getContentFields(SleuthkitVisitableItem item) {
return item.accept(SOLR_FIELDS_VISITOR);
}
/**
* Sends a file to Solr to have its content extracted and added to the
* index. commit() should be called once you're done ingesting files. If the
* file is a directory or ingestContent is set to false, the file name is
* indexed only.
*
* @param file File to ingest
* @param ingestContent if true, index the file and the content, otherwise
* index metadata only
*
* @throws IngesterException if there was an error processing a specific
* file, but the Solr server is probably fine.
* Visitor used to create fields to send to SOLR index.
*/
void ingest(AbstractFile file, boolean ingestContent) throws IngesterException {
if (ingestContent == false || file.isDir()) {
ingest(new NullContentStream(file), getContentFields(file), 0);
} else {
ingest(new FscContentStream(file), getContentFields(file), file.getSize());
}
}
/**
* Creates a field map from FsContent, that is later sent to Solr
*
* @param fsc FsContent to get fields from
*
* @return the map
*/
private Map<String, String> getContentFields(AbstractContent fsc) {
return fsc.accept(getContentFieldsV);
}
/**
* Visitor used to create param list to send to SOLR index.
*/
private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
static private class SolrFieldsVisitor extends SleuthkitItemVisitor.Default<Map<String, String>> {
@Override
protected Map<String, String> defaultVisit(Content cntnt) {
protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
return new HashMap<>();
}
@Override
public Map<String, String> visit(File f) {
Map<String, String> params = getCommonFields(f);
getCommonFileContentFields(params, f);
return params;
return getCommonAndMACTimeFields(f);
}
@Override
public Map<String, String> visit(DerivedFile df) {
Map<String, String> params = getCommonFields(df);
getCommonFileContentFields(params, df);
return params;
return getCommonAndMACTimeFields(df);
}
@Override
public Map<String, String> visit(Directory d) {
Map<String, String> params = getCommonFields(d);
getCommonFileContentFields(params, d);
return params;
return getCommonAndMACTimeFields(d);
}
@Override
@@ -211,19 +153,25 @@ class Ingester {
@Override
public Map<String, String> visit(LocalFile lf) {
Map<String, String> params = getCommonFields(lf);
getCommonFileContentFields(params, lf);
return params;
return getCommonAndMACTimeFields(lf);
}
@Override
public Map<String, String> visit(SlackFile f) {
Map<String, String> params = getCommonFields(f);
getCommonFileContentFields(params, f);
return params;
return getCommonAndMACTimeFields(f);
}
private Map<String, String> getCommonFileContentFields(Map<String, String> params, AbstractFile file) {
/**
* Get the field map for AbstractFiles that includes MAC times and the
* fields that are common to all file classes.
*
* @param file The file to get fields for
*
* @return The field map, including MAC times and common fields, for the
* given file.
*/
private Map<String, String> getCommonAndMACTimeFields(AbstractFile file) {
Map<String, String> params = getCommonFields(file);
params.put(Server.Schema.CTIME.toString(), ContentUtils.getStringTimeISO8601(file.getCtime(), file));
params.put(Server.Schema.ATIME.toString(), ContentUtils.getStringTimeISO8601(file.getAtime(), file));
params.put(Server.Schema.MTIME.toString(), ContentUtils.getStringTimeISO8601(file.getMtime(), file));
@@ -231,140 +179,219 @@
return params;
}
/**
* Get the field map for AbstractFiles that is common to all file
* classes.
*
* @param af The file to get fields for
*
* @return The field map of fields that are common to all file classes.
*/
private Map<String, String> getCommonFields(AbstractFile af) {
Map<String, String> params = new HashMap<>();
params.put(Server.Schema.ID.toString(), Long.toString(af.getId()));
try {
long dataSourceId = af.getDataSource().getId();
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSourceId));
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(af.getDataSource().getId()));
} catch (TskCoreException ex) {
logger.log(Level.SEVERE, "Could not get data source id to properly index the file {0}", af.getId()); //NON-NLS
logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + af.getId(), ex); //NON-NLS
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
}
params.put(Server.Schema.FILE_NAME.toString(), af.getName());
return params;
}
/**
* Get the field map for artifacts.
*
* @param artifact The artifact to get fields for.
*
* @return The field map for the given artifact.
*/
@Override
public Map<String, String> visit(BlackboardArtifact artifact) {
Map<String, String> params = new HashMap<>();
params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
try {
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(ArtifactTextExtractor.getDataSource(artifact).getId()));
} catch (TskCoreException ex) {
logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
}
return params;
}
}
/**
* Indexing method that bypasses Tika, assumes pure text It reads and
* converts the entire content stream to string, assuming UTF8 since we
* can't use streaming approach for Solr /update handler. This should be
* safe, since all content is now in max 1MB chunks.
* Use the given TextExtractor to extract text from the given source. The
* text will be chunked and each chunk passed to Solr to add to the index.
*
*
* @param <T> A subclass of SleuthkitVisitableItem.
* @param extractor The TextExtractor that will be used to extract text from
* the given source.
* @param source The source from which text will be extracted, chunked,
* and indexed.
* @param context The ingest job context that can be used to cancel this
* process.
*
* @return True if this method executed normally, or false if there was an
* unexpected exception. //JMTODO: This policy needs to be reviewed.
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
final long sourceID = extractor.getID(source);
final String sourceName = extractor.getName(source);
int numChunks = 0; //unknown until chunking is done
if (extractor.isDisabled()) {
/* some extractors, notably the strings extractor, have options which
* can be configured such that no extraction should be done */
return true;
}
Map<String, String> fields = getContentFields(source);
//Get a stream and a reader for that stream
try (final InputStream stream = extractor.getInputStream(source);
Reader reader = extractor.getReader(stream, source);) {
//we read max 1024 chars at a time; this seems to be the max some Readers will return
char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
boolean eof = false; //have we read until the end of the file yet
while (!eof) {
int chunkSizeInChars = 0; // the size in chars of the chunk (so far)
if (context != null && context.fileIngestIsCancelled()) {
return true;
}
long charsRead = 0; // number of chars read in the most recent read operation
//consume bytes to fill entire chunk (but leave EXTRA_CHARS to end the word)
while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
&& (charsRead = reader.read(textChunkBuf, chunkSizeInChars, SINGLE_READ_CHARS)) != -1) {
chunkSizeInChars += charsRead;
}
if (charsRead == -1) {
//this is the last chunk
eof = true;
} else {
chunkSizeInChars += charsRead;
//if we haven't reached the end of the file,
//try to read char-by-char until whitespace to not break words
while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - 1)
&& (Character.isWhitespace(textChunkBuf[chunkSizeInChars - 1]) == false)
&& (charsRead = reader.read(textChunkBuf, chunkSizeInChars, 1)) != -1) {
chunkSizeInChars += charsRead;
}
if (charsRead == -1) {
//this is the last chunk
eof = true;
}
}
StringBuilder sb = new StringBuilder(chunkSizeInChars)
.append(textChunkBuf, 0, chunkSizeInChars);
sanitizeToUTF8(sb); //replace non UTF8 chars with '^'
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
fields.put(Server.Schema.ID.toString(), chunkId);
try {
//pass the chunk to method that adds it to Solr index
indexChunk(sb.toString(), sourceName, fields);
numChunks++;
} catch (Ingester.IngesterException ingEx) {
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
throw ingEx; //need to rethrow to signal error and move on
} catch (Exception ex) {
throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
}
}
} catch (IOException ex) {
extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false;
} catch (Exception ex) {
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
return false;
} finally {
//after all chunks, index just the meta data, including the numChunks, of the parent file
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
indexChunk(null, sourceName, fields);
}
return true;
}
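    // Hedged sketch (not part of this commit): the chunk-boundary policy used above, isolated
    // for a plain java.io.Reader. The method name and parameters are illustrative stand-ins;
    // in this class the limits are MAX_EXTR_TEXT_CHARS, SINGLE_READ_CHARS and EXTRA_CHARS.
    static int readOneChunk(java.io.Reader reader, char[] buf, int maxChars, int readStep,
            int extraChars) throws java.io.IOException {
        int size = 0;
        int read = 0;
        //fill most of the buffer in readStep-sized reads, leaving room to finish a word
        while (size < maxChars - readStep - extraChars
                && (read = reader.read(buf, size, readStep)) != -1) {
            size += read;
        }
        //continue one char at a time until whitespace so a word is not split across chunks
        while (size > 0 && size < maxChars - 1
                && !Character.isWhitespace(buf[size - 1])
                && (read = reader.read(buf, size, 1)) != -1) {
            size += read;
        }
        return size; //number of chars placed in buf; 0 means nothing was read
    }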
/**
* Sanitize the given StringBuilder by replacing non-UTF-8 characters with
* caret '^'
*
* @param sb the StringBuilder to sanitize
*
* //JMTODO: use CharSequence.chars() or codePoints() and then a mapping
* function?
*/
private static void sanitizeToUTF8(StringBuilder sb) {
final int length = sb.length();
// Sanitize by replacing non-UTF-8 characters with caret '^'
for (int i = 0; i < length; i++) {
if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
sb.replace(i, i + 1, "^");
}
}
}
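    // Hedged sketch of the JMTODO above (not part of this commit): the same sanitization
    // expressed with CharSequence.chars() and a mapping function, reusing the
    // TextUtil.isValidSolrUTF8(char) check already used in this class.
    private static String sanitizeToUTF8(CharSequence text) {
        return text.chars()
                .map(c -> TextUtil.isValidSolrUTF8((char) c) ? c : '^')
                .collect(StringBuilder::new,
                        (sb, c) -> sb.append((char) c),
                        StringBuilder::append)
                .toString();
    }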
/**
* Add one chunk to the Solr index as a separate Solr document.
*
* TODO: see if we can use a byte or string streaming way to add content to the
* /update handler, e.g. with XMLUpdateRequestHandler (deprecated in Solr
* 4.0.0); see if it is possible to stream with UpdateRequestHandler
*
* @param cs
* @param chunk The chunk content as a string
* @param fields
* @param size
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
void ingest(ContentStream cs, Map<String, String> fields, final long size) throws IngesterException {
private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
//JMTODO: actually, if we couldn't get the image id it is set to -1,
// but does this really mean we don't want to index it?
//skip the file, image id unknown
String msg = NbBundle.getMessage(this.getClass(),
"Ingester.ingest.exception.unknownImgId.msg", cs.getName());
//JMTODO: does this need to be internationalized?
String msg = NbBundle.getMessage(Ingester.class,
"Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
logger.log(Level.SEVERE, msg);
throw new IngesterException(msg);
}
final byte[] docChunkContentBuf = new byte[MAX_DOC_CHUNK_SIZE];
//Make a SolrInputDocument out of the field map
SolrInputDocument updateDoc = new SolrInputDocument();
for (String key : fields.keySet()) {
updateDoc.addField(key, fields.get(key));
}
//using size here, but we are no longer ingesting entire files
//size is normally a chunk size, up to 1MB
if (size > 0) {
// TODO (RC): Use try with resources, adjust exception messages
InputStream is = null;
int read = 0;
try {
is = cs.getStream();
read = is.read(docChunkContentBuf);
} catch (IOException ex) {
throw new IngesterException(
NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.cantReadStream.msg",
cs.getName()));
} finally {
if (null != is) {
try {
is.close();
} catch (IOException ex) {
logger.log(Level.WARNING, "Could not close input stream after reading content, " + cs.getName(), ex); //NON-NLS
}
}
}
if (read != 0) {
String s = "";
try {
s = new String(docChunkContentBuf, 0, read, ENCODING);
// Sanitize by replacing non-UTF-8 characters with caret '^' before adding to index
char[] chars = null;
for (int i = 0; i < s.length(); i++) {
if (!TextUtil.isValidSolrUTF8(s.charAt(i))) {
// only convert string to char[] if there is a non-UTF8 character
if (chars == null) {
chars = s.toCharArray();
}
chars[i] = '^';
}
}
// check if the string was modified (i.e. there was a non-UTF8 character found)
if (chars != null) {
s = new String(chars);
}
} catch (UnsupportedEncodingException ex) {
logger.log(Level.SEVERE, "Unsupported encoding", ex); //NON-NLS
}
updateDoc.addField(Server.Schema.CONTENT.toString(), s);
} else {
updateDoc.addField(Server.Schema.CONTENT.toString(), "");
}
} else {
//no content, such as the case when the 0th chunk is indexed
updateDoc.addField(Server.Schema.CONTENT.toString(), "");
}
//add the content to the SolrInputDocument
//JMTODO: can we just add it to the field map before passing that in?
updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
try {
//TODO consider timeout thread, or vary socket timeout based on size of indexed content
//TODO: consider timeout thread, or vary socket timeout based on size of indexed content
solrServer.addDocument(updateDoc);
uncommitedIngests = true;
} catch (KeywordSearchModuleException ex) {
//JMTODO: does this need to be internationalized?
throw new IngesterException(
NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.err.msg", cs.getName()), ex);
NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
}
}
/**
* Return the timeout that should be used to index the content.
*
* @param size size of the content
*
* @return the timeout to use, in seconds
*/
static int getTimeout(long size) {
if (size < 1024 * 1024L) //1MB
{
return 60;
} else if (size < 10 * 1024 * 1024L) //10MB
{
return 1200;
} else if (size < 100 * 1024 * 1024L) //100MB
{
return 3600;
} else {
return 3 * 3600;
}
}
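    // Hypothetical usage sketch (not part of this commit) showing how the size tiers above
    // resolve; the method name is an illustrative stand-in.
    static void demonstrateTimeouts() {
        final long oneMB = 1024 * 1024L;
        assert getTimeout(512 * 1024L) == 60;       //under 1MB
        assert getTimeout(5 * oneMB) == 1200;       //under 10MB
        assert getTimeout(50 * oneMB) == 3600;      //under 100MB
        assert getTimeout(500 * oneMB) == 3 * 3600; //anything larger: three hours
    }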
/**
@@ -380,92 +407,6 @@ class Ingester {
}
}
/**
* ContentStream to read() the data from an FsContent object
*/
private static class FscContentStream implements ContentStream {
private AbstractFile f;
FscContentStream(AbstractFile f) {
this.f = f;
}
@Override
public String getName() {
return f.getName();
}
@Override
public String getSourceInfo() {
return NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getSrcInfo", f.getId());
}
@Override
public String getContentType() {
return null;
}
@Override
public Long getSize() {
return f.getSize();
}
@Override
public InputStream getStream() throws IOException {
return new ReadContentInputStream(f);
}
@Override
public Reader getReader() throws IOException {
throw new UnsupportedOperationException(
NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getReader"));
}
}
/**
* ContentStream associated with FsContent, but forced to have no content
*/
private static class NullContentStream implements ContentStream {
AbstractContent aContent;
NullContentStream(AbstractContent aContent) {
this.aContent = aContent;
}
@Override
public String getName() {
return aContent.getName();
}
@Override
public String getSourceInfo() {
return NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getSrcInfo.text", aContent.getId());
}
@Override
public String getContentType() {
return null;
}
@Override
public Long getSize() {
return 0L;
}
@Override
public InputStream getStream() throws IOException {
return new ByteArrayInputStream(new byte[0]);
}
@Override
public Reader getReader() throws IOException {
throw new UnsupportedOperationException(
NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getReader"));
}
}
/**
* Indicates that there was an error with the specific ingest operation, but
* it's still okay to continue ingesting files.

View File

@@ -103,12 +103,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
private void reloadScriptsCheckBoxes() {
boolean utf16
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
enableUTF16Checkbox.setSelected(utf16);
boolean utf8
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
enableUTF8Checkbox.setSelected(utf8);
final List<SCRIPT> serviceScripts = KeywordSearchSettings.getStringExtractScripts();
@@ -127,12 +127,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
reloadScriptsCheckBoxes();
boolean utf16
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
enableUTF16Checkbox.setSelected(utf16);
boolean utf8
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
enableUTF8Checkbox.setSelected(utf8);
final boolean extractEnabled = utf16 || utf8;
@@ -257,9 +257,9 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
@Override
public void store() {
KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
Boolean.toString(enableUTF8Checkbox.isSelected()));
KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
Boolean.toString(enableUTF16Checkbox.isSelected()));
if (toUpdate != null) {

View File

@@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2015 Basis Technology Corp.
* Copyright 2011-2016 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -89,7 +89,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
//accessed read-only by searcher thread
private boolean startedSearching = false;
private List<TextExtractor> textExtractors;
private List<FileTextExtractor> textExtractors;
private StringsTextExtractor stringExtractor;
private final KeywordSearchJobSettings settings;
private boolean initialized = false;
@@ -415,24 +415,24 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
* @throws IngesterException exception thrown if indexing failed
*/
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
TextExtractor fileExtract = null;
FileTextExtractor extractor = null;
//go over available text extractors in order, and pick the first one (most specific one)
for (TextExtractor fe : textExtractors) {
for (FileTextExtractor fe : textExtractors) {
if (fe.isSupported(aFile, detectedFormat)) {
fileExtract = fe;
extractor = fe;
break;
}
}
if (fileExtract == null) {
if (extractor == null) {
logger.log(Level.INFO, "No text extractor found for file id:{0}, name: {1}, detected format: {2}", new Object[]{aFile.getId(), aFile.getName(), detectedFormat}); //NON-NLS
return false;
}
//logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
//divide into chunks and index
return fileExtract.index(aFile, context);
return Ingester.getDefault().indexText(extractor, aFile, context);
}
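    // Hedged alternative sketch (not part of this commit): the same "first supported extractor
    // wins" selection expressed with java.util.Optional and streams. It assumes only the
    // FileTextExtractor.isSupported(AbstractFile, String) method used above; the method name
    // findExtractor is an illustrative stand-in.
    private java.util.Optional<FileTextExtractor> findExtractor(AbstractFile aFile, String detectedFormat) {
        return textExtractors.stream()
                .filter(fe -> fe.isSupported(aFile, detectedFormat))
                .findFirst();
    }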
/**
@@ -448,7 +448,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if (context.fileIngestIsCancelled()) {
return true;
}
if (stringExtractor.index(aFile, KeywordSearchIngestModule.this.context)) {
if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
return true;
} else {
@@ -463,26 +463,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
}
}
/**
* Check with every extractor if it supports the file with the detected
* format
*
* @param aFile file to check for
* @param detectedFormat mime-type with detected format (such as
* text/plain) or null if not detected
*
* @return true if text extraction is supported
*/
private boolean isTextExtractSupported(AbstractFile aFile, String detectedFormat) {
for (TextExtractor extractor : textExtractors) {
if (extractor.isContentTypeSpecific() == true
&& extractor.isSupported(aFile, detectedFormat)) {
return true;
}
}
return false;
}
/**
* Adds the file to the index. Detects file type, calls extractors, etc.
*
@@ -512,7 +492,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
if (context.fileIngestIsCancelled()) {
return;
}
ingester.ingest(aFile, false); //meta-data only
ingester.indexMetaDataOnly(aFile);
putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
} catch (IngesterException ex) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
@@ -534,12 +514,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
// we skip archive formats that are opened by the archive module.
// @@@ We could have a check here to see if the archive module was enabled though...
if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
try {
if (context.fileIngestIsCancelled()) {
return;
}
ingester.ingest(aFile, false); //meta-data only
ingester.indexMetaDataOnly(aFile);
putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
} catch (IngesterException ex) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);

View File

@@ -101,8 +101,8 @@ public final class KeywordSearchJobSettingsPanel extends IngestModuleIngestJobSe
}
private void displayEncodings() {
String utf8 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
String utf16 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
String utf8 = KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
String utf16 = KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
ArrayList<String> encodingsList = new ArrayList<>();
if (utf8 == null || Boolean.parseBoolean(utf8)) {
encodingsList.add("UTF8");

View File

@@ -23,7 +23,6 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.ModuleSettings;
@@ -211,14 +210,14 @@ class KeywordSearchSettings {
KeywordSearchSettings.setUpdateFrequency(UpdateFrequency.DEFAULT);
}
//setting default Extract UTF8
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
logger.log(Level.INFO, "No configuration for UTF8 found, generating default..."); //NON-NLS
KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
}
//setting default Extract UTF16
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS
KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
}
//setting default Latin-1 Script
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_SCRIPTS, SCRIPT.LATIN_1.name())) {

View File

@@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2015 Basis Technology Corp.
* Copyright 2011-2016 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,24 +19,16 @@
package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.util.HashMap;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.apache.solr.common.util.ContentStreamBase.StringStream;
import org.openide.util.lookup.ServiceProvider;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.openide.util.NbBundle;
import java.net.InetAddress;
import java.util.MissingResourceException;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.openide.util.NbBundle;
import org.openide.util.lookup.ServiceProvider;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.TskCoreException;
/**
* An implementation of the KeywordSearchService interface that uses Solr for
@@ -49,6 +41,8 @@ public class SolrSearchService implements KeywordSearchService {
private static final String SERVER_REFUSED_CONNECTION = "server refused connection"; //NON-NLS
private static final int IS_REACHABLE_TIMEOUT_MS = 1000;
ArtifactTextExtractor extractor = new ArtifactTextExtractor();
@Override
public void indexArtifact(BlackboardArtifact artifact) throws TskCoreException {
if (artifact == null) {
@@ -57,109 +51,14 @@ public class SolrSearchService implements KeywordSearchService {
// We only support artifact indexing for Autopsy versions that use
// the negative range for artifact ids.
long artifactId = artifact.getArtifactID();
if (artifactId > 0) {
if (artifact.getArtifactID() > 0) {
return;
}
Case currentCase;
try {
currentCase = Case.getCurrentCase();
} catch (IllegalStateException ignore) {
// thrown by Case.getCurrentCase() if currentCase is null
return;
}
SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
if (sleuthkitCase == null) {
return;
}
Content dataSource;
AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
if (abstractFile != null) {
dataSource = abstractFile.getDataSource();
} else {
dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
}
if (dataSource == null) {
return;
}
// Concatenate the string values of all attributes into a single
// "content" string to be indexed.
StringBuilder artifactContents = new StringBuilder();
for (BlackboardAttribute attribute : artifact.getAttributes()) {
artifactContents.append(attribute.getAttributeType().getDisplayName());
artifactContents.append(" : ");
// This is ugly since it will need to be updated any time a new
// TSK_DATETIME_* attribute is added. A slightly less ugly
// alternative would be to assume that all date time attributes
// will have a name of the form "TSK_DATETIME*" and check
// attribute.getAttributeTypeName().startsWith("TSK_DATETIME").
// The major problem with that approach is that it would require
// a round trip to the database to get the type name string.
// We have also discussed modifying BlackboardAttribute.getDisplayString()
// to magically format datetime attributes but that is complicated by
// the fact that BlackboardAttribute exists in Sleuthkit data model
// while the utility to determine the timezone to use is in ContentUtils
// in the Autopsy datamodel.
if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME.getTypeID()
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED.getTypeID()
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED.getTypeID()
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_RCVD.getTypeID()
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_SENT.getTypeID()
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_START.getTypeID()
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_END.getTypeID()) {
artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
} else {
artifactContents.append(attribute.getDisplayString());
}
artifactContents.append(System.lineSeparator());
}
if (artifactContents.length() == 0) {
return;
}
// To play by the rules of the existing text markup implementations,
// we need to (a) index the artifact contents in a "chunk" and
// (b) create a separate index entry for the base artifact.
// We distinguish artifact content from file content by applying a
// mask to the artifact id to make its value > 0x8000000000000000 (i.e. negative).
// First, create an index entry for the base artifact.
HashMap<String, String> solrFields = new HashMap<>();
String documentId = Long.toString(artifactId);
solrFields.put(Server.Schema.ID.toString(), documentId);
// Set the IMAGE_ID field.
solrFields.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
final Ingester ingester = Ingester.getDefault();
try {
Ingester.getDefault().ingest(new StringStream(""), solrFields, 0);
} catch (Ingester.IngesterException ex) {
throw new TskCoreException(ex.getCause().getMessage(), ex);
}
// Next create the index entry for the document content.
// The content gets added to a single chunk. We may need to add chunking
// support later.
long chunkId = 1;
documentId += "_" + Long.toString(chunkId);
solrFields.replace(Server.Schema.ID.toString(), documentId);
StringStream contentStream = new StringStream(artifactContents.toString());
try {
Ingester.getDefault().ingest(contentStream, solrFields, contentStream.getSize());
ingester.indexMetaDataOnly(artifact);
ingester.indexText(extractor, artifact, null);
} catch (Ingester.IngesterException ex) {
throw new TskCoreException(ex.getCause().getMessage(), ex);
}

View File

@@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2014 Basis Technology Corp.
* Copyright 2011-2016 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,155 +20,106 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskException;
/**
* Takes an AbstractFile, extracts strings, converts them into chunks (associated with
* the original source file) of up to 1MB, and then indexes the chunks as text with Solr
* Extracts raw strings from AbstractFile content.
*/
class StringsTextExtractor implements TextExtractor {
class StringsTextExtractor extends FileTextExtractor {
/**
* Options for this extractor
*/
enum ExtractOptions {
EXTRACT_UTF16, ///< extract UTF16 text, true/false
EXTRACT_UTF8, ///< extract UTF8 text, true/false
};
private static Ingester ingester;
private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
private static final long MAX_STRING_CHUNK_SIZE = 1 * 31 * 1024L;
//private static final int BOM_LEN = 3;
private static final int BOM_LEN = 0; //disabled prepending of BOM
private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
private AbstractFile sourceFile;
private int numChunks = 0;
private final List<SCRIPT> extractScripts = new ArrayList<>();
private Map<String, String> extractOptions = new HashMap<>();
//disabled prepending of BOM
//static {
//prepend UTF-8 BOM to start of the buffer
//stringChunkBuf[0] = (byte) 0xEF;
//stringChunkBuf[1] = (byte) 0xBB;
//stringChunkBuf[2] = (byte) 0xBF;
//}
public StringsTextExtractor() {
ingester = Ingester.getDefault();
extractScripts.add(DEFAULT_SCRIPT);
//LATIN_2 is the default script
extractScripts.add(SCRIPT.LATIN_2);
}
@Override
public boolean setScripts(List<SCRIPT> extractScripts) {
/**
* Sets the scripts to use for the extraction
*
* @param extractScripts scripts to use
*/
public void setScripts(List<SCRIPT> extractScripts) {
this.extractScripts.clear();
this.extractScripts.addAll(extractScripts);
return true;
}
@Override
/**
* Get the currently used scripts for extraction
*
* @return scripts currently used or null if not supported
*/
public List<SCRIPT> getScripts() {
return new ArrayList<>(extractScripts);
}
@Override
public int getNumChunks() {
return this.numChunks;
}
@Override
public AbstractFile getSourceFile() {
return sourceFile;
}
@Override
/**
* Get current options
*
* @return currently used, extractor-specific options, or null if not
* supported
*/
public Map<String, String> getOptions() {
return extractOptions;
}
@Override
/**
* Set extractor specific options
*
* @param options options to use
*/
public void setOptions(Map<String, String> options) {
this.extractOptions = options;
}
@Override
public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
this.sourceFile = sourceFile;
this.numChunks = 0; //unknown until indexing is done
boolean success = false;
boolean isDisabled() {
boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
final boolean extractUTF8
= Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
final boolean extractUTF16
= Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
if (extractUTF8 == false && extractUTF16 == false) {
//nothing to do
return true;
return extractUTF8 == false && extractUTF16 == false;
}
InputStream stringStream;
@Override
InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
}
@Override
InputStream getInputStream(AbstractFile sourceFile) {
//check which extract stream to use
if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
//optimal for english, english only
stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
return new EnglishOnlyStream(sourceFile);//optimal for english, english only
} else {
stringStream = new AbstractFileStringIntStream(
sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
return new InternationalStream(sourceFile, extractScripts, extractUTF8, extractUTF16);
}
try {
success = true;
//break input stream into chunks
final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
long readSize;
while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
if (context.fileIngestIsCancelled()) {
ingester.ingest(this);
return true;
}
//FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
//debug.write(stringChunkBuf, 0, (int)readSize);
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
try {
chunk.index(ingester, stringChunkBuf, readSize + BOM_LEN, INDEX_CHARSET);
++this.numChunks;
} catch (IngesterException ingEx) {
success = false;
logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
throw ingEx; //need to rethrow/return to signal error and move on
}
//debug.close();
}
//after all chunks, ingest the parent file without content itself, and store numChunks
ingester.ingest(this);
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex); //NON-NLS
success = false;
} finally {
try {
stringStream.close();
} catch (IOException ex) {
logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex); //NON-NLS
}
}
return success;
}
@Override
public boolean isContentTypeSpecific() {
return true;
return false;
}
@Override
@@ -176,4 +127,379 @@ class StringsTextExtractor implements TextExtractor {
// strings can be run on anything.
return true;
}
/**
* AbstractFile input string stream reader/converter - given an AbstractFile,
* extracts strings from it and returns encoded bytes via read().
*
* Note: the utility supports extraction of only LATIN script and UTF8,
* UTF16LE, UTF16BE encodings and uses a brute force encoding detection -
* it's fast but could apply multiple encodings on the same string.
*
* For other script/language support and better encoding detection use the
* InternationalStream class below, which wraps around the StringExtract
* extractor.
*/
private static class EnglishOnlyStream extends InputStream {
private static final Logger logger = Logger.getLogger(EnglishOnlyStream.class.getName());
private static final String NLS = Character.toString((char) 10); //new line
private static final int READ_BUF_SIZE = 256;
private static final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
//args
private final AbstractFile content;
//internal working data
private long contentOffset = 0; //offset in fscontent read into curReadBuf
private final byte[] curReadBuf = new byte[READ_BUF_SIZE];
private int bytesInReadBuf = 0;
private int readBufOffset = 0; //offset in read buf processed
private StringBuilder curString = new StringBuilder();
private int curStringLen = 0;
private StringBuilder tempString = new StringBuilder();
private int tempStringLen = 0;
private boolean isEOF = false;
private boolean stringAtTempBoundary = false; //if temp has part of string that didn't make it in previous read()
private boolean stringAtBufBoundary = false; //if read buffer has string being processed, continue as string from prev read() in next read()
private boolean inString = false; //if current temp has min chars required
private final byte[] oneCharBuf = new byte[1];
/**
* Construct a new string stream from the given AbstractFile. Does not
* attempt to fill the entire read buffer if that would break a string.
*
* @param content the file to extract strings from
*
*/
private EnglishOnlyStream(AbstractFile content) {
this.content = content;
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
if (b == null) {
throw new NullPointerException();
} else if (off < 0 || len < 0 || len > b.length - off) {
throw new IndexOutOfBoundsException();
} else if (len == 0) {
return 0;
}
long fileSize = content.getSize();
if (fileSize == 0) {
return -1;
}
if (isEOF) {
return -1;
}
if (stringAtTempBoundary) {
//append entire temp string residual from previous read()
//because qualified string was broken down into 2 parts
appendResetTemp();
stringAtTempBoundary = false;
//there could be more to this string in fscontent/buffer
}
boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
int newCurLen = curStringLen + tempStringLen;
while (newCurLen < len) {
//need to extract more strings
if (readBufOffset > bytesInReadBuf - 1) {
//no more bytes to process into strings, read them
try {
bytesInReadBuf = 0;
bytesInReadBuf = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);
} catch (TskException ex) {
if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
appendResetTemp();
//have some extracted string, return that, and fail next time
isEOF = true;
int copied = copyToReturn(b, off, len);
return copied;
} else {
return -1; //EOF
}
}
if (bytesInReadBuf < 1) {
if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
appendResetTemp();
//have some extracted string, return that, and fail next time
isEOF = true;
int copied = copyToReturn(b, off, len);
return copied;
} else {
return -1; //EOF
}
}
//increment content offset for next read
contentOffset += bytesInReadBuf;
//reset read buf position
readBufOffset = 0;
}
//get char from cur read buf
char c = (char) curReadBuf[readBufOffset++];
if (c == 0 && singleConsecZero == false) {
//preserve the current sequence if max consec. 1 zero char
singleConsecZero = true;
} else {
singleConsecZero = false;
}
if (StringExtract.isPrintableAscii(c)) {
tempString.append(c);
++tempStringLen;
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
inString = true;
}
//boundary case when temp has still chars - handled after the loop
} else if (!singleConsecZero) {
//break the string, clear temp
if (tempStringLen >= MIN_PRINTABLE_CHARS || stringAtBufBoundary) {
//append entire temp string with new line
tempString.append(NLS);
++tempStringLen;
curString.append(tempString);
curStringLen += tempStringLen;
stringAtBufBoundary = false;
}
//reset temp
tempString = new StringBuilder();
tempStringLen = 0;
}
newCurLen = curStringLen + tempStringLen;
}
//check if still in string state, so that next chars in read buf bypass min chars check
//and qualify as string even if less < min chars required
if (inString) {
inString = false; //reset
stringAtBufBoundary = true; //will bypass the check
}
//check if temp still has chars to qualify as a string
//we might need to break up temp into 2 parts for next read() call
//consume as many as possible to fill entire user buffer
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
if (newCurLen > len) {
int appendChars = len - curStringLen;
//save part for next user read(), need to break up temp string
//do not append new line
String toAppend = tempString.substring(0, appendChars);
String newTemp = tempString.substring(appendChars);
curString.append(toAppend);
curStringLen += appendChars;
tempString = new StringBuilder(newTemp);
tempStringLen = newTemp.length();
stringAtTempBoundary = true;
} else {
//append entire temp
curString.append(tempString);
curStringLen += tempStringLen;
//reset temp
tempString = new StringBuilder();
tempStringLen = 0;
}
} else {
//if temp has a few chars, not qualified as string for now,
//will be processed during next read() call
}
//copy current strings to user
final int copied = copyToReturn(b, off, len);
//there may be still chars in read buffer or tempString, for next read()
return copied;
}
//append temp buffer to cur string buffer and reset temp, if enough chars
//does not append new line
private void appendResetTemp() {
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
curString.append(tempString);
curStringLen += tempStringLen;
tempString = new StringBuilder();
tempStringLen = 0;
}
}
//copy currently extracted string to user buffer
//and reset for next read() call
private int copyToReturn(byte[] b, int off, long len) {
final String curStringS = curString.toString();
//logger.log(Level.INFO, curStringS);
byte[] stringBytes = curStringS.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
//logger.log(Level.INFO, curStringS);
//copied all string, reset
curString = new StringBuilder();
int ret = curStringLen;
curStringLen = 0;
return ret;
}
@Override
public int read() throws IOException {
final int read = read(oneCharBuf, 0, 1);
if (read == 1) {
return oneCharBuf[0];
} else {
return -1;
}
}
@Override
public int available() throws IOException {
//we don't know how many bytes in curReadBuf may end up as strings
return 0;
}
@Override
public long skip(long n) throws IOException {
//use default implementation that reads into skip buffer
//but it could be more efficient
return super.skip(n);
}
}
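    /*
     * Hypothetical usage sketch (not part of this commit): reading the byte output of
     * EnglishOnlyStream back as text with the same charset that getReader() applies above.
     * The method name is an illustrative stand-in.
     */
    private static String readAllExtractedStrings(AbstractFile file) throws IOException {
        StringBuilder text = new StringBuilder();
        try (InputStream strings = new EnglishOnlyStream(file);
                InputStreamReader reader = new InputStreamReader(strings, Server.DEFAULT_INDEXED_TEXT_CHARSET)) {
            char[] buf = new char[1024];
            int n;
            while ((n = reader.read(buf)) != -1) {
                text.append(buf, 0, n); //newline-separated runs of printable ASCII
            }
        }
        return text.toString();
    }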
/**
* Wrapper over StringExtract that provides a streaming API. Given an
* AbstractFile object, extracts international strings from the file and
* reads the output as a stream of UTF-8 encoded bytes.
*
*/
private static class InternationalStream extends InputStream {
private static final Logger logger = Logger.getLogger(InternationalStream.class.getName());
private static final int FILE_BUF_SIZE = 1024 * 1024;
private final AbstractFile content;
private final byte[] oneCharBuf = new byte[1];
private final StringExtract stringExtractor;
/** true if there is nothing to do because neither extractUTF8 nor
* extractUTF16 was true in constructor */
private final boolean nothingToDo;
private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
private long fileReadOffset = 0L;
private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user
private int convertBuffOffset = 0; //offset to start returning data to user on next read()
private int bytesInConvertBuff = 0; //amount of data currently in the buffer
private boolean fileEOF = false; //if file has more bytes to read
private StringExtract.StringExtractResult lastExtractResult;
/**
* Constructs a new stream object that converts file content to extracted
* strings and then to a byte stream, for the specified scripts and
* auto-detected encodings (UTF8, UTF16LE, UTF16BE).
*
* @param content input content to process and turn into a stream
* to convert into strings
* @param scripts a list of scripts to consider
* @param extractUTF8 whether to extract utf8 encoding
* @param extractUTF16 whether to extract utf16 encoding
*/
private InternationalStream(AbstractFile content, List<SCRIPT> scripts, boolean extractUTF8, boolean extractUTF16) {
this.content = content;
this.stringExtractor = new StringExtract();
this.stringExtractor.setEnabledScripts(scripts);
this.nothingToDo = extractUTF8 == false && extractUTF16 == false;
this.stringExtractor.setEnableUTF8(extractUTF8);
this.stringExtractor.setEnableUTF16(extractUTF16);
}
@Override
public int read() throws IOException {
if (nothingToDo) {
return -1;
}
final int read = read(oneCharBuf, 0, 1);
if (read == 1) {
return oneCharBuf[0];
} else {
return -1;
}
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
if (b == null) {
throw new NullPointerException();
} else if (off < 0 || len < 0 || len > b.length - off) {
throw new IndexOutOfBoundsException();
} else if (len == 0) {
return 0;
}
if (nothingToDo) {
return -1;
}
long fileSize = content.getSize();
if (fileSize == 0) {
return -1;
}
//read and convert until user buffer full
//we have data if file can be read or when byteBuff has converted strings to return
int bytesToUser = 0; //returned to user so far
int offsetUser = off;
while (bytesToUser < len && offsetUser < len) {
//check if we have enough converted strings
int convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
try {
//convert more strings, store in buffer
long toRead = 0;
//fill up entire fileReadBuff fresh
toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
//}
int read = content.read(fileReadBuff, fileReadOffset, toRead);
if (read == -1 || read == 0) {
fileEOF = true;
} else {
fileReadOffset += read;
if (fileReadOffset >= fileSize) {
fileEOF = true;
}
//put converted string in convertBuff
convert(read);
convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
}
} catch (TskCoreException ex) {
//Exceptions.printStackTrace(ex);
fileEOF = true;
}
}
//nothing more to read, and no more bytes in convertBuff
if (convertBuff == null || convertBuffRemain == 0) {
if (fileEOF) {
return bytesToUser > 0 ? bytesToUser : -1;
} else {
//no strings extracted, try another read
continue;
}
}
//return part or all of convert buff to user
final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);
convertBuffOffset += toCopy;
offsetUser += toCopy;
bytesToUser += toCopy;
}
//if more string data in convertBuff, will be consumed on next read()
return bytesToUser;
}
/**
* convert bytes in file buffer to string, and encode string in
* convertBuffer
*
* @param numBytes num bytes in the fileReadBuff
*/
private void convert(int numBytes) {
lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
convertBuff = lastExtractResult.getText().getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
//reset tracking vars
if (lastExtractResult.getNumBytes() == 0) {
bytesInConvertBuff = 0;
} else {
bytesInConvertBuff = convertBuff.length;
}
convertBuffOffset = 0;
}
}
}
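/*
 * Hedged end-to-end sketch (not part of this commit): wiring the refactored pieces together
 * for strings extraction. Only calls shown elsewhere in this diff are used; the class and
 * method names here are illustrative stand-ins, not part of the module.
 */
class StringsIndexingSketch {

    static boolean indexStrings(org.sleuthkit.datamodel.AbstractFile file,
            org.sleuthkit.autopsy.ingest.IngestJobContext context) throws Ingester.IngesterException {
        StringsTextExtractor stringExtractor = new StringsTextExtractor();
        stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
        java.util.Map<String, String> options = new java.util.HashMap<>();
        options.put(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
        stringExtractor.setOptions(options);
        //chunking, sanitizing, and Solr indexing all happen inside Ingester.indexText()
        return Ingester.getDefault().indexText(stringExtractor, file, context);
    }
}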

View File

@@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2012 Basis Technology Corp.
* Copyright 2011-2016 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,140 +18,76 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.datamodel.AbstractFile;
import java.io.InputStream;
import java.io.Reader;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
/**
* Common methods for utilities that extract text and content and divide into
* chunks
* Extracts text out of a SleuthkitVisitableItem, and exposes it as a Reader.
* This Reader is given to the Ingester to chunk and index in Solr.
*
* @param <TextSource> The subtype of SleuthkitVisitableItem an implementation
* is able to process.
*/
interface TextExtractor {
abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {
static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());
/**
* Common options that can be used by some extractors
* Is this extractor configured such that no extraction will/should be done?
*
* @return True if this extractor will/should not perform any extraction.
*/
enum ExtractOptions {
EXTRACT_UTF16, ///< extract UTF16 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
EXTRACT_UTF8, ///< extract UTF8 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
};
//generally text extractors should ignore archives
//and let unpacking modules take care of them
static final List<String> ARCHIVE_MIME_TYPES
= Arrays.asList(
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
"application/x-7z-compressed", //NON-NLS
"application/x-ace-compressed", //NON-NLS
"application/x-alz-compressed", //NON-NLS
"application/x-arj", //NON-NLS
"application/vnd.ms-cab-compressed", //NON-NLS
"application/x-cfs-compressed", //NON-NLS
"application/x-dgc-compressed", //NON-NLS
"application/x-apple-diskimage", //NON-NLS
"application/x-gca-compressed", //NON-NLS
"application/x-dar", //NON-NLS
"application/x-lzx", //NON-NLS
"application/x-lzh", //NON-NLS
"application/x-rar-compressed", //NON-NLS
"application/x-stuffit", //NON-NLS
"application/x-stuffitx", //NON-NLS
"application/x-gtar", //NON-NLS
"application/x-archive", //NON-NLS
"application/x-executable", //NON-NLS
"application/x-gzip", //NON-NLS
"application/zip", //NON-NLS
"application/x-zoo", //NON-NLS
"application/x-cpio", //NON-NLS
"application/x-shar", //NON-NLS
"application/x-tar", //NON-NLS
"application/x-bzip", //NON-NLS
"application/x-bzip2", //NON-NLS
"application/x-lzip", //NON-NLS
"application/x-lzma", //NON-NLS
"application/x-lzop", //NON-NLS
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
abstract boolean isDisabled();
/**
* Get the number of chunks resulting from extracting this AbstractFile
* Log the given message and exception as a warning.
*
* @return the number of chunks produced
* @param msg
* @param ex
*/
int getNumChunks();
void logWarning(String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS
}
/**
* Get the source file associated with this extraction
* Get an input stream over the content of the given source.
*
* @return the source AbstractFile
*/
AbstractFile getSourceFile();
/**
* Index the Abstract File
*
* @param sourceFile file to index
*
* @return true if indexed successfully, false otherwise
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException;
/**
* Sets the scripts to use for the extraction
*
* @param extractScripts scripts to use
*
* @return true if the extractor supports script-specific extraction, false
* otherwise
*/
boolean setScripts(List<SCRIPT> extractScript);
/**
* Get the currently used scripts for extraction
*
* @return scripts currently used or null if not supported
*/
List<SCRIPT> getScripts();
/**
* Get current options
*
* @return currently used, extractor-specific options, or null if not
* supported
*/
Map<String, String> getOptions();
/**
* Set extractor specific options
*
* @param options options to use
*/
void setOptions(Map<String, String> options);
/**
* Determines whether the extractor works only for the types specified by
* isSupported(), or whether it is a generic content extractor (such as the
* string extractor)
* @param source
*
* @return
*/
boolean isContentTypeSpecific();
abstract InputStream getInputStream(TextSource source);
/**
* Determines if the file content is supported by the extractor if
* isContentTypeSpecific() returns true.
* Get a reader over the text extracted from the given source.
*
* @param file to test if its content should be supported
* @param detectedFormat mime-type with detected format (such as text/plain)
* or null if not detected
* @param stream
* @param source
*
* @return true if the file content is supported, false otherwise
* @return
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
boolean isSupported(AbstractFile file, String detectedFormat);
abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;
/**
* Get the 'object' id of the given source.
*
* @param source
*
* @return
*/
abstract long getID(TextSource source);
/**
* Get a human readable name for the given source.
*
* @param source
*
* @return
*/
abstract String getName(TextSource source);
}
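/*
 * Hedged sketch (not part of this commit): a minimal TextExtractor subclass showing the shape
 * of the new API. The class name and its charset assumption are illustrative; the real
 * implementations in this refactoring are FileTextExtractor (with StringsTextExtractor and
 * TikaTextExtractor) and ArtifactTextExtractor.
 */
class RawFileTextExtractorSketch extends TextExtractor<org.sleuthkit.datamodel.AbstractFile> {

    @Override
    boolean isDisabled() {
        return false; //always enabled in this sketch
    }

    @Override
    java.io.InputStream getInputStream(org.sleuthkit.datamodel.AbstractFile source) {
        return new org.sleuthkit.datamodel.ReadContentInputStream(source);
    }

    @Override
    java.io.Reader getReader(java.io.InputStream stream, org.sleuthkit.datamodel.AbstractFile source) {
        //assumes the file's raw bytes are already text in the indexed charset
        return new java.io.InputStreamReader(stream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
    }

    @Override
    long getID(org.sleuthkit.datamodel.AbstractFile source) {
        return source.getId();
    }

    @Override
    String getName(org.sleuthkit.datamodel.AbstractFile source) {
        return source.getName();
    }
}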

View File

@@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2012-2013 Basis Technology Corp.
* Copyright 2011-2016 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,244 +18,85 @@
*/
package org.sleuthkit.autopsy.keywordsearch;
import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.MissingResourceException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.sleuthkit.autopsy.coreutils.TextUtil;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
/**
* Extractor of text from TIKA supported AbstractFile content. Extracted text is
* divided into chunks and indexed with Solr. Protects against Tika parser hangs
* (for unexpected/corrupt content) using a timeout mechanism. If Tika
* extraction succeeds, chunks are indexed with Solr.
*
* This Tika extraction/chunking utility is useful for large files of Tika
* parsers-supported content type.
*
* Extracts text from Tika supported AbstractFile content. Protects against Tika
* parser hangs (for unexpected/corrupt content) using a timeout mechanism.
*/
class TikaTextExtractor implements TextExtractor {
class TikaTextExtractor extends FileTextExtractor {
private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
private static Ingester ingester;
private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
private static final int MAX_EXTR_TEXT_CHARS = 16 * 1024;
private static final int SINGLE_READ_CHARS = 1024;
private static final int EXTRA_CHARS = 128; //for whitespace
private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
private AbstractFile sourceFile; //currently processed file
private int numChunks = 0;
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();
TikaTextExtractor() {
ingester = Ingester.getDefault();
private static final List<String> TIKA_SUPPORTED_TYPES
= new Tika().getParser().getSupportedTypes(new ParseContext())
.parallelStream()
.map(mt -> mt.getType() + "/" + mt.getSubtype())
.collect(Collectors.toList());
Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
for (MediaType mt : mediaTypes) {
TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype());
}
//logger.log(Level.INFO, "Tika supported media types: {0}", TIKA_SUPPORTED_TYPES); //NON-NLS
@Override
void logWarning(final String msg, Exception ex) {
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
super.logWarning(msg, ex);
}
@Override
public boolean setScripts(List<StringExtract.StringExtractUnicodeTable.SCRIPT> extractScripts) {
return false;
}
@Override
public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
return null;
}
@Override
public Map<String, String> getOptions() {
return null;
}
@Override
public void setOptions(Map<String, String> options) {
}
@Override
public int getNumChunks() {
return numChunks;
}
@Override
public AbstractFile getSourceFile() {
return sourceFile;
}
@Override
public boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
this.sourceFile = sourceFile;
numChunks = 0; //unknown until indexing is done
boolean success = false;
Reader reader = null;
final InputStream stream = new ReadContentInputStream(sourceFile);
Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
Metadata metadata = new Metadata();
//Parse the file in a task, a convenient way to have a timeout...
final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
try {
Metadata meta = new Metadata();
//Parse the file in a task
Tika tika = new Tika(); //new Tika instance for every file, to work around Tika memory issues
ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
final Future<?> future = tikaParseExecutor.submit(parseTask);
try {
future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
CharSource metaDataCharSource = getMetaDataCharSource(metadata);
//concatenate parsed content and meta data into a single reader.
return CharSource.concat(new ReaderCharSource(tikaReader), metaDataCharSource).openStream();
} catch (TimeoutException te) {
final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
sourceFile.getId(), sourceFile.getName());
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
logger.log(Level.WARNING, msg);
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
logWarning(msg, te);
throw new IngesterException(msg);
} catch (Exception ex) {
final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
sourceFile.getId(), sourceFile.getName());
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
logger.log(Level.WARNING, msg);
throw new IngesterException(msg);
}
// get the reader with the results
reader = parseTask.getReader();
if (reader == null) {
//likely due to exception in parse()
logger.log(Level.WARNING, "No reader available from Tika parse"); //NON-NLS
return false;
}
// break the results into chunks and index
success = true;
long readSize;
long totalRead = 0;
boolean eof = false;
//we read at most 1024 chars at a time; this seems to be the max this Reader will return
while (!eof) {
if (context.fileIngestIsCancelled()) {
ingester.ingest(this);
return true;
}
readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
if (readSize == -1) {
eof = true;
} else {
totalRead += readSize;
}
//consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
&& (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
totalRead += readSize;
}
if (readSize == -1) {
//this is the last chunk
eof = true;
} else {
//try to read char-by-char until whitespace to not break words
while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
&& !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
&& (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
totalRead += readSize;
}
if (readSize == -1) {
//this is the last chunk
eof = true;
KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex.getCause()); //NON-NLS
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName());
logWarning(msg, ex);
throw new IngesterException(msg, ex);
}
}
// Sanitize by replacing non-UTF-8 characters with caret '^'
for (int i = 0; i < totalRead; ++i) {
if (!TextUtil.isValidSolrUTF8(textChunkBuf[i])) {
textChunkBuf[i] = '^';
}
}
StringBuilder sb = new StringBuilder((int) totalRead + 1000);
sb.append(textChunkBuf, 0, (int) totalRead);
//reset for next chunk
totalRead = 0;
//append meta data if last chunk
if (eof) {
//sort meta data keys
List<String> sortedKeyList = Arrays.asList(meta.names());
Collections.sort(sortedKeyList);
sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
for (String key : sortedKeyList) {
String value = meta.get(key);
sb.append(key).append(": ").append(value).append("\n");
}
}
// Encode from UTF-8 charset to bytes
byte[] encodedBytes = sb.toString().getBytes(OUTPUT_CHARSET);
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
try {
chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
++this.numChunks;
} catch (Ingester.IngesterException ingEx) {
success = false;
logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" //NON-NLS
+ sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
throw ingEx; //need to rethrow/return to signal error and move on
}
}
} catch (IOException ex) {
final String msg = "Exception: Unable to read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
logger.log(Level.WARNING, msg);
success = false;
} catch (Exception ex) {
final String msg = "Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
logger.log(Level.WARNING, msg);
success = false;
} finally {
try {
stream.close();
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to close Tika content stream from " + sourceFile.getId(), ex); //NON-NLS
}
try {
if (reader != null) {
reader.close();
}
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
}
}
        //after all chunks, ingest the parent file without the content itself, and store numChunks
ingester.ingest(this);
        return success;
    }
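For readers skimming the diff, the loop above bulk-fills a chunk buffer and then reads one character at a time until whitespace so no word is split across chunks. A minimal, self-contained sketch of that idea, using illustrative constants and names rather than the module's own, might look like this:

import java.io.IOException;
import java.io.Reader;

final class ChunkBoundarySketch {

    private static final int MAX_CHUNK_CHARS = 32 * 1024; // hypothetical chunk limit
    private static final int SINGLE_READ_CHARS = 1024;    // hypothetical bulk read size
    private static final int EXTRA_CHARS = 128;           // room left to finish a word

    /**
     * Fills buf (which must hold at least MAX_CHUNK_CHARS chars) with one chunk
     * of text. Returns the number of chars read, or -1 at end of stream.
     */
    static int readChunk(Reader reader, char[] buf) throws IOException {
        int read = reader.read(buf, 0, SINGLE_READ_CHARS);
        if (read == -1) {
            return -1;
        }
        int total = read;
        // bulk reads until the chunk is nearly full
        while (total < MAX_CHUNK_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS
                && (read = reader.read(buf, total, SINGLE_READ_CHARS)) != -1) {
            total += read;
        }
        // finish the current word so it is not split across chunk boundaries
        while (read != -1 && total > 0 && total < MAX_CHUNK_CHARS - 1
                && !Character.isWhitespace(buf[total - 1])
                && (read = reader.read(buf, total, 1)) != -1) {
            total += read;
        }
        return total;
    }
}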
/**
     * Gets a CharSource that wraps a formatted representation of the given
     * Metadata.
     *
     * @param metadata The Metadata to wrap as a CharSource
     *
     * @return A CharSource for the given Metadata
*/
static private CharSource getMetaDataCharSource(Metadata metadata) {
return CharSource.wrap(
new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
.append(Stream.of(metadata.names()).sorted()
.map(key -> key + ": " + metadata.get(key))
.collect(Collectors.joining("\n"))
));
}
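The metadata block built here is designed to be appended to the extracted text. A small, hypothetical demo of that pattern, reproducing the same formatting locally (the method itself is package-private) and using only Guava's CharSource and Tika's Metadata, could look like:

import com.google.common.io.CharSource;
import java.io.IOException;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.tika.metadata.Metadata;

final class MetadataCharSourceDemo {

    /** Same formatting idea as the method above, reproduced here so the demo is self-contained. */
    static CharSource metaDataCharSource(Metadata metadata) {
        return CharSource.wrap(
                new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
                        .append(Stream.of(metadata.names()).sorted()
                                .map(key -> key + ": " + metadata.get(key))
                                .collect(Collectors.joining("\n"))));
    }

    public static void main(String[] args) throws IOException {
        Metadata metadata = new Metadata();
        metadata.set("Author", "example");
        metadata.set("Content-Type", "application/pdf");

        CharSource content = CharSource.wrap("extracted document text");

        // Append the metadata block after the extracted text, then read it all back.
        String combined = CharSource.concat(content, metaDataCharSource(metadata)).read();
        System.out.println(combined);
    }
}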
@Override
@ -265,67 +106,64 @@ class TikaTextExtractor implements TextExtractor {
@Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        if (detectedFormat == null
                || FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
                || FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
                || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
                || detectedFormat.equals("application/x-font-ttf")) { // Tika currently has a bug in the ttf parser in fontbox; it will throw an out of memory exception //NON-NLS
            return false;
        }
        //TODO might need to add more mime-types to ignore
        //then accept all formats supported by Tika
        return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
    }

    @Override
    InputStream getInputStream(AbstractFile sourceFile1) {
        return new ReadContentInputStream(sourceFile1);
    }

    @Override
    boolean isDisabled() {
        return false;
    }
/**
     * Returns the timeout that should be used to index the content.
     *
     * @param size The size of the content, in bytes.
     *
     * @return The timeout to use, in seconds.
*/
private static int getTimeout(long size) {
        if (size < 1024 * 1024L) { //1MB
            return 60;
        } else if (size < 10 * 1024 * 1024L) { //10MB
            return 1200;
        } else if (size < 100 * 1024 * 1024L) { //100MB
            return 3600;
} else {
return 3 * 3600;
}
}
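As a rough illustration of how such a size-tiered timeout might be applied, the following self-contained sketch submits a stand-in parse task and bounds it with Future.get; the task and the local copy of the tiering are hypothetical, not the module's actual wiring:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

final class TimeoutDemo {

    public static void main(String[] args) throws Exception {
        ExecutorService executor = Executors.newSingleThreadExecutor();
        long contentSize = 5 * 1024 * 1024L; // pretend the content is 5MB

        Future<String> parseTask = executor.submit(() -> {
            Thread.sleep(100); // stand-in for a long-running parse
            return "parsed text";
        });

        try {
            // Same tiering as getTimeout(...) above: 5MB falls in the 1200-second bucket.
            String result = parseTask.get(getTimeout(contentSize), TimeUnit.SECONDS);
            System.out.println(result);
        } catch (TimeoutException ex) {
            parseTask.cancel(true); // give up on content that takes too long
        } finally {
            executor.shutdownNow();
        }
    }

    // Copy of the tiering shown above, here only so the demo is self-contained.
    private static int getTimeout(long size) {
        if (size < 1024 * 1024L) {
            return 60;
        } else if (size < 10 * 1024 * 1024L) {
            return 1200;
        } else if (size < 100 * 1024 * 1024L) {
            return 3600;
        } else {
            return 3 * 3600;
        }
    }
}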
    /**
     * Runnable task that calls Tika to parse the content using the input
     * stream. Provides a reader for the results.
     */
    private static class ParseRequestTask implements Runnable {

        //in
        private Tika tika;
        private InputStream stream;
        private Metadata meta;
        private AbstractFile sourceFile;
        //out
        private Reader reader;

        ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
            this.tika = tika;
            this.stream = stream;
            this.meta = meta;
            this.sourceFile = sourceFile;
        }

        @Override
        public void run() {
            try {
                reader = tika.parse(stream, meta);
            } catch (IOException ex) {
                KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
                tika = null;
                reader = null;
            } catch (Exception ex) {
                KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
                tika = null;
                reader = null;
            }
        }

        public Reader getReader() {
            return reader;
        }
    }

    /**
     * An implementation of CharSource that just wraps an existing reader and
     * returns it in openStream().
     */
    private static class ReaderCharSource extends CharSource {

        private final Reader reader;

        public ReaderCharSource(Reader reader) {
            this.reader = reader;
        }

        @Override
        public Reader openStream() throws IOException {
            return reader;
        }
    }
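Because a wrapper like this always hands back the same Reader, any CharSource built from it can be consumed only once. A hypothetical usage sketch (with a local copy of the wrapper, since the class above is private) that concatenates the wrapped content with a metadata block:

import com.google.common.io.CharSource;
import com.google.common.io.CharStreams;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

final class ReaderCharSourceDemo {

    /** Local copy of the wrapper idea: hand back one already-open Reader. */
    private static final class OneShotReaderSource extends CharSource {

        private final Reader reader;

        OneShotReaderSource(Reader reader) {
            this.reader = reader;
        }

        @Override
        public Reader openStream() throws IOException {
            return reader;
        }
    }

    public static void main(String[] args) throws IOException {
        CharSource content = new OneShotReaderSource(new StringReader("text from a parser"));
        CharSource metadataBlock = CharSource.wrap("\n\nAuthor: example");

        // Because the wrapper always returns the same Reader, the combined
        // source below can be consumed exactly once.
        try (Reader combined = CharSource.concat(content, metadataBlock).openStream()) {
            System.out.println(CharStreams.toString(combined));
        }
    }
}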

View File

@ -35,7 +35,6 @@ import java.util.logging.Level;
import java.util.logging.Logger;
import javax.imageio.ImageIO;
import javax.swing.JDialog;
import javax.swing.JLabel;
import javax.swing.JTextField;
import junit.framework.Test;
import junit.framework.TestCase;
@ -50,10 +49,10 @@ import org.netbeans.jemmy.operators.JComboBoxOperator;
import org.netbeans.jemmy.operators.JDialogOperator;
import org.netbeans.jemmy.operators.JFileChooserOperator;
import org.netbeans.jemmy.operators.JLabelOperator;
import org.netbeans.jemmy.operators.JListOperator;
import org.netbeans.jemmy.operators.JTabbedPaneOperator;
import org.netbeans.jemmy.operators.JTableOperator;
import org.netbeans.jemmy.operators.JTextFieldOperator;
import org.netbeans.jemmy.operators.JListOperator;
import org.netbeans.junit.NbModuleSuite;
import org.sleuthkit.autopsy.ingest.IngestManager;
@ -186,6 +185,8 @@ public class RegressionTest extends TestCase {
String img_path = getEscapedPath(System.getProperty("img_path"));
String imageDir = img_path;
((JTextField) jtfo0.getSource()).setText(imageDir);
JComboBoxOperator comboBoxOperator = new JComboBoxOperator(wo, 1);
comboBoxOperator.setSelectedItem("(GMT-5:00) America/New_York");
wo.btNext().clickMouse();
}