mirror of https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-16 17:57:43 +00:00

Merge in develop branch with text extraction refactoring

This commit is contained in: be7bdced90
@@ -1,91 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.nio.charset.Charset;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;

/**
 * A representation of a chunk of text from a file that can be used, when
 * supplied with an Ingester, to index the chunk for search.
 */
final class AbstractFileChunk {

    private final int chunkNumber;
    private final TextExtractor textExtractor;

    /**
     * Constructs a representation of a chunk of text from a file that can be
     * used, when supplied with an Ingester, to index the chunk for search.
     *
     * @param textExtractor A TextExtractor for the file.
     * @param chunkNumber   A sequence number for the chunk.
     */
    AbstractFileChunk(TextExtractor textExtractor, int chunkNumber) {
        this.textExtractor = textExtractor;
        this.chunkNumber = chunkNumber;
    }

    /**
     * Gets the TextExtractor for the source file of the text chunk.
     *
     * @return A reference to the TextExtractor.
     */
    TextExtractor getTextExtractor() {
        return textExtractor;
    }

    /**
     * Gets the sequence number of the text chunk.
     *
     * @return The chunk number.
     */
    int getChunkNumber() {
        return chunkNumber;
    }

    /**
     * Gets the id of the text chunk.
     *
     * @return An id of the form [source file object id]_[chunk number]
     */
    String getChunkId() {
        return Server.getChunkIdString(this.textExtractor.getSourceFile().getId(), this.chunkNumber);
    }

    /**
     * Indexes the text chunk.
     *
     * @param ingester   An Ingester to do the indexing.
     * @param chunkBytes The raw bytes of the text chunk.
     * @param chunkSize  The size of the text chunk in bytes.
     * @param charSet    The char set to use during indexing.
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    void index(Ingester ingester, byte[] chunkBytes, long chunkSize, Charset charSet) throws IngesterException {
        ByteContentStream bcs = new ByteContentStream(chunkBytes, chunkSize, textExtractor.getSourceFile(), charSet);
        try {
            ingester.ingest(this, bcs, chunkBytes.length);
        } catch (Exception ex) {
            throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", getChunkId()), ex);
        }
    }
}
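Note: for orientation, a minimal sketch of how this chunk class was driven by the extractors in this package (hypothetical caller code; `extractor`, `ingester`, `extractedText`, and `chunkNumber` are assumed to already exist and are not part of this diff):

    // Hypothetical sketch: index one chunk of already-extracted text.
    byte[] chunkBytes = extractedText.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
    AbstractFileChunk chunk = new AbstractFileChunk(extractor, chunkNumber + 1);
    chunk.index(ingester, chunkBytes, chunkBytes.length, Server.DEFAULT_INDEXED_TEXT_CHARSET);
    // The chunk is indexed under chunk.getChunkId(), i.e.
    // [source file object id]_[chunk number].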
@@ -1,92 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;

import org.openide.util.NbBundle;
import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.datamodel.AbstractContent;
import org.sleuthkit.datamodel.AbstractFile;

/**
 * Wrapper over InputStream that implements ContentStream to feed to Solr.
 */
class AbstractFileStringContentStream implements ContentStream {

    //input
    private final AbstractFile content;
    private final Charset charset;
    //converted
    private final InputStream stream;

    public AbstractFileStringContentStream(AbstractFile content, Charset charset, InputStream inputStream) {
        this.content = content;
        this.charset = charset;
        this.stream = inputStream;
    }

    public AbstractContent getSourceContent() {
        return content;
    }

    @Override
    public String getContentType() {
        return "text/plain;charset=" + charset.name(); //NON-NLS
    }

    @Override
    public String getName() {
        return content.getName();
    }

    @Override
    public Reader getReader() throws IOException {
        return new InputStreamReader(stream);
    }

    @Override
    public Long getSize() {
        //return convertedLength;
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "AbstractFileStringContentStream.getSize.exception.msg"));
    }

    @Override
    public String getSourceInfo() {
        return NbBundle.getMessage(this.getClass(), "AbstractFileStringContentStream.getSrcInfo.text", content.getId());
    }

    @Override
    public InputStream getStream() throws IOException {
        return stream;
    }

    @Override
    protected void finalize() throws Throwable {
        super.finalize();

        stream.close();
    }
}
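Note: one plausible composition of this wrapper, sketched with assumed names (pairing it with the string-extraction streams below is an assumption for illustration, not something this diff shows directly):

    // Hypothetical sketch: expose extracted strings to Solr as a ContentStream.
    InputStream stringStream = new AbstractFileStringIntStream(
            file, enabledScripts, true, true, StandardCharsets.UTF_8); // file and script list assumed
    ContentStream contentStream = new AbstractFileStringContentStream(
            file, StandardCharsets.UTF_8, stringStream);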
@@ -1,213 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.List;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractResult;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskCoreException;

/**
 * Wrapper over StringExtract that provides a streaming API. Given an
 * AbstractFile object, extracts international strings from the file and reads
 * the output as a stream of UTF-8 strings encoded as bytes.
 */
class AbstractFileStringIntStream extends InputStream {

    private static final Logger logger = Logger.getLogger(AbstractFileStringIntStream.class.getName());
    private static final int FILE_BUF_SIZE = 1024 * 1024;
    private AbstractFile content;
    private final byte[] oneCharBuf = new byte[1];
    private final StringExtract stringExtractor;
    private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
    private long fileReadOffset = 0L;
    private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user
    private int convertBuffOffset = 0; //offset to start returning data to user on next read()
    private int bytesInConvertBuff = 0; //amount of data currently in the buffer
    private boolean fileEOF = false; //true if the file has no more bytes to read
    private boolean extractUTF8;
    private boolean extractUTF16;
    private Charset outCharset;

    private StringExtractResult lastExtractResult;

    /**
     * Constructs a new stream object that converts a file to extracted
     * strings, then to a byte stream, for the specified scripts, with
     * auto-detected encoding (UTF8, UTF16LE, UTF16BE), and the specified
     * output byte stream encoding.
     *
     * @param content      input content to process and turn into a stream to
     *                     convert into strings
     * @param scripts      a list of scripts to consider
     * @param extractUTF8  whether to extract utf8 encoding
     * @param extractUTF16 whether to extract utf16 encoding
     * @param outCharset   encoding to use in the output byte stream
     */
    public AbstractFileStringIntStream(AbstractFile content, List<SCRIPT> scripts, boolean extractUTF8,
            boolean extractUTF16, Charset outCharset) {
        this.content = content;
        this.stringExtractor = new StringExtract();
        this.stringExtractor.setEnabledScripts(scripts);
        this.extractUTF8 = extractUTF8;
        this.extractUTF16 = extractUTF16;
        this.outCharset = outCharset;
        this.stringExtractor.setEnableUTF8(extractUTF8);
        this.stringExtractor.setEnableUTF16(extractUTF16);
    }

    @Override
    public int read() throws IOException {
        if (extractUTF8 == false && extractUTF16 == false) {
            return -1;
        }
        final int read = read(oneCharBuf, 0, 1);
        if (read == 1) {
            return oneCharBuf[0];
        } else {
            return -1;
        }
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (b == null) {
            throw new NullPointerException();
        } else if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        } else if (len == 0) {
            return 0;
        }

        if (extractUTF8 == false && extractUTF16 == false) {
            return -1;
        }

        long fileSize = content.getSize();
        if (fileSize == 0) {
            return -1;
        }

        //read and convert until user buffer full
        //we have data if file can be read or when byteBuff has converted strings to return
        int bytesToUser = 0; //returned to user so far
        int offsetUser = off;
        while (bytesToUser < len && offsetUser < len) {
            //check if we have enough converted strings
            int convertBuffRemain = bytesInConvertBuff - convertBuffOffset;

            if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
                try {
                    //convert more strings, store in buffer
                    long toRead = 0;
                    //int shiftSize = 0;

                    //if (lastExtractResult != null && lastExtractResult.getTextLength() != 0
                    //        && (shiftSize = FILE_BUF_SIZE - lastExtractResult.getFirstUnprocessedOff()) > 0) {
                    //    //a string previously extracted
                    //    //shift the fileReadBuff past last bytes extracted
                    //    //read only what's needed to fill the buffer
                    //    //to avoid losing chars and breaking or corrupting potential strings - preserve byte stream continuity
                    //    byte[] temp = new byte[shiftSize];
                    //    System.arraycopy(fileReadBuff, lastExtractResult.getFirstUnprocessedOff(),
                    //            temp, 0, shiftSize);
                    //    System.arraycopy(temp, 0, fileReadBuff, 0, shiftSize);
                    //    toRead = Math.min(lastExtractResult.getFirstUnprocessedOff(), fileSize - fileReadOffset);
                    //    lastExtractResult = null;
                    //} else {
                    //fill up entire fileReadBuff fresh
                    toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
                    //}
                    int read = content.read(fileReadBuff, fileReadOffset, toRead);
                    if (read == -1 || read == 0) {
                        fileEOF = true;
                    } else {
                        fileReadOffset += read;
                        if (fileReadOffset >= fileSize) {
                            fileEOF = true;
                        }

                        //put converted string in convertBuff
                        convert(read);
                        convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
                    }
                } catch (TskCoreException ex) {
                    //Exceptions.printStackTrace(ex);
                    fileEOF = true;
                }
            }

            //nothing more to read, and no more bytes in convertBuff
            if (convertBuff == null || convertBuffRemain == 0) {
                if (fileEOF) {
                    return bytesToUser > 0 ? bytesToUser : -1;
                } else {
                    //no strings extracted, try another read
                    continue;
                }
            }

            //return part or all of convert buff to user
            final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
            System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);

            //DEBUG
            /*
             * if (toCopy > 0) { FileOutputStream debug = new
             * FileOutputStream("c:\\temp\\" + content.getName(), true);
             * debug.write(b, offsetUser, toCopy); debug.close(); }
             */
            convertBuffOffset += toCopy;
            offsetUser += toCopy;

            bytesToUser += toCopy;
        }

        //if more string data in convertBuff, will be consumed on next read()
        return bytesToUser;
    }

    /**
     * Converts bytes in the file buffer to a string, and encodes the string
     * into convertBuff.
     *
     * @param numBytes num bytes in the fileReadBuff
     */
    private void convert(int numBytes) {
        lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
        convertBuff = lastExtractResult.getText().getBytes(outCharset);

        //reset tracking vars
        if (lastExtractResult.getNumBytes() == 0) {
            bytesInConvertBuff = 0;
        } else {
            bytesInConvertBuff = convertBuff.length;
        }
        convertBuffOffset = 0;
    }
}
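Note: a minimal, hedged sketch of draining this stream (the file and script selection are assumed; the buffer size is arbitrary):

    // Hypothetical sketch: read extracted, UTF-8-encoded strings from a file.
    List<SCRIPT> scripts = Arrays.asList(SCRIPT.LATIN_1); // assumed script selection
    InputStream strings = new AbstractFileStringIntStream(file, scripts, true, true, StandardCharsets.UTF_8);
    byte[] buf = new byte[8192];
    int n;
    while ((n = strings.read(buf, 0, buf.length)) != -1) {
        // hand buf[0..n) to the indexing code, e.g. wrapped in a ByteContentStream
    }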
@@ -1,296 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskException;

/**
 * AbstractFile input string stream reader/converter - given an AbstractFile,
 * extracts strings from it and returns encoded bytes via read().
 *
 * Note: the utility supports extraction of only the LATIN script and UTF8,
 * UTF16LE, UTF16BE encodings and uses brute force encoding detection - it's
 * fast but could apply multiple encodings to the same string.
 *
 * For other script/language support and better encoding detection use the
 * AbstractFileStringIntStream streaming class, which wraps around the
 * StringExtract extractor.
 */
class AbstractFileStringStream extends InputStream {

    //args
    private AbstractFile content;
    private Charset outputCharset;
    //internal data
    private static final Logger logger = Logger.getLogger(AbstractFileStringStream.class.getName());
    private static final String NLS = Character.toString((char) 10); //new line
    private static final int READ_BUF_SIZE = 256;
    private long contentOffset = 0; //offset in fscontent read into curReadBuf
    private final byte[] curReadBuf = new byte[READ_BUF_SIZE];
    private int bytesInReadBuf = 0;
    private int readBufOffset = 0; //offset in read buf processed
    private StringBuilder curString = new StringBuilder();
    private int curStringLen = 0;
    private StringBuilder tempString = new StringBuilder();
    private int tempStringLen = 0;
    private boolean isEOF = false;
    private boolean stringAtTempBoundary = false; //if temp has part of string that didn't make it in previous read()
    private boolean stringAtBufBoundary = false; //if read buffer has string being processed, continue as string from prev read() in next read()
    private boolean inString = false; //if current temp has min chars required
    private final byte[] oneCharBuf = new byte[1];
    private final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string

    /**
     * Construct new string stream from FsContent
     *
     * @param content                to extract strings from
     * @param outputCharset          target encoding to index as
     * @param preserveOnBuffBoundary whether to preserve or split string on a
     *                               buffer boundary. If false, will pack into
     *                               read buffer up to max. possible,
     *                               potentially splitting a string. If true,
     *                               the string will be preserved for next read.
     */
    public AbstractFileStringStream(AbstractFile content, Charset outputCharset, boolean preserveOnBuffBoundary) {
        this.content = content;
        this.outputCharset = outputCharset;
        //this.preserveOnBuffBoundary = preserveOnBuffBoundary;
        //logger.log(Level.INFO, "FILE: " + content.getParentPath() + "/" + content.getName());
    }

    /**
     * Construct new string stream from FsContent. Do not attempt to fill the
     * entire read buffer if that would break a string.
     *
     * @param content    to extract strings from
     * @param outCharset target charset to encode into bytes and index as, e.g.
     *                   UTF-8
     */
    public AbstractFileStringStream(AbstractFile content, Charset outCharset) {
        this(content, outCharset, false);
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (b == null) {
            throw new NullPointerException();
        } else if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        } else if (len == 0) {
            return 0;
        }

        long fileSize = content.getSize();
        if (fileSize == 0) {
            return -1;
        }

        if (isEOF) {
            return -1;
        }

        if (stringAtTempBoundary) {
            //append entire temp string residual from previous read()
            //because qualified string was broken down into 2 parts
            appendResetTemp();

            stringAtTempBoundary = false;
            //there could be more to this string in fscontent/buffer
        }

        boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
        int newCurLen = curStringLen + tempStringLen;

        while (newCurLen < len) {
            //need to extract more strings
            if (readBufOffset > bytesInReadBuf - 1) {
                //no more bytes to process into strings, read them
                try {
                    bytesInReadBuf = 0;
                    bytesInReadBuf = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);
                } catch (TskException ex) {
                    if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
                        appendResetTemp();
                        //have some extracted string, return that, and fail next time
                        isEOF = true;
                        int copied = copyToReturn(b, off, len);
                        return copied;
                    } else {
                        return -1; //EOF
                    }
                }
                if (bytesInReadBuf < 1) {
                    if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
                        appendResetTemp();
                        //have some extracted string, return that, and fail next time
                        isEOF = true;
                        int copied = copyToReturn(b, off, len);
                        return copied;
                    } else {
                        return -1; //EOF
                    }
                }
                //increment content offset for next read
                contentOffset += bytesInReadBuf;
                //reset read buf position
                readBufOffset = 0;
            }
            //get char from cur read buf
            char c = (char) curReadBuf[readBufOffset++];
            if (c == 0 && singleConsecZero == false) {
                //preserve the current sequence if max consec. 1 zero char
                singleConsecZero = true;
            } else {
                singleConsecZero = false;
            }
            if (StringExtract.isPrintableAscii(c)) {
                tempString.append(c);
                ++tempStringLen;
                if (tempStringLen >= MIN_PRINTABLE_CHARS) {
                    inString = true;
                }

                //boundary case when temp has still chars - handled after the loop
            } else if (!singleConsecZero) {
                //break the string, clear temp
                if (tempStringLen >= MIN_PRINTABLE_CHARS
                        || stringAtBufBoundary) {
                    //append entire temp string with new line
                    tempString.append(NLS);
                    ++tempStringLen;

                    curString.append(tempString);
                    curStringLen += tempStringLen;

                    stringAtBufBoundary = false;
                }
                //reset temp
                tempString = new StringBuilder();
                tempStringLen = 0;
            }

            newCurLen = curStringLen + tempStringLen;
        }

        //check if still in string state, so that next chars in read buf bypass min chars check
        //and qualify as string even if less < min chars required
        if (inString) {
            inString = false; //reset
            stringAtBufBoundary = true; //will bypass the check
        }

        //check if temp still has chars to qualify as a string
        //we might need to break up temp into 2 parts for next read() call
        //consume as many as possible to fill entire user buffer
        if (tempStringLen >= MIN_PRINTABLE_CHARS) {
            if (newCurLen > len) {
                int appendChars = len - curStringLen;
                //save part for next user read(), need to break up temp string
                //do not append new line
                String toAppend = tempString.substring(0, appendChars);
                String newTemp = tempString.substring(appendChars);

                curString.append(toAppend);
                curStringLen += appendChars;

                tempString = new StringBuilder(newTemp);
                tempStringLen = newTemp.length();

                stringAtTempBoundary = true;

            } else {
                //append entire temp
                curString.append(tempString);
                curStringLen += tempStringLen;

                //reset temp
                tempString = new StringBuilder();
                tempStringLen = 0;

            }
        } else {
            //if temp has a few chars, not qualified as string for now,
            //will be processed during next read() call
        }

        //copy current strings to user
        final int copied = copyToReturn(b, off, len);
        //there may be still chars in read buffer or tempString, for next read()

        return copied;
    }

    //append temp buffer to cur string buffer and reset temp, if enough chars
    //does not append new line
    private void appendResetTemp() {
        if (tempStringLen >= MIN_PRINTABLE_CHARS) {
            curString.append(tempString);
            curStringLen += tempStringLen;
            tempString = new StringBuilder();
            tempStringLen = 0;
        }
    }

    //copy currently extracted string to user buffer
    //and reset for next read() call
    private int copyToReturn(byte[] b, int off, long len) {

        final String curStringS = curString.toString();
        //logger.log(Level.INFO, curStringS);
        byte[] stringBytes = curStringS.getBytes(outputCharset);
        System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
        //logger.log(Level.INFO, curStringS);
        //copied all string, reset
        curString = new StringBuilder();
        int ret = curStringLen;
        curStringLen = 0;
        return ret;

    }

    @Override
    public int read() throws IOException {
        final int read = read(oneCharBuf, 0, 1);
        if (read == 1) {
            return oneCharBuf[0];
        } else {
            return -1;
        }
    }

    @Override
    public int available() throws IOException {
        //we don't know how many bytes in curReadBuf may end up as strings
        return 0;
    }

    @Override
    public long skip(long n) throws IOException {
        //use default implementation that reads into skip buffer
        //but it could be more efficient
        return super.skip(n);
    }
}
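Note: to make the division of labor between the two string streams concrete, a hypothetical selection sketch (the flag and variable names are illustrative only):

    // Hypothetical sketch: choose a string-extraction stream for a file.
    // AbstractFileStringStream: fast, ASCII/Latin-only, brute-force extraction.
    // AbstractFileStringIntStream: script-aware extraction built on StringExtract.
    InputStream stringStream;
    if (latinOnly) { // assumed configuration flag
        stringStream = new AbstractFileStringStream(file, StandardCharsets.UTF_8);
    } else {
        stringStream = new AbstractFileStringIntStream(
                file, enabledScripts, true, true, StandardCharsets.UTF_8); // assumed script list
    }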
@@ -0,0 +1,143 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.logging.Level;
import org.apache.commons.io.IOUtils;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;

/**
 * Extracts text from artifacts by concatenating the values of all of the
 * artifact's attributes.
 */
public class ArtifactTextExtractor extends TextExtractor<BlackboardArtifact> {

    static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName());

    /**
     * Get the Content that is the data source for the given artifact.
     * //JMTODO: is there a preexisting method to do this?
     *
     * @param artifact
     *
     * @return The data source for the given artifact as a Content object, or
     *         null if it could not be found.
     *
     * @throws TskCoreException if there is a problem accessing the case db.
     */
    static Content getDataSource(BlackboardArtifact artifact) throws TskCoreException {

        Case currentCase;
        try {
            currentCase = Case.getCurrentCase();
        } catch (IllegalStateException ignore) {
            // thrown by Case.getCurrentCase() if currentCase is null
            return null;
        }

        SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
        if (sleuthkitCase == null) {
            return null;
        }
        Content dataSource;
        AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
        if (abstractFile != null) {
            dataSource = abstractFile.getDataSource();
        } else {
            dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
        }

        if (dataSource == null) {
            return null;
        }
        return dataSource;
    }

    @Override
    boolean isDisabled() {
        return false;
    }

    @Override
    InputStream getInputStream(BlackboardArtifact artifact) {
        // Concatenate the string values of all attributes into a single
        // "content" string to be indexed.
        StringBuilder artifactContents = new StringBuilder();

        try {
            Content dataSource = getDataSource(artifact);
            if (dataSource == null) {
                return null;
            }

            for (BlackboardAttribute attribute : artifact.getAttributes()) {
                artifactContents.append(attribute.getAttributeType().getDisplayName());
                artifactContents.append(" : ");
                // We have also discussed modifying BlackboardAttribute.getDisplayString()
                // to magically format datetime attributes but that is complicated by
                // the fact that BlackboardAttribute exists in the Sleuthkit data model
                // while the utility to determine the timezone to use is in ContentUtils
                // in the Autopsy datamodel.
                switch (attribute.getValueType()) {
                    case DATETIME:
                        artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
                        break;
                    default:
                        artifactContents.append(attribute.getDisplayString());
                }
                artifactContents.append(System.lineSeparator());
            }
        } catch (TskCoreException ex) {
            logger.log(Level.SEVERE, "There was a problem getting the attributes for artifact " + artifact.getArtifactID(), ex);
            return null;
        }
        if (artifactContents.length() == 0) {
            return null;
        }
        return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8);
    }

    @Override
    Reader getReader(InputStream stream, BlackboardArtifact source) throws Ingester.IngesterException {
        return new InputStreamReader(stream, StandardCharsets.UTF_8);
    }

    @Override
    long getID(BlackboardArtifact source) {
        return source.getArtifactID();
    }

    @Override
    String getName(BlackboardArtifact source) {
        return source.getDisplayName() + "_" + source.getArtifactID();
    }
}
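Note: a minimal sketch of exercising the new artifact extractor (the artifact instance is assumed to exist; logging details are omitted):

    // Hypothetical sketch: extract and read the text for one blackboard artifact.
    ArtifactTextExtractor extractor = new ArtifactTextExtractor();
    try {
        InputStream stream = extractor.getInputStream(artifact); // null if there is nothing to index
        if (stream != null) {
            Reader reader = extractor.getReader(stream, artifact);
            // feed reader to the chunking/indexing code, keyed by
            // extractor.getID(artifact) and extractor.getName(artifact)
        }
    } catch (Ingester.IngesterException ex) {
        // log and skip this artifact
    }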
@@ -1,102 +0,0 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;

import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.datamodel.AbstractContent;

/**
 * Stream of bytes representing a string with specified encoding, to feed into
 * Solr as a ContentStream.
 */
class ByteContentStream implements ContentStream {

    //input
    private byte[] content; //extracted subcontent
    private long contentSize;
    private AbstractContent aContent; //origin
    private Charset charset; //output byte stream charset of encoded strings

    private InputStream stream;

    private static Logger logger = Logger.getLogger(ByteContentStream.class.getName());

    public ByteContentStream(byte[] content, long contentSize, AbstractContent aContent, Charset charset) {
        this.content = content;
        this.contentSize = contentSize; //record the size so getSize() reports it
        this.aContent = aContent;
        this.charset = charset;
        stream = new ByteArrayInputStream(content, 0, (int) contentSize);
    }

    public byte[] getByteContent() {
        return content;
    }

    public AbstractContent getSourceContent() {
        return aContent;
    }

    @Override
    public String getContentType() {
        return "text/plain;charset=" + charset.name(); //NON-NLS
    }

    @Override
    public String getName() {
        return aContent.getName();
    }

    @Override
    public Reader getReader() throws IOException {
        return new InputStreamReader(stream);
    }

    @Override
    public Long getSize() {
        return contentSize;
    }

    @Override
    public String getSourceInfo() {
        return NbBundle.getMessage(this.getClass(), "ByteContentStream.getSrcInfo.text", aContent.getId());
    }

    @Override
    public InputStream getStream() throws IOException {
        return stream;
    }

    @Override
    protected void finalize() throws Throwable {
        super.finalize();

        stream.close();
    }

}
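Note: a quick, hedged illustration of the intended use (the chunk text and source file are assumed; the charset is just an example):

    // Hypothetical sketch: wrap one chunk of encoded text for Solr.
    byte[] encoded = chunkText.getBytes(StandardCharsets.UTF_8); // chunkText assumed
    ByteContentStream bcs = new ByteContentStream(encoded, encoded.length, sourceFile, StandardCharsets.UTF_8);
    // bcs.getContentType() reports "text/plain;charset=UTF-8";
    // bcs.getStream() returns the bytes and bcs.getSize() their length.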
@@ -0,0 +1,112 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.InputStream;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;
import org.sleuthkit.datamodel.AbstractFile;

/**
 * Common methods for utilities that extract text and content and divide it
 * into chunks.
 */
abstract class FileTextExtractor extends TextExtractor<AbstractFile> {

    static final List<String> BLOB_MIME_TYPES
            = Arrays.asList(
                    //ignore binary blob data, for which string extraction will be used
                    "application/octet-stream", //NON-NLS
                    "application/x-msdownload"); //NON-NLS

    /** Generally, text extractors should ignore archives and let unpacking
     * modules take care of them. */
    static final List<String> ARCHIVE_MIME_TYPES
            = Arrays.asList(
                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                    "application/x-7z-compressed", //NON-NLS
                    "application/x-ace-compressed", //NON-NLS
                    "application/x-alz-compressed", //NON-NLS
                    "application/x-arj", //NON-NLS
                    "application/vnd.ms-cab-compressed", //NON-NLS
                    "application/x-cfs-compressed", //NON-NLS
                    "application/x-dgc-compressed", //NON-NLS
                    "application/x-apple-diskimage", //NON-NLS
                    "application/x-gca-compressed", //NON-NLS
                    "application/x-dar", //NON-NLS
                    "application/x-lzx", //NON-NLS
                    "application/x-lzh", //NON-NLS
                    "application/x-rar-compressed", //NON-NLS
                    "application/x-stuffit", //NON-NLS
                    "application/x-stuffitx", //NON-NLS
                    "application/x-gtar", //NON-NLS
                    "application/x-archive", //NON-NLS
                    "application/x-executable", //NON-NLS
                    "application/x-gzip", //NON-NLS
                    "application/zip", //NON-NLS
                    "application/x-zoo", //NON-NLS
                    "application/x-cpio", //NON-NLS
                    "application/x-shar", //NON-NLS
                    "application/x-tar", //NON-NLS
                    "application/x-bzip", //NON-NLS
                    "application/x-bzip2", //NON-NLS
                    "application/x-lzip", //NON-NLS
                    "application/x-lzma", //NON-NLS
                    "application/x-lzop", //NON-NLS
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS

    /**
     * Determines whether the extractor works only for the types specified by
     * supportedTypes(), or whether it is a generic content extractor (such as
     * the string extractor).
     *
     * @return
     */
    abstract boolean isContentTypeSpecific();

    /**
     * Determines if the file content is supported by the extractor if
     * isContentTypeSpecific() returns true.
     *
     * @param file           to test if its content should be supported
     * @param detectedFormat mime-type with detected format (such as
     *                       text/plain) or null if not detected
     *
     * @return true if the file content is supported, false otherwise
     */
    abstract boolean isSupported(AbstractFile file, String detectedFormat);

    @Override
    abstract Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException;

    @Override
    long getID(AbstractFile source) {
        return source.getId();
    }

    @Override
    String getName(AbstractFile source) {
        return source.getName();
    }

}
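Note: to show what a concrete subclass must supply, here is a stripped-down, hypothetical extractor (not part of this commit; it assumes java.io.InputStreamReader, java.nio.charset.StandardCharsets, and org.sleuthkit.datamodel.ReadContentInputStream are imported, and the parent TextExtractor may require further overrides not shown here):

    // Hypothetical sketch of a FileTextExtractor subclass for plain text files.
    class PlainTextExtractorSketch extends FileTextExtractor {

        @Override
        boolean isContentTypeSpecific() {
            return true; // only handles the MIME type checked below
        }

        @Override
        boolean isSupported(AbstractFile file, String detectedFormat) {
            return "text/plain".equals(detectedFormat); //NON-NLS
        }

        @Override
        Reader getReader(InputStream stream, AbstractFile source) throws Ingester.IngesterException {
            return new InputStreamReader(stream, StandardCharsets.UTF_8);
        }

        @Override
        InputStream getInputStream(AbstractFile file) {
            return new ReadContentInputStream(file);
        }

        @Override
        boolean isDisabled() {
            return false;
        }
    }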
@@ -1,7 +1,7 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012-2013 Basis Technology Corp.
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,36 +21,23 @@ package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;

/**
 * Extractor of text from HTML supported AbstractFile content. Extracted text is
 * divided into chunks and indexed with Solr. If HTML extraction succeeds,
 * chunks are indexed with Solr.
 * Extracts text from AbstractFile HTML content.
 */
class HtmlTextExtractor implements TextExtractor {
class HtmlTextExtractor extends FileTextExtractor {

    private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
    private static Ingester ingester;
    static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    static final int MAX_EXTR_TEXT_CHARS = 31 * 1024;
    private static final int SINGLE_READ_CHARS = 1024;
    private static final int EXTRA_CHARS = 128; //for whitespace
    private static final int MAX_SIZE = 50000000;
    //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
    private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
    private AbstractFile sourceFile;
    private int numChunks = 0;
    private static final int MAX_SIZE = 50_000_000; //50MB

    static final List<String> WEB_MIME_TYPES = Arrays.asList(
            "application/javascript", //NON-NLS
@@ -59,170 +46,124 @@ class HtmlTextExtractor implements TextExtractor {
            "text/css", //NON-NLS
            "text/html", //NON-NLS
            "text/javascript" //NON-NLS
            //"application/xml",
            //"application/xml-dtd",
    );

    HtmlTextExtractor() {
        ingester = Ingester.getDefault();
    }

    @Override
    public boolean setScripts(List<SCRIPT> extractScripts) {
        return false;
    }

    @Override
    public List<SCRIPT> getScripts() {
        return null;
    }

    @Override
    public Map<String, String> getOptions() {
        return null;
    }

    @Override
    public void setOptions(Map<String, String> options) {
    }

    @Override
    public int getNumChunks() {
        return numChunks;
    }

    @Override
    public AbstractFile getSourceFile() {
        return sourceFile;
    }

    @Override
    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
        this.sourceFile = sourceFile;
        numChunks = 0; //unknown until indexing is done

        boolean success = false;
        Reader reader = null;

        final InputStream stream = new ReadContentInputStream(sourceFile);

        try {
            // Parse the stream with Jericho
            JerichoParserWrapper jpw = new JerichoParserWrapper(stream);
            jpw.parse();
            reader = jpw.getReader();

            // In case there is an exception or parse() isn't called
            if (reader == null) {
                logger.log(Level.WARNING, "No reader available from HTML parser"); //NON-NLS
                return false;
            }

            success = true;
            long readSize;
            long totalRead = 0;
            boolean eof = false;
            //we read max 1024 chars at time, this seems to max what this Reader would return
            while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
                if (context.fileIngestIsCancelled()) {
                    ingester.ingest(this);
                    return true;
                }
                totalRead += readSize;

                //consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
                while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
                        && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
                    totalRead += readSize;
                }
                if (readSize == -1) {
                    //this is the last chunk
                    eof = true;
                } else {
                    //try to read until whitespace to not break words
                    while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
                            && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
                            && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
                        totalRead += readSize;
                    }
                    if (readSize == -1) {
                        //this is the last chunk
                        eof = true;
                    }
                }

                //logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());
                //encode to bytes to index as byte stream
                String extracted;

                //add BOM and trim the 0 bytes
                //set initial size to chars read + bom - try to prevent from resizing
                StringBuilder sb = new StringBuilder((int) totalRead + 1000);
                //inject BOM here (saves byte buffer realloc later), will be converted to specific encoding BOM
                //sb.append(UTF16BOM); disabled BOM, not needing as bypassing Tika
                if (totalRead < MAX_EXTR_TEXT_CHARS) {
                    sb.append(textChunkBuf, 0, (int) totalRead);
                } else {
                    sb.append(textChunkBuf);
                }

                //reset for next chunk
                totalRead = 0;
                extracted = sb.toString();

                //converts BOM automatically to charSet encoding
                byte[] encodedBytes = extracted.getBytes(outCharset);
                AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
                try {
                    chunk.index(ingester, encodedBytes, encodedBytes.length, outCharset);
                    ++this.numChunks;
                } catch (Ingester.IngesterException ingEx) {
                    success = false;
                    logger.log(Level.WARNING, "Ingester had a problem with extracted HTML from file '" //NON-NLS
                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
                    throw ingEx; //need to rethrow/return to signal error and move on
                }
            }
        } catch (IOException ex) {
            logger.log(Level.WARNING, "Unable to read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
            success = false;
        } catch (Exception ex) {
            logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
            success = false;
        } finally {
            try {
                stream.close();
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content stream from " + sourceFile.getId(), ex); //NON-NLS
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
            }
        }

        //after all chunks, ingest the parent file without content itself, and store numChunks
        ingester.ingest(this);
        return success;
    }

    @Override
    public boolean isContentTypeSpecific() {
    boolean isContentTypeSpecific() {
        return true;
    }

    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        if (detectedFormat == null) {
            return false;
        } else if (WEB_MIME_TYPES.contains(detectedFormat) && file.getSize() <= MAX_SIZE) {
            return true;
        } else {
            return false;
        }
    boolean isSupported(AbstractFile file, String detectedFormat) {
        return detectedFormat != null
                && WEB_MIME_TYPES.contains(detectedFormat)
                && file.getSize() <= MAX_SIZE;
    }

    @Override
    Reader getReader(InputStream in, AbstractFile sourceFile) throws Ingester.IngesterException {
        //Parse the stream with Jericho and put the results in a Reader
        try {
            StringBuilder scripts = new StringBuilder();
            StringBuilder links = new StringBuilder();
            StringBuilder images = new StringBuilder();
            StringBuilder comments = new StringBuilder();
            StringBuilder others = new StringBuilder();
            int numScripts = 0;
            int numLinks = 0;
            int numImages = 0;
            int numComments = 0;
            int numOthers = 0;

            Source source = new Source(in);
            source.fullSequentialParse();
            Renderer renderer = source.getRenderer();
            renderer.setNewLine("\n");
            renderer.setIncludeHyperlinkURLs(false);
            renderer.setDecorateFontStyles(false);
            renderer.setIncludeAlternateText(false);

            String text = renderer.toString();
            // Get all the tags in the source
            List<StartTag> tags = source.getAllStartTags();

            StringBuilder stringBuilder = new StringBuilder();
            for (StartTag tag : tags) {
                if (tag.getName().equals("script")) { //NON-NLS
                    // If the <script> tag has attributes
                    numScripts++;
                    scripts.append(numScripts).append(") ");
                    if (tag.getTagContent().length() > 0) {
                        scripts.append(tag.getTagContent()).append(" ");
                    }
                    // Get what's between the <script> .. </script> tags
                    scripts.append(tag.getElement().getContent()).append("\n");

                } else if (tag.getName().equals("a")) { //NON-NLS
                    numLinks++;
                    links.append(numLinks).append(") ");
                    links.append(tag.getTagContent()).append("\n");

                } else if (tag.getName().equals("img")) { //NON-NLS
                    numImages++;
                    images.append(numImages).append(") ");
                    images.append(tag.getTagContent()).append("\n");

                } else if (tag.getTagType().equals(StartTagType.COMMENT)) {
                    numComments++;
                    comments.append(numComments).append(") ");
                    comments.append(tag.getTagContent()).append("\n");

                } else {
                    // Make sure it has an attribute
                    Attributes atts = tag.getAttributes();
                    if (atts != null && atts.length() > 0) {
                        numOthers++;
                        others.append(numOthers).append(") ");
                        others.append(tag.getName()).append(":");
                        others.append(tag.getTagContent()).append("\n");

                    }
                }
            }
            stringBuilder.append(text).append("\n\n");
            stringBuilder.append("----------NONVISIBLE TEXT----------\n\n"); //NON-NLS
            if (numScripts > 0) {
                stringBuilder.append("---Scripts---\n"); //NON-NLS
                stringBuilder.append(scripts).append("\n");
            }
            if (numLinks > 0) {
                stringBuilder.append("---Links---\n"); //NON-NLS
                stringBuilder.append(links).append("\n");
            }
            if (numImages > 0) {
                stringBuilder.append("---Images---\n"); //NON-NLS
                stringBuilder.append(images).append("\n");
            }
            if (numComments > 0) {
                stringBuilder.append("---Comments---\n"); //NON-NLS
                stringBuilder.append(comments).append("\n");
            }
            if (numOthers > 0) {
                stringBuilder.append("---Others---\n"); //NON-NLS
                stringBuilder.append(others).append("\n");
            }
            // All done, now make it a reader
            return new StringReader(stringBuilder.toString());
        } catch (IOException ex) {
            throw new Ingester.IngesterException("Error extracting HTML from content.", ex);
        }
    }

    @Override
    InputStream getInputStream(AbstractFile sourceFile1) {
        return new ReadContentInputStream(sourceFile1);
    }

    @Override
    boolean isDisabled() {
        return false;
    }
}
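Note: for reference, a hedged sketch of driving the refactored HTML extractor (a no-argument constructor and an already-detected MIME type are assumed):

    // Hypothetical sketch: get a Reader of extracted HTML text for a supported file.
    HtmlTextExtractor htmlExtractor = new HtmlTextExtractor();
    if (htmlExtractor.isSupported(file, mimeType)) { // mimeType assumed detected elsewhere
        try (Reader reader = htmlExtractor.getReader(htmlExtractor.getInputStream(file), file)) {
            // read chunks from reader and hand them to the Ingester
        } catch (Ingester.IngesterException | IOException ex) {
            // log and skip this file
        }
    }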
@@ -18,49 +18,45 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.SolrInputDocument;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.TextUtil;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractContent;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ContentVisitor;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.DerivedFile;
import org.sleuthkit.datamodel.Directory;
import org.sleuthkit.datamodel.File;
import org.sleuthkit.datamodel.LayoutFile;
import org.sleuthkit.datamodel.LocalFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.SlackFile;
import org.sleuthkit.datamodel.SleuthkitItemVisitor;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;
import org.sleuthkit.datamodel.TskCoreException;

/**
 * Handles indexing files on a Solr core.
 */
//JMTODO: Should this class really be a singleton?
class Ingester {

    private static final Logger logger = Logger.getLogger(Ingester.class.getName());
    private volatile boolean uncommitedIngests = false;
    private final Server solrServer = KeywordSearch.getServer();
    private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV();
    private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
    private static Ingester instance;

    //for ingesting chunk as SolrInputDocument (non-content-streaming, by-pass tika)
    //TODO use a streaming way to add content to /update handler
    private static final int MAX_DOC_CHUNK_SIZE = 32 * 1024;
    private static final String ENCODING = "UTF-8"; //NON-NLS
    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //chars
    private static final int SINGLE_READ_CHARS = 1024;
    private static final int EXTRA_CHARS = 128;

    private Ingester() {
    }
@@ -72,6 +68,7 @@ class Ingester {
        return instance;
    }

    //JMTODO: this is probably useless
    @Override
    @SuppressWarnings("FinalizeDeclaration")
    protected void finalize() throws Throwable {
@ -84,123 +81,68 @@ class Ingester {
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a stream to Solr to have its content extracted and added to the
|
||||
* index. commit() should be called once you're done ingesting files.
|
||||
* Sends the metadata (name, MAC times, image id, etc) for the given file to
|
||||
* Solr to be added to the index. commit() should be called once you're done
|
||||
* indexing.
|
||||
*
|
||||
* @param afscs File AbstractFileStringContentStream to ingest
|
||||
* @param file File to index.
|
||||
*
|
||||
* @throws IngesterException if there was an error processing a specific
|
||||
* file, but the Solr server is probably fine.
|
||||
*/
|
||||
void ingest(AbstractFileStringContentStream afscs) throws IngesterException {
|
||||
Map<String, String> params = getContentFields(afscs.getSourceContent());
|
||||
ingest(afscs, params, afscs.getSourceContent().getSize());
|
||||
void indexMetaDataOnly(AbstractFile file) throws IngesterException {
|
||||
indexChunk("", file.getName(), getContentFields(file));
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a TextExtractor to Solr to have its content extracted and added to
|
||||
* the index. commit() should be called once you're done ingesting files.
|
||||
* FileExtract represents a parent of extracted file with actual content.
|
||||
* The parent itself has no content, only meta data and is used to associate
|
||||
* the extracted AbstractFileChunk
|
||||
* Sends the metadata (artifact id, image id, etc) for the given artifact to
|
||||
* Solr to be added to the index. commit() should be called once you're done
|
||||
* indexing.
|
||||
*
|
||||
* @param fe TextExtractor to ingest
|
||||
* @param artifact The artifact to index.
|
||||
*
|
||||
* @throws IngesterException if there was an error processing a specific
|
||||
* file, but the Solr server is probably fine.
|
||||
* artifact, but the Solr server is probably fine.
|
||||
*/
|
||||
void ingest(TextExtractor fe) throws IngesterException {
|
||||
Map<String, String> params = getContentFields(fe.getSourceFile());
|
||||
|
||||
params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(fe.getNumChunks()));
|
||||
|
||||
ingest(new NullContentStream(fe.getSourceFile()), params, 0);
|
||||
void indexMetaDataOnly(BlackboardArtifact artifact) throws IngesterException {
|
||||
indexChunk("", new ArtifactTextExtractor().getName(artifact), getContentFields(artifact));
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a AbstractFileChunk to Solr and its extracted content stream to be
|
||||
* added to the index. commit() should be called once you're done ingesting
|
||||
* files. AbstractFileChunk represents a file chunk and its chunk content.
|
||||
* Creates a field map from a SleuthkitVisitableItem, that is later sent to
|
||||
* Solr.
|
||||
*
|
||||
* @param fec AbstractFileChunk to ingest
|
||||
* @param size approx. size of the stream in bytes, used for timeout
|
||||
* estimation
|
||||
* @param item SleuthkitVisitableItem to get fields from
|
||||
*
|
||||
* @throws IngesterException if there was an error processing a specific
|
||||
* file, but the Solr server is probably fine.
|
||||
* @return the map from field name to value (as a string)
|
||||
*/
|
||||
void ingest(AbstractFileChunk fec, ByteContentStream bcs, int size) throws IngesterException {
|
||||
AbstractContent sourceContent = bcs.getSourceContent();
|
||||
Map<String, String> params = getContentFields(sourceContent);
|
||||
|
||||
//overwrite id with the chunk id
|
||||
params.put(Server.Schema.ID.toString(),
|
||||
Server.getChunkIdString(sourceContent.getId(), fec.getChunkNumber()));
|
||||
|
||||
ingest(bcs, params, size);
|
||||
private Map<String, String> getContentFields(SleuthkitVisitableItem item) {
|
||||
return item.accept(SOLR_FIELDS_VISITOR);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a file to Solr to have its content extracted and added to the
|
||||
* index. commit() should be called once you're done ingesting files. If the
|
||||
* file is a directory or ingestContent is set to false, the file name is
|
||||
* indexed only.
|
||||
*
|
||||
* @param file File to ingest
|
||||
* @param ingestContent if true, index the file and the content, otherwise
* index metadata only
|
||||
*
|
||||
* @throws IngesterException if there was an error processing a specific
|
||||
* file, but the Solr server is probably fine.
|
||||
* Visitor used to create fields to send to SOLR index.
|
||||
*/
|
||||
void ingest(AbstractFile file, boolean ingestContent) throws IngesterException {
|
||||
if (ingestContent == false || file.isDir()) {
|
||||
ingest(new NullContentStream(file), getContentFields(file), 0);
|
||||
} else {
|
||||
ingest(new FscContentStream(file), getContentFields(file), file.getSize());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a field map from FsContent, that is later sent to Solr
|
||||
*
|
||||
* @param fsc FsContent to get fields from
|
||||
*
|
||||
* @return the map
|
||||
*/
|
||||
private Map<String, String> getContentFields(AbstractContent fsc) {
|
||||
return fsc.accept(getContentFieldsV);
|
||||
}
|
||||
|
||||
/**
|
||||
* Visitor used to create param list to send to SOLR index.
|
||||
*/
|
||||
private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
|
||||
static private class SolrFieldsVisitor extends SleuthkitItemVisitor.Default<Map<String, String>> {
|
||||
|
||||
@Override
|
||||
protected Map<String, String> defaultVisit(Content cntnt) {
|
||||
protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
|
||||
return new HashMap<>();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> visit(File f) {
|
||||
Map<String, String> params = getCommonFields(f);
|
||||
getCommonFileContentFields(params, f);
|
||||
return params;
|
||||
return getCommonAndMACTimeFields(f);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> visit(DerivedFile df) {
|
||||
Map<String, String> params = getCommonFields(df);
|
||||
getCommonFileContentFields(params, df);
|
||||
return params;
|
||||
return getCommonAndMACTimeFields(df);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> visit(Directory d) {
|
||||
Map<String, String> params = getCommonFields(d);
|
||||
getCommonFileContentFields(params, d);
|
||||
return params;
|
||||
return getCommonAndMACTimeFields(d);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -211,19 +153,25 @@ class Ingester {
|
||||
|
||||
@Override
|
||||
public Map<String, String> visit(LocalFile lf) {
|
||||
Map<String, String> params = getCommonFields(lf);
|
||||
getCommonFileContentFields(params, lf);
|
||||
return params;
|
||||
return getCommonAndMACTimeFields(lf);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> visit(SlackFile f) {
|
||||
Map<String, String> params = getCommonFields(f);
|
||||
getCommonFileContentFields(params, f);
|
||||
return params;
|
||||
return getCommonAndMACTimeFields(f);
|
||||
}
|
||||
|
||||
private Map<String, String> getCommonFileContentFields(Map<String, String> params, AbstractFile file) {
|
||||
/**
|
||||
* Get the field map for AbstractFiles that includes MAC times and the
|
||||
* fields that are common to all file classes.
|
||||
*
|
||||
* @param file The file to get fields for
|
||||
*
|
||||
* @return The field map, including MAC times and common fields, for the
|
||||
* given file.
|
||||
*/
|
||||
private Map<String, String> getCommonAndMACTimeFields(AbstractFile file) {
|
||||
Map<String, String> params = getCommonFields(file);
|
||||
params.put(Server.Schema.CTIME.toString(), ContentUtils.getStringTimeISO8601(file.getCtime(), file));
|
||||
params.put(Server.Schema.ATIME.toString(), ContentUtils.getStringTimeISO8601(file.getAtime(), file));
|
||||
params.put(Server.Schema.MTIME.toString(), ContentUtils.getStringTimeISO8601(file.getMtime(), file));
|
||||
@ -231,140 +179,219 @@ class Ingester {
|
||||
return params;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the field map for AbstractFiles that is common to all file
|
||||
* classes
|
||||
*
|
||||
* @param file The file to get fields for
|
||||
*
|
||||
* @return The field map of fields that are common to all file classes.
|
||||
*/
|
||||
private Map<String, String> getCommonFields(AbstractFile af) {
|
||||
Map<String, String> params = new HashMap<>();
|
||||
params.put(Server.Schema.ID.toString(), Long.toString(af.getId()));
|
||||
try {
|
||||
long dataSourceId = af.getDataSource().getId();
|
||||
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSourceId));
|
||||
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(af.getDataSource().getId()));
|
||||
} catch (TskCoreException ex) {
|
||||
logger.log(Level.SEVERE, "Could not get data source id to properly index the file {0}", af.getId()); //NON-NLS
|
||||
logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + af.getId(), ex); //NON-NLS
|
||||
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
|
||||
}
|
||||
|
||||
params.put(Server.Schema.FILE_NAME.toString(), af.getName());
|
||||
return params;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the field map for artifacts.
|
||||
*
|
||||
* @param artifact The artifact to get fields for.
|
||||
*
|
||||
* @return The field map for the given artifact.
|
||||
*/
|
||||
@Override
|
||||
public Map<String, String> visit(BlackboardArtifact artifact) {
|
||||
Map<String, String> params = new HashMap<>();
|
||||
params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
|
||||
try {
|
||||
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(ArtifactTextExtractor.getDataSource(artifact).getId()));
|
||||
} catch (TskCoreException ex) {
|
||||
logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
|
||||
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
|
||||
}
|
||||
return params;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Indexing method that bypasses Tika and assumes pure text. It reads and
|
||||
* converts the entire content stream to string, assuming UTF8 since we
|
||||
* can't use streaming approach for Solr /update handler. This should be
|
||||
* safe, since all content is now in max 1MB chunks.
|
||||
* Use the given TextExtractor to extract text from the given source. The
|
||||
* text will be chunked and each chunk passed to Solr to add to the index.
|
||||
*
|
||||
*
|
||||
* @param <A> The type of the Appendix provider that provides
|
||||
* additional text to append to the final chunk.
|
||||
* @param <T> A subclass of SleuthkitVisitableItem.
|
||||
* @param extractor The TextExtractor that will be used to extract text from
|
||||
* the given source.
|
||||
* @param source The source from which text will be extracted, chunked,
|
||||
* and indexed.
|
||||
* @param context The ingest job context that can be used to cancel this
|
||||
* process.
|
||||
*
|
||||
* @return True if this method executed normally, or false if there was an
|
||||
* unexpected exception. //JMTODO: This policy needs to be reviewed.
|
||||
*
|
||||
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
|
||||
*/
|
||||
< T extends SleuthkitVisitableItem> boolean indexText(TextExtractor< T> extractor, T source, IngestJobContext context) throws Ingester.IngesterException {
|
||||
final long sourceID = extractor.getID(source);
|
||||
final String sourceName = extractor.getName(source);
|
||||
|
||||
int numChunks = 0; //unknown until chunking is done
|
||||
|
||||
if (extractor.isDisabled()) {
|
||||
/* some extractors, notably the strings extractor, have options which
|
||||
* can be configured such that no extraction should be done */
|
||||
return true;
|
||||
}
|
||||
|
||||
Map<String, String> fields = getContentFields(source);
|
||||
//Get a stream and a reader for that stream
|
||||
try (final InputStream stream = extractor.getInputStream(source);
|
||||
Reader reader = extractor.getReader(stream, source);) {
|
||||
|
||||
//we read at most 1024 chars at a time, which seems to be the most some Readers will return
|
||||
char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
|
||||
|
||||
boolean eof = false; //have we read until the end of the file yet
|
||||
while (!eof) {
|
||||
int chunkSizeInChars = 0; // the size in chars of the chunk (so far)
|
||||
if (context != null && context.fileIngestIsCancelled()) {
|
||||
return true;
|
||||
}
|
||||
long charsRead = 0; // number of chars read in the most recent read operation
|
||||
//consume bytes to fill entire chunk (but leave EXTRA_CHARS to end the word)
|
||||
while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
|
||||
&& (charsRead = reader.read(textChunkBuf, chunkSizeInChars, SINGLE_READ_CHARS)) != -1) {
|
||||
chunkSizeInChars += charsRead;
|
||||
}
|
||||
|
||||
if (charsRead == -1) {
|
||||
//this is the last chunk
|
||||
eof = true;
|
||||
} else {
|
||||
chunkSizeInChars += charsRead;
|
||||
|
||||
//if we haven't reached the end of the file,
|
||||
//try to read char-by-char until whitespace to not break words
|
||||
while ((chunkSizeInChars < MAX_EXTR_TEXT_CHARS - 1)
|
||||
&& (Character.isWhitespace(textChunkBuf[chunkSizeInChars - 1]) == false)
|
||||
&& (charsRead = reader.read(textChunkBuf, chunkSizeInChars, 1)) != -1) {
|
||||
chunkSizeInChars += charsRead;
|
||||
}
|
||||
if (charsRead == -1) {
|
||||
//this is the last chunk
|
||||
eof = true;
|
||||
}
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(chunkSizeInChars)
|
||||
.append(textChunkBuf, 0, chunkSizeInChars);
|
||||
|
||||
sanitizeToUTF8(sb); //replace non UTF8 chars with '^'
|
||||
|
||||
String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
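//e.g. for a source with object id 42, the third chunk gets the Solr document id "42_3"
//(assuming Server.getChunkIdString() joins object id and chunk number with an
//underscore, as the artifact indexing code later in this diff does by hand)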
|
||||
fields.put(Server.Schema.ID.toString(), chunkId);
|
||||
try {
|
||||
//pass the chunk to method that adds it to Solr index
|
||||
indexChunk(sb.toString(), sourceName, fields);
|
||||
numChunks++;
|
||||
} catch (Ingester.IngesterException ingEx) {
|
||||
extractor.logWarning("Ingester had a problem with extracted string from file '" //NON-NLS
|
||||
+ sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
|
||||
|
||||
throw ingEx; //need to rethrow to signal error and move on
|
||||
} catch (Exception ex) {
|
||||
throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex);
|
||||
}
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
||||
return false;
|
||||
} catch (Exception ex) {
|
||||
extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
|
||||
return false;
|
||||
} finally {
|
||||
//after all chunks, index just the meta data, including the numChunks, of the parent file
|
||||
fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
|
||||
fields.put(Server.Schema.ID.toString(), Long.toString(sourceID)); //reset id field to base document id
|
||||
indexChunk(null, sourceName, fields);
|
||||
}
|
||||
return true;
|
||||
}
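// Usage sketch: a caller typically picks an extractor that supports the file and hands
// it to this method together with the ingest job context, e.g. as
// KeywordSearchIngestModule.extractTextAndIndex() does later in this diff:
//   Ingester.getDefault().indexText(extractor, aFile, context);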
|
||||
|
||||
/**
* Sanitize the given StringBuilder by replacing non-UTF-8 characters with
* caret '^'
*
* @param sb the StringBuilder to sanitize
*
* //JMTODO: use CharSequence.chars() or codePoints() and then a mapping
* function?
*/
private static void sanitizeToUTF8(StringBuilder sb) {
final int length = sb.length();

// Sanitize by replacing non-UTF-8 characters with caret '^'
for (int i = 0; i < length; i++) {
if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
sb.replace(i, i + 1, "^");
}
}
}
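// Sketch of the JMTODO alternative above (illustrative only, not part of the change;
// it relies on the same TextUtil.isValidSolrUTF8(char) check used in the loop):
//   String sanitized = sb.chars()
//           .map(c -> TextUtil.isValidSolrUTF8((char) c) ? c : '^')
//           .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
//           .toString();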
|
||||
|
||||
/**
|
||||
* Add one chunk to the Solr index as a separate Solr document.
|
||||
*
|
||||
* TODO see if we can use a byte or string streaming way to add content to the
* /update handler, e.g. with XMLUpdateRequestHandler (deprecated in Solr
* 4.0.0); see if it is possible to stream with UpdateRequestHandler
|
||||
*
|
||||
* @param cs
|
||||
* @param chunk The chunk content as a string
|
||||
* @param fields
|
||||
* @param size
|
||||
*
|
||||
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
|
||||
*/
|
||||
void ingest(ContentStream cs, Map<String, String> fields, final long size) throws IngesterException {
|
||||
private void indexChunk(String chunk, String sourceName, Map<String, String> fields) throws IngesterException {
|
||||
if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
|
||||
//JMTODO: actually, if we couldn't get the image id it is set to -1,
|
||||
// but does this really mean we don't want to index it?
|
||||
|
||||
//skip the file, image id unknown
|
||||
String msg = NbBundle.getMessage(this.getClass(),
|
||||
"Ingester.ingest.exception.unknownImgId.msg", cs.getName());
|
||||
//JMTODO: does this need to be internationalized?
|
||||
String msg = NbBundle.getMessage(Ingester.class,
|
||||
"Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
|
||||
logger.log(Level.SEVERE, msg);
|
||||
throw new IngesterException(msg);
|
||||
}
|
||||
|
||||
final byte[] docChunkContentBuf = new byte[MAX_DOC_CHUNK_SIZE];
|
||||
//Make a SolrInputDocument out of the field map
|
||||
SolrInputDocument updateDoc = new SolrInputDocument();
|
||||
|
||||
for (String key : fields.keySet()) {
|
||||
updateDoc.addField(key, fields.get(key));
|
||||
}
|
||||
|
||||
//using size here, but we are no longer ingesting entire files
|
||||
//size is normally a chunk size, up to 1MB
|
||||
if (size > 0) {
|
||||
// TODO (RC): Use try with resources, adjust exception messages
|
||||
InputStream is = null;
|
||||
int read = 0;
|
||||
try {
|
||||
is = cs.getStream();
|
||||
read = is.read(docChunkContentBuf);
|
||||
} catch (IOException ex) {
|
||||
throw new IngesterException(
|
||||
NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.cantReadStream.msg",
|
||||
cs.getName()));
|
||||
} finally {
|
||||
if (null != is) {
|
||||
try {
|
||||
is.close();
|
||||
} catch (IOException ex) {
|
||||
logger.log(Level.WARNING, "Could not close input stream after reading content, " + cs.getName(), ex); //NON-NLS
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (read != 0) {
|
||||
String s = "";
|
||||
try {
|
||||
s = new String(docChunkContentBuf, 0, read, ENCODING);
|
||||
// Sanitize by replacing non-UTF-8 characters with caret '^' before adding to index
|
||||
char[] chars = null;
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
if (!TextUtil.isValidSolrUTF8(s.charAt(i))) {
|
||||
// only convert string to char[] if there is a non-UTF8 character
|
||||
if (chars == null) {
|
||||
chars = s.toCharArray();
|
||||
}
|
||||
chars[i] = '^';
|
||||
}
|
||||
}
|
||||
// check if the string was modified (i.e. there was a non-UTF8 character found)
|
||||
if (chars != null) {
|
||||
s = new String(chars);
|
||||
}
|
||||
} catch (UnsupportedEncodingException ex) {
|
||||
logger.log(Level.SEVERE, "Unsupported encoding", ex); //NON-NLS
|
||||
}
|
||||
updateDoc.addField(Server.Schema.CONTENT.toString(), s);
|
||||
} else {
|
||||
updateDoc.addField(Server.Schema.CONTENT.toString(), "");
|
||||
}
|
||||
} else {
|
||||
//no content, such as case when 0th chunk indexed
|
||||
updateDoc.addField(Server.Schema.CONTENT.toString(), "");
|
||||
}
|
||||
//add the content to the SolrInputDocument
|
||||
//JMTODO: can we just add it to the field map before passing that in?
|
||||
updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
|
||||
|
||||
try {
|
||||
//TODO consider timeout thread, or vary socket timeout based on size of indexed content
|
||||
//TODO: consider timeout thread, or vary socket timeout based on size of indexed content
|
||||
solrServer.addDocument(updateDoc);
|
||||
uncommitedIngests = true;
|
||||
} catch (KeywordSearchModuleException ex) {
|
||||
//JMTODO: does this need to be internationalized?
|
||||
throw new IngesterException(
|
||||
NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.err.msg", cs.getName()), ex);
|
||||
NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
* Return the timeout that should be used to index the content.
*
* @param size size of the content
*
* @return time in seconds to use as a timeout
*/
static int getTimeout(long size) {
if (size < 1024 * 1024L) //1MB
{
return 60;
} else if (size < 10 * 1024 * 1024L) //10MB
{
return 1200;
} else if (size < 100 * 1024 * 1024L) //100MB
{
return 3600;
} else {
return 3 * 3600;
}

}
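// Worked example: a 512 KB chunk falls in the first bucket and gets a 60 second
// timeout, a 5 MB stream gets 1200 seconds, a 50 MB stream gets 3600 seconds, and
// anything 100 MB or larger gets 3 * 3600 = 10800 seconds.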
|
||||
|
||||
/**
|
||||
@ -380,92 +407,6 @@ class Ingester {
|
||||
}
|
||||
}
|
||||
|
||||
/**
* ContentStream to read() the data from a FsContent object
*/
private static class FscContentStream implements ContentStream {

private AbstractFile f;

FscContentStream(AbstractFile f) {
this.f = f;
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return f.getName();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSourceInfo() {
|
||||
return NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getSrcInfo", f.getId());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getContentType() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long getSize() {
|
||||
return f.getSize();
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputStream getStream() throws IOException {
|
||||
return new ReadContentInputStream(f);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader getReader() throws IOException {
|
||||
throw new UnsupportedOperationException(
|
||||
NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getReader"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
* ContentStream associated with FsContent, but forced with no content
*/
private static class NullContentStream implements ContentStream {

AbstractContent aContent;

NullContentStream(AbstractContent aContent) {
this.aContent = aContent;
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return aContent.getName();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSourceInfo() {
|
||||
return NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getSrcInfo.text", aContent.getId());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getContentType() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long getSize() {
|
||||
return 0L;
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputStream getStream() throws IOException {
|
||||
return new ByteArrayInputStream(new byte[0]);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader getReader() throws IOException {
|
||||
throw new UnsupportedOperationException(
|
||||
NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getReader"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates that there was an error with the specific ingest operation, but
|
||||
* it's still okay to continue ingesting files.
|
||||
|
@ -103,12 +103,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
|
||||
|
||||
private void reloadScriptsCheckBoxes() {
|
||||
boolean utf16
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
|
||||
enableUTF16Checkbox.setSelected(utf16);
|
||||
|
||||
boolean utf8
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
enableUTF8Checkbox.setSelected(utf8);
|
||||
|
||||
final List<SCRIPT> serviceScripts = KeywordSearchSettings.getStringExtractScripts();
|
||||
@ -127,12 +127,12 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
|
||||
reloadScriptsCheckBoxes();
|
||||
|
||||
boolean utf16
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
|
||||
enableUTF16Checkbox.setSelected(utf16);
|
||||
|
||||
boolean utf8
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
= Boolean.parseBoolean(KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
enableUTF8Checkbox.setSelected(utf8);
|
||||
final boolean extractEnabled = utf16 || utf8;
|
||||
|
||||
@ -257,9 +257,9 @@ class KeywordSearchGlobalLanguageSettingsPanel extends javax.swing.JPanel implem
|
||||
|
||||
@Override
|
||||
public void store() {
|
||||
KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
|
||||
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(),
|
||||
Boolean.toString(enableUTF8Checkbox.isSelected()));
|
||||
KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
|
||||
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(),
|
||||
Boolean.toString(enableUTF16Checkbox.isSelected()));
|
||||
|
||||
if (toUpdate != null) {
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2011-2015 Basis Technology Corp.
|
||||
* Copyright 2011-2016 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -89,7 +89,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
//accessed read-only by searcher thread
|
||||
|
||||
private boolean startedSearching = false;
|
||||
private List<TextExtractor> textExtractors;
|
||||
private List<FileTextExtractor> textExtractors;
|
||||
private StringsTextExtractor stringExtractor;
|
||||
private final KeywordSearchJobSettings settings;
|
||||
private boolean initialized = false;
|
||||
@ -415,24 +415,24 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
* @throws IngesterException exception thrown if indexing failed
|
||||
*/
|
||||
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
|
||||
TextExtractor fileExtract = null;
|
||||
FileTextExtractor extractor = null;
|
||||
|
||||
//go over available text extractors in order, and pick the first one (most specific one)
|
||||
for (TextExtractor fe : textExtractors) {
|
||||
for (FileTextExtractor fe : textExtractors) {
|
||||
if (fe.isSupported(aFile, detectedFormat)) {
|
||||
fileExtract = fe;
|
||||
extractor = fe;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (fileExtract == null) {
|
||||
if (extractor == null) {
|
||||
logger.log(Level.INFO, "No text extractor found for file id:{0}, name: {1}, detected format: {2}", new Object[]{aFile.getId(), aFile.getName(), detectedFormat}); //NON-NLS
|
||||
return false;
|
||||
}
|
||||
|
||||
//logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
|
||||
//divide into chunks and index
|
||||
return fileExtract.index(aFile, context);
|
||||
return Ingester.getDefault().indexText(extractor, aFile, context);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -448,7 +448,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
if (context.fileIngestIsCancelled()) {
|
||||
return true;
|
||||
}
|
||||
if (stringExtractor.index(aFile, KeywordSearchIngestModule.this.context)) {
|
||||
if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) {
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
|
||||
return true;
|
||||
} else {
|
||||
@ -463,26 +463,6 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check with every extractor if it supports the file with the detected
|
||||
* format
|
||||
*
|
||||
* @param aFile file to check for
|
||||
* @param detectedFormat mime-type with detected format (such as
|
||||
* text/plain) or null if not detected
|
||||
*
|
||||
* @return true if text extraction is supported
|
||||
*/
|
||||
private boolean isTextExtractSupported(AbstractFile aFile, String detectedFormat) {
|
||||
for (TextExtractor extractor : textExtractors) {
|
||||
if (extractor.isContentTypeSpecific() == true
|
||||
&& extractor.isSupported(aFile, detectedFormat)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the file to the index. Detects file type, calls extractors, etc.
|
||||
*
|
||||
@ -512,7 +492,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
if (context.fileIngestIsCancelled()) {
|
||||
return;
|
||||
}
|
||||
ingester.ingest(aFile, false); //meta-data only
|
||||
ingester.indexMetaDataOnly(aFile);
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
|
||||
} catch (IngesterException ex) {
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
|
||||
@ -534,12 +514,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
||||
|
||||
// we skip archive formats that are opened by the archive module.
|
||||
// @@@ We could have a check here to see if the archive module was enabled though...
|
||||
if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
|
||||
if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
|
||||
try {
|
||||
if (context.fileIngestIsCancelled()) {
|
||||
return;
|
||||
}
|
||||
ingester.ingest(aFile, false); //meta-data only
|
||||
ingester.indexMetaDataOnly(aFile);
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
|
||||
} catch (IngesterException ex) {
|
||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
|
||||
|
@ -101,8 +101,8 @@ public final class KeywordSearchJobSettingsPanel extends IngestModuleIngestJobSe
|
||||
}
|
||||
|
||||
private void displayEncodings() {
|
||||
String utf8 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
|
||||
String utf16 = KeywordSearchSettings.getStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
|
||||
String utf8 = KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString());
|
||||
String utf16 = KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString());
|
||||
ArrayList<String> encodingsList = new ArrayList<>();
|
||||
if (utf8 == null || Boolean.parseBoolean(utf8)) {
|
||||
encodingsList.add("UTF8");
|
||||
|
@ -23,7 +23,6 @@ import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.logging.Level;
|
||||
|
||||
import org.openide.util.NbBundle;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
import org.sleuthkit.autopsy.coreutils.ModuleSettings;
|
||||
@ -211,14 +210,14 @@ class KeywordSearchSettings {
|
||||
KeywordSearchSettings.setUpdateFrequency(UpdateFrequency.DEFAULT);
|
||||
}
|
||||
//setting default Extract UTF8
|
||||
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
|
||||
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())) {
|
||||
logger.log(Level.INFO, "No configuration for UTF8 found, generating default..."); //NON-NLS
|
||||
KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
|
||||
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
|
||||
}
|
||||
//setting default Extract UTF16
|
||||
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, TextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
|
||||
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_OPTIONS, StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString())) {
|
||||
logger.log(Level.INFO, "No configuration for UTF16 found, generating defaults..."); //NON-NLS
|
||||
KeywordSearchSettings.setStringExtractOption(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
|
||||
KeywordSearchSettings.setStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF16.toString(), Boolean.TRUE.toString());
|
||||
}
|
||||
//setting default Latin-1 Script
|
||||
if (!ModuleSettings.settingExists(KeywordSearchSettings.PROPERTIES_SCRIPTS, SCRIPT.LATIN_1.name())) {
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2015 Basis Technology Corp.
|
||||
* Copyright 2011-2016 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -19,24 +19,16 @@
|
||||
package org.sleuthkit.autopsy.keywordsearch;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.impl.HttpSolrClient;
|
||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||
import org.sleuthkit.datamodel.BlackboardAttribute;
|
||||
import org.sleuthkit.datamodel.TskCoreException;
|
||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
||||
import org.apache.solr.common.util.ContentStreamBase.StringStream;
|
||||
import org.openide.util.lookup.ServiceProvider;
|
||||
import org.sleuthkit.autopsy.casemodule.Case;
|
||||
import org.sleuthkit.autopsy.datamodel.ContentUtils;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.Content;
|
||||
import org.sleuthkit.datamodel.SleuthkitCase;
|
||||
import org.openide.util.NbBundle;
|
||||
import java.net.InetAddress;
|
||||
import java.util.MissingResourceException;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.impl.HttpSolrClient;
|
||||
import org.openide.util.NbBundle;
|
||||
import org.openide.util.lookup.ServiceProvider;
|
||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||
import org.sleuthkit.datamodel.TskCoreException;
|
||||
|
||||
/**
|
||||
* An implementation of the KeywordSearchService interface that uses Solr for
|
||||
@ -49,6 +41,8 @@ public class SolrSearchService implements KeywordSearchService {
|
||||
private static final String SERVER_REFUSED_CONNECTION = "server refused connection"; //NON-NLS
|
||||
private static final int IS_REACHABLE_TIMEOUT_MS = 1000;
|
||||
|
||||
ArtifactTextExtractor extractor = new ArtifactTextExtractor();
|
||||
|
||||
@Override
|
||||
public void indexArtifact(BlackboardArtifact artifact) throws TskCoreException {
|
||||
if (artifact == null) {
|
||||
@ -57,109 +51,14 @@ public class SolrSearchService implements KeywordSearchService {
|
||||
|
||||
// We only support artifact indexing for Autopsy versions that use
|
||||
// the negative range for artifact ids.
|
||||
long artifactId = artifact.getArtifactID();
|
||||
|
||||
if (artifactId > 0) {
|
||||
if (artifact.getArtifactID() > 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
Case currentCase;
|
||||
try {
|
||||
currentCase = Case.getCurrentCase();
|
||||
} catch (IllegalStateException ignore) {
|
||||
// thrown by Case.getCurrentCase() if currentCase is null
|
||||
return;
|
||||
}
|
||||
|
||||
SleuthkitCase sleuthkitCase = currentCase.getSleuthkitCase();
|
||||
if (sleuthkitCase == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
Content dataSource;
|
||||
AbstractFile abstractFile = sleuthkitCase.getAbstractFileById(artifact.getObjectID());
|
||||
if (abstractFile != null) {
|
||||
dataSource = abstractFile.getDataSource();
|
||||
} else {
|
||||
dataSource = sleuthkitCase.getContentById(artifact.getObjectID());
|
||||
}
|
||||
|
||||
if (dataSource == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Concatenate the string values of all attributes into a single
|
||||
// "content" string to be indexed.
|
||||
StringBuilder artifactContents = new StringBuilder();
|
||||
|
||||
for (BlackboardAttribute attribute : artifact.getAttributes()) {
|
||||
artifactContents.append(attribute.getAttributeType().getDisplayName());
|
||||
artifactContents.append(" : ");
|
||||
|
||||
// This is ugly since it will need to updated any time a new
|
||||
// TSK_DATETIME_* attribute is added. A slightly less ugly
|
||||
// alternative would be to assume that all date time attributes
|
||||
// will have a name of the form "TSK_DATETIME*" and check
|
||||
// attribute.getAttributeTypeName().startsWith("TSK_DATETIME*".
|
||||
// The major problem with that approach is that it would require
|
||||
// a round trip to the database to get the type name string.
|
||||
// We have also discussed modifying BlackboardAttribute.getDisplayString()
|
||||
// to magically format datetime attributes but that is complicated by
|
||||
// the fact that BlackboardAttribute exists in Sleuthkit data model
|
||||
// while the utility to determine the timezone to use is in ContentUtils
|
||||
// in the Autopsy datamodel.
|
||||
if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_RCVD.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_SENT.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_START.getTypeID()
|
||||
|| attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_END.getTypeID()) {
|
||||
|
||||
artifactContents.append(ContentUtils.getStringTime(attribute.getValueLong(), dataSource));
|
||||
} else {
|
||||
artifactContents.append(attribute.getDisplayString());
|
||||
}
|
||||
artifactContents.append(System.lineSeparator());
|
||||
}
|
||||
|
||||
if (artifactContents.length() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// To play by the rules of the existing text markup implementations,
|
||||
// we need to (a) index the artifact contents in a "chunk" and
|
||||
// (b) create a separate index entry for the base artifact.
|
||||
// We distinguish artifact content from file content by applying a
|
||||
// mask to the artifact id to make its value > 0x8000000000000000 (i.e. negative).
|
||||
// First, create an index entry for the base artifact.
|
||||
HashMap<String, String> solrFields = new HashMap<>();
|
||||
String documentId = Long.toString(artifactId);
|
||||
|
||||
solrFields.put(Server.Schema.ID.toString(), documentId);
|
||||
|
||||
// Set the IMAGE_ID field.
|
||||
solrFields.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
|
||||
final Ingester ingester = Ingester.getDefault();
|
||||
|
||||
try {
|
||||
Ingester.getDefault().ingest(new StringStream(""), solrFields, 0);
|
||||
} catch (Ingester.IngesterException ex) {
|
||||
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
||||
}
|
||||
|
||||
// Next create the index entry for the document content.
|
||||
// The content gets added to a single chunk. We may need to add chunking
|
||||
// support later.
|
||||
long chunkId = 1;
|
||||
|
||||
documentId += "_" + Long.toString(chunkId);
|
||||
solrFields.replace(Server.Schema.ID.toString(), documentId);
|
||||
|
||||
StringStream contentStream = new StringStream(artifactContents.toString());
|
||||
|
||||
try {
|
||||
Ingester.getDefault().ingest(contentStream, solrFields, contentStream.getSize());
|
||||
ingester.indexMetaDataOnly(artifact);
|
||||
ingester.indexText(extractor, artifact, null);
|
||||
} catch (Ingester.IngesterException ex) {
|
||||
throw new TskCoreException(ex.getCause().getMessage(), ex);
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Autopsy Forensic Browser
|
||||
*
|
||||
* Copyright 2011-2014 Basis Technology Corp.
|
||||
* Copyright 2011-2016 Basis Technology Corp.
|
||||
* Contact: carrier <at> sleuthkit <dot> org
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -20,155 +20,106 @@ package org.sleuthkit.autopsy.keywordsearch;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.Charset;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.logging.Level;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
||||
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
|
||||
import org.sleuthkit.autopsy.ingest.IngestJobContext;
|
||||
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.TskCoreException;
|
||||
import org.sleuthkit.datamodel.TskException;
|
||||
|
||||
/**
|
||||
* Takes an AbstractFile, extract strings, converts into chunks (associated with
|
||||
* the original source file) up to 1MB and then indexes the chunks as text with Solr
|
||||
* Extracts raw strings from AbstractFile content.
|
||||
*/
|
||||
class StringsTextExtractor implements TextExtractor {
|
||||
class StringsTextExtractor extends FileTextExtractor {
|
||||
|
||||
private static Ingester ingester;
|
||||
private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
|
||||
private static final long MAX_STRING_CHUNK_SIZE = 1 * 31 * 1024L;
|
||||
//private static final int BOM_LEN = 3;
|
||||
private static final int BOM_LEN = 0; //disabled prepending of BOM
|
||||
private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
|
||||
private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
|
||||
private AbstractFile sourceFile;
|
||||
private int numChunks = 0;
|
||||
/**
|
||||
* Options for this extractor
|
||||
*/
|
||||
enum ExtractOptions {
|
||||
EXTRACT_UTF16, ///< extract UTF16 text, true/false
|
||||
EXTRACT_UTF8, ///< extract UTF8 text, true/false
|
||||
};
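// Usage note: these option names are persisted as plain strings; the settings code
// elsewhere in this diff reads them back with, e.g.,
//   KeywordSearchSettings.getStringExtractOption(StringsTextExtractor.ExtractOptions.EXTRACT_UTF8.toString())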
|
||||
|
||||
private final List<SCRIPT> extractScripts = new ArrayList<>();
|
||||
private Map<String, String> extractOptions = new HashMap<>();
|
||||
|
||||
//disabled prepending of BOM
|
||||
//static {
|
||||
//prepend UTF-8 BOM to start of the buffer
|
||||
//stringChunkBuf[0] = (byte) 0xEF;
|
||||
//stringChunkBuf[1] = (byte) 0xBB;
|
||||
//stringChunkBuf[2] = (byte) 0xBF;
|
||||
//}
|
||||
public StringsTextExtractor() {
|
||||
ingester = Ingester.getDefault();
|
||||
extractScripts.add(DEFAULT_SCRIPT);
|
||||
//LATIN_2 is the default script
|
||||
extractScripts.add(SCRIPT.LATIN_2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean setScripts(List<SCRIPT> extractScripts) {
|
||||
/**
|
||||
* Sets the scripts to use for the extraction
|
||||
*
|
||||
* @param extractScripts scripts to use
|
||||
*/
|
||||
public void setScripts(List<SCRIPT> extractScripts) {
|
||||
this.extractScripts.clear();
|
||||
this.extractScripts.addAll(extractScripts);
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
/**
|
||||
* Get the currently used scripts for extraction
|
||||
*
|
||||
* @return scripts currently used or null if not supported
|
||||
*/
|
||||
public List<SCRIPT> getScripts() {
|
||||
return new ArrayList<>(extractScripts);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getNumChunks() {
|
||||
return this.numChunks;
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractFile getSourceFile() {
|
||||
return sourceFile;
|
||||
}
|
||||
|
||||
@Override
|
||||
/**
|
||||
* Get current options
|
||||
*
|
||||
* @return currently used, extractor-specific options, or null if not
|
||||
* supported
|
||||
*/
|
||||
public Map<String, String> getOptions() {
|
||||
return extractOptions;
|
||||
}
|
||||
|
||||
@Override
|
||||
/**
|
||||
* Set extractor specific options
|
||||
*
|
||||
* @param options options to use
|
||||
*/
|
||||
public void setOptions(Map<String, String> options) {
|
||||
this.extractOptions = options;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
|
||||
this.sourceFile = sourceFile;
|
||||
this.numChunks = 0; //unknown until indexing is done
|
||||
boolean success = false;
|
||||
boolean isDisabled() {
|
||||
boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
|
||||
final boolean extractUTF8
|
||||
= Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
return extractUTF8 == false && extractUTF16 == false;
|
||||
}
|
||||
|
||||
final boolean extractUTF16
|
||||
= Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
@Override
|
||||
InputStreamReader getReader(final InputStream stringStream, AbstractFile sourceFile) throws Ingester.IngesterException {
|
||||
return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
|
||||
}
|
||||
|
||||
if (extractUTF8 == false && extractUTF16 == false) {
|
||||
//nothing to do
|
||||
return true;
|
||||
}
|
||||
|
||||
InputStream stringStream;
|
||||
@Override
|
||||
InputStream getInputStream(AbstractFile sourceFile) {
|
||||
//check which extract stream to use
|
||||
if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
|
||||
//optimal for english, english only
|
||||
stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
|
||||
return new EnglishOnlyStream(sourceFile);//optimal for english, english only
|
||||
} else {
|
||||
stringStream = new AbstractFileStringIntStream(
|
||||
sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
|
||||
boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
|
||||
boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
|
||||
|
||||
return new InternationalStream(sourceFile, extractScripts, extractUTF8, extractUTF16);
|
||||
}
|
||||
|
||||
try {
|
||||
success = true;
|
||||
//break input stream into chunks
|
||||
|
||||
final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
|
||||
long readSize;
|
||||
while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
|
||||
if (context.fileIngestIsCancelled()) {
|
||||
ingester.ingest(this);
|
||||
return true;
|
||||
}
|
||||
//FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
|
||||
//debug.write(stringChunkBuf, 0, (int)readSize);
|
||||
|
||||
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
|
||||
|
||||
try {
|
||||
chunk.index(ingester, stringChunkBuf, readSize + BOM_LEN, INDEX_CHARSET);
|
||||
++this.numChunks;
|
||||
} catch (IngesterException ingEx) {
|
||||
success = false;
|
||||
logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
|
||||
throw ingEx; //need to rethrow/return to signal error and move on
|
||||
}
|
||||
|
||||
//debug.close();
|
||||
}
|
||||
|
||||
//after all chunks, ingest the parent file without content itself, and store numChunks
|
||||
ingester.ingest(this);
|
||||
|
||||
} catch (IOException ex) {
|
||||
logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex); //NON-NLS
|
||||
success = false;
|
||||
} finally {
|
||||
try {
|
||||
stringStream.close();
|
||||
} catch (IOException ex) {
|
||||
logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex); //NON-NLS
|
||||
}
|
||||
}
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isContentTypeSpecific() {
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -176,4 +127,379 @@ class StringsTextExtractor implements TextExtractor {
|
||||
// strings can be run on anything.
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* AbstractFile input string stream reader/converter - given AbstractFile,
|
||||
* extract strings from it and return encoded bytes via read()
|
||||
*
|
||||
* Note: the utility supports extraction of only LATIN script and UTF8,
|
||||
* UTF16LE, UTF16BE encodings and uses a brute force encoding detection -
|
||||
* it's fast but could apply multiple encodings on the same string.
|
||||
*
|
||||
* For other script/languages support and better encoding detection use
|
||||
* AbstractFileStringIntStream streaming class, which wraps around
|
||||
* StringExtract extractor.
|
||||
*/
|
||||
private static class EnglishOnlyStream extends InputStream {
|
||||
|
||||
private static final Logger logger = Logger.getLogger(EnglishOnlyStream.class.getName());
|
||||
private static final String NLS = Character.toString((char) 10); //new line
|
||||
private static final int READ_BUF_SIZE = 256;
|
||||
private static final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
|
||||
|
||||
//args
|
||||
private final AbstractFile content;
|
||||
|
||||
//internal working data
|
||||
private long contentOffset = 0; //offset in fscontent read into curReadBuf
|
||||
private final byte[] curReadBuf = new byte[READ_BUF_SIZE];
|
||||
private int bytesInReadBuf = 0;
|
||||
private int readBufOffset = 0; //offset in read buf processed
|
||||
private StringBuilder curString = new StringBuilder();
|
||||
private int curStringLen = 0;
|
||||
private StringBuilder tempString = new StringBuilder();
|
||||
private int tempStringLen = 0;
|
||||
private boolean isEOF = false;
|
||||
private boolean stringAtTempBoundary = false; //if temp has part of string that didn't make it in previous read()
|
||||
private boolean stringAtBufBoundary = false; //if read buffer has string being processed, continue as string from prev read() in next read()
|
||||
private boolean inString = false; //if current temp has min chars required
|
||||
private final byte[] oneCharBuf = new byte[1];
|
||||
|
||||
/**
|
||||
* Construct a new string stream from FsContent. Do not attempt to fill the
* entire read buffer if that would break a string
|
||||
*
|
||||
* @param content to extract strings from
|
||||
* @param outputCharset target charset to encode into bytes and index
|
||||
* as, e.g. UTF-8
|
||||
*
|
||||
*/
|
||||
private EnglishOnlyStream(AbstractFile content) {
|
||||
this.content = content;
|
||||
}
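// Behavior note: read() below scans the file for runs of printable ASCII that are at
// least MIN_PRINTABLE_CHARS long, joins them with newline separators, and returns them
// encoded with Server.DEFAULT_INDEXED_TEXT_CHARSET; everything else is discarded.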
|
||||
|
||||
@Override
|
||||
public int read(byte[] b, int off, int len) throws IOException {
|
||||
if (b == null) {
|
||||
throw new NullPointerException();
|
||||
} else if (off < 0 || len < 0 || len > b.length - off) {
|
||||
throw new IndexOutOfBoundsException();
|
||||
} else if (len == 0) {
|
||||
return 0;
|
||||
}
|
||||
long fileSize = content.getSize();
|
||||
if (fileSize == 0) {
|
||||
return -1;
|
||||
}
|
||||
if (isEOF) {
|
||||
return -1;
|
||||
}
|
||||
if (stringAtTempBoundary) {
|
||||
//append entire temp string residual from previous read()
|
||||
//because qualified string was broken down into 2 parts
|
||||
appendResetTemp();
|
||||
stringAtTempBoundary = false;
|
||||
//there could be more to this string in fscontent/buffer
|
||||
}
|
||||
boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
|
||||
int newCurLen = curStringLen + tempStringLen;
|
||||
while (newCurLen < len) {
|
||||
//need to extract more strings
|
||||
if (readBufOffset > bytesInReadBuf - 1) {
|
||||
//no more bytes to process into strings, read them
|
||||
try {
|
||||
bytesInReadBuf = 0;
|
||||
bytesInReadBuf = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);
|
||||
} catch (TskException ex) {
|
||||
if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
|
||||
appendResetTemp();
|
||||
//have some extracted string, return that, and fail next time
|
||||
isEOF = true;
|
||||
int copied = copyToReturn(b, off, len);
|
||||
return copied;
|
||||
} else {
|
||||
return -1; //EOF
|
||||
}
|
||||
}
|
||||
if (bytesInReadBuf < 1) {
|
||||
if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
|
||||
appendResetTemp();
|
||||
//have some extracted string, return that, and fail next time
|
||||
isEOF = true;
|
||||
int copied = copyToReturn(b, off, len);
|
||||
return copied;
|
||||
} else {
|
||||
return -1; //EOF
|
||||
}
|
||||
}
|
||||
//increment content offset for next read
|
||||
contentOffset += bytesInReadBuf;
|
||||
//reset read buf position
|
||||
readBufOffset = 0;
|
||||
}
|
||||
//get char from cur read buf
|
||||
char c = (char) curReadBuf[readBufOffset++];
|
||||
if (c == 0 && singleConsecZero == false) {
|
||||
//preserve the current sequence if max consec. 1 zero char
|
||||
singleConsecZero = true;
|
||||
} else {
|
||||
singleConsecZero = false;
|
||||
}
|
||||
if (StringExtract.isPrintableAscii(c)) {
|
||||
tempString.append(c);
|
||||
++tempStringLen;
|
||||
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
|
||||
inString = true;
|
||||
}
|
||||
//boundary case when temp has still chars - handled after the loop
|
||||
} else if (!singleConsecZero) {
|
||||
//break the string, clear temp
|
||||
if (tempStringLen >= MIN_PRINTABLE_CHARS || stringAtBufBoundary) {
|
||||
//append entire temp string with new line
|
||||
tempString.append(NLS);
|
||||
++tempStringLen;
|
||||
curString.append(tempString);
|
||||
curStringLen += tempStringLen;
|
||||
stringAtBufBoundary = false;
|
||||
}
|
||||
//reset temp
|
||||
tempString = new StringBuilder();
|
||||
tempStringLen = 0;
|
||||
}
|
||||
newCurLen = curStringLen + tempStringLen;
|
||||
}
|
||||
//check if still in string state, so that next chars in read buf bypass min chars check
|
||||
//and qualify as string even if less < min chars required
|
||||
if (inString) {
|
||||
inString = false; //reset
|
||||
stringAtBufBoundary = true; //will bypass the check
|
||||
}
|
||||
//check if temp still has chars to qualify as a string
|
||||
//we might need to break up temp into 2 parts for next read() call
|
||||
//consume as many as possible to fill entire user buffer
|
||||
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
|
||||
if (newCurLen > len) {
|
||||
int appendChars = len - curStringLen;
|
||||
//save part for next user read(), need to break up temp string
|
||||
//do not append new line
|
||||
String toAppend = tempString.substring(0, appendChars);
|
||||
String newTemp = tempString.substring(appendChars);
|
||||
curString.append(toAppend);
|
||||
curStringLen += appendChars;
|
||||
tempString = new StringBuilder(newTemp);
|
||||
tempStringLen = newTemp.length();
|
||||
stringAtTempBoundary = true;
|
||||
} else {
|
||||
//append entire temp
|
||||
curString.append(tempString);
|
||||
curStringLen += tempStringLen;
|
||||
//reset temp
|
||||
tempString = new StringBuilder();
|
||||
tempStringLen = 0;
|
||||
}
|
||||
} else {
|
||||
//if temp has a few chars, not qualified as string for now,
|
||||
//will be processed during next read() call
|
||||
}
|
||||
//copy current strings to user
|
||||
final int copied = copyToReturn(b, off, len);
|
||||
//there may be still chars in read buffer or tempString, for next read()
|
||||
return copied;
|
||||
}
|
||||
|
||||
//append temp buffer to cur string buffer and reset temp, if enough chars
|
||||
//does not append new line
|
||||
private void appendResetTemp() {
|
||||
if (tempStringLen >= MIN_PRINTABLE_CHARS) {
|
||||
curString.append(tempString);
|
||||
curStringLen += tempStringLen;
|
||||
tempString = new StringBuilder();
|
||||
tempStringLen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
//copy currently extracted string to user buffer
|
||||
//and reset for next read() call
|
||||
private int copyToReturn(byte[] b, int off, long len) {
|
||||
final String curStringS = curString.toString();
|
||||
//logger.log(Level.INFO, curStringS);
|
||||
byte[] stringBytes = curStringS.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
|
||||
System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
|
||||
//logger.log(Level.INFO, curStringS);
|
||||
//copied all string, reset
|
||||
curString = new StringBuilder();
|
||||
int ret = curStringLen;
|
||||
curStringLen = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
final int read = read(oneCharBuf, 0, 1);
|
||||
if (read == 1) {
|
||||
return oneCharBuf[0];
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int available() throws IOException {
|
||||
//we don't know how many bytes in curReadBuf may end up as strings
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long skip(long n) throws IOException {
|
||||
//use default implementation that reads into skip buffer
|
||||
//but it could be more efficient
|
||||
return super.skip(n);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrapper over StringExtract to provide a streaming API. Given an AbstractFile
|
||||
* object, extract international strings from the file and read output as a
|
||||
* stream of UTF-8 strings as encoded bytes.
|
||||
*
|
||||
*/
|
||||
private static class InternationalStream extends InputStream {
|
||||
|
||||
private static final Logger logger = Logger.getLogger(InternationalStream.class.getName());
|
||||
private static final int FILE_BUF_SIZE = 1024 * 1024;
|
||||
private final AbstractFile content;
|
||||
private final byte[] oneCharBuf = new byte[1];
|
||||
private final StringExtract stringExtractor;
|
||||
/** true if there is nothing to do because neither extractUTF8 nor
|
||||
* extractUTF16 was true in constructor */
|
||||
private final boolean nothingToDo;
|
||||
private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
|
||||
private long fileReadOffset = 0L;
|
||||
private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user
|
||||
private int convertBuffOffset = 0; //offset to start returning data to user on next read()
|
||||
private int bytesInConvertBuff = 0; //amount of data currently in the buffer
|
||||
private boolean fileEOF = false; //if file has more bytes to read
|
||||
private StringExtract.StringExtractResult lastExtractResult;
|
||||
|
||||
/**
|
||||
* Constructs new stream object that does conversion from file, to
|
||||
* extracted strings, then to byte stream, for specified script,
|
||||
* auto-detected encoding (UTF8, UTF16LE, UTF16BE), and specified output
|
||||
* byte stream encoding
|
||||
*
|
||||
* @param content input content to process and turn into a stream
|
||||
* to convert into strings
|
||||
* @param scripts a list of scripts to consider
|
||||
* @param extractUTF8 whether to extract utf8 encoding
|
||||
* @param extractUTF16 whether to extract utf16 encoding
|
||||
*/
|
||||
private InternationalStream(AbstractFile content, List<SCRIPT> scripts, boolean extractUTF8, boolean extractUTF16) {
|
||||
this.content = content;
|
||||
this.stringExtractor = new StringExtract();
|
||||
this.stringExtractor.setEnabledScripts(scripts);
|
||||
this.nothingToDo = extractUTF8 == false && extractUTF16 == false;
|
||||
this.stringExtractor.setEnableUTF8(extractUTF8);
|
||||
this.stringExtractor.setEnableUTF16(extractUTF16);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
if (nothingToDo) {
|
||||
return -1;
|
||||
}
|
||||
final int read = read(oneCharBuf, 0, 1);
|
||||
if (read == 1) {
|
||||
return oneCharBuf[0];
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(byte[] b, int off, int len) throws IOException {
|
||||
if (b == null) {
|
||||
throw new NullPointerException();
|
||||
} else if (off < 0 || len < 0 || len > b.length - off) {
|
||||
throw new IndexOutOfBoundsException();
|
||||
} else if (len == 0) {
|
||||
return 0;
|
||||
}
|
||||
if (nothingToDo) {
|
||||
return -1;
|
||||
}
|
||||
long fileSize = content.getSize();
|
||||
if (fileSize == 0) {
|
||||
return -1;
|
||||
}
|
||||
//read and convert until user buffer full
|
||||
//we have data if file can be read or when byteBuff has converted strings to return
|
||||
int bytesToUser = 0; //returned to user so far
|
||||
int offsetUser = off;
|
||||
while (bytesToUser < len && offsetUser < len) {
|
||||
//check if we have enough converted strings
|
||||
int convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
|
||||
if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
|
||||
try {
|
||||
//convert more strings, store in buffer
|
||||
long toRead = 0;
|
||||
|
||||
//fill up entire fileReadBuff fresh
|
||||
toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
|
||||
//}
|
||||
int read = content.read(fileReadBuff, fileReadOffset, toRead);
|
||||
if (read == -1 || read == 0) {
|
||||
fileEOF = true;
|
||||
} else {
|
||||
fileReadOffset += read;
|
||||
if (fileReadOffset >= fileSize) {
|
||||
fileEOF = true;
|
||||
}
|
||||
//put converted string in convertBuff
|
||||
convert(read);
|
||||
convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
|
||||
}
|
||||
} catch (TskCoreException ex) {
|
||||
//Exceptions.printStackTrace(ex);
|
||||
fileEOF = true;
|
||||
}
|
||||
}
|
||||
//nothing more to read, and no more bytes in convertBuff
|
||||
if (convertBuff == null || convertBuffRemain == 0) {
|
||||
if (fileEOF) {
|
||||
return bytesToUser > 0 ? bytesToUser : -1;
|
||||
} else {
|
||||
//no strings extracted, try another read
|
||||
continue;
|
||||
}
|
||||
}
|
||||
//return part or all of convert buff to user
|
||||
final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
|
||||
System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);
|
||||
|
||||
convertBuffOffset += toCopy;
|
||||
offsetUser += toCopy;
|
||||
bytesToUser += toCopy;
|
||||
}
|
||||
//if more string data in convertBuff, will be consumed on next read()
|
||||
return bytesToUser;
|
||||
}
|
||||
|
||||
/**
|
||||
* convert bytes in file buffer to string, and encode string in
|
||||
* convertBuffer
|
||||
*
|
||||
* @param numBytes num bytes in the fileReadBuff
|
||||
*/
|
||||
private void convert(int numBytes) {
|
||||
lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
|
||||
convertBuff = lastExtractResult.getText().getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
|
||||
//reset tracking vars
|
||||
if (lastExtractResult.getNumBytes() == 0) {
|
||||
bytesInConvertBuff = 0;
|
||||
} else {
|
||||
bytesInConvertBuff = convertBuff.length;
|
||||
}
|
||||
convertBuffOffset = 0;
|
||||
}
|
||||
}
|
||||
}
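InternationalStream above exposes extracted strings as an InputStream whose read() returns -1 once no more qualifying text can be produced. A hedged usage sketch (hypothetical helper, standard java.io only; not part of this commit) of how such a stream can be drained into a single UTF-8 string:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

class StreamDrainExample {

    // Drain a text-extraction InputStream into one UTF-8 string.
    static String drainToString(InputStream in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buf = new byte[8192];
        int read;
        // read() contract used by the extraction streams above: -1 signals end of extracted text
        while ((read = in.read(buf, 0, buf.length)) != -1) {
            out.write(buf, 0, read);
        }
        return new String(out.toByteArray(), StandardCharsets.UTF_8);
    }
}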

@@ -1,7 +1,7 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012 Basis Technology Corp.
 * Copyright 2011-16 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,140 +18,76 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.datamodel.AbstractFile;
import java.io.InputStream;
import java.io.Reader;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.SleuthkitVisitableItem;

/**
 * Common methods for utilities that extract text and content and divide into
 * chunks
 * Extracts text out of a SleuthkitVisitableItem, and exposes it as a Reader.
 * This Reader is given to the Ingester to chunk and index in Solr.
 *
 * @param <TextSource> The subtype of SleuthkitVisitableItem an implementation
 * is able to process.
 */
interface TextExtractor {
abstract class TextExtractor< TextSource extends SleuthkitVisitableItem> {

static final private Logger logger = Logger.getLogger(TextExtractor.class.getName());

/**
 * Common options that can be used by some extractors
 * Is this extractor configured such that no extraction will/should be done?
 *
 * @return True if this extractor will/should not perform any extraction.
 */
enum ExtractOptions {

EXTRACT_UTF16, ///< extract UTF16 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
EXTRACT_UTF8, ///< extract UTF8 text, possible values Boolean.TRUE.toString(), Boolean.FALSE.toString()
};

//generally text extractors should ignore archives
//and let unpacking modules take care of them
static final List<String> ARCHIVE_MIME_TYPES
= Arrays.asList(
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
"application/x-7z-compressed", //NON-NLS
"application/x-ace-compressed", //NON-NLS
"application/x-alz-compressed", //NON-NLS
"application/x-arj", //NON-NLS
"application/vnd.ms-cab-compressed", //NON-NLS
"application/x-cfs-compressed", //NON-NLS
"application/x-dgc-compressed", //NON-NLS
"application/x-apple-diskimage", //NON-NLS
"application/x-gca-compressed", //NON-NLS
"application/x-dar", //NON-NLS
"application/x-lzx", //NON-NLS
"application/x-lzh", //NON-NLS
"application/x-rar-compressed", //NON-NLS
"application/x-stuffit", //NON-NLS
"application/x-stuffitx", //NON-NLS
"application/x-gtar", //NON-NLS
"application/x-archive", //NON-NLS
"application/x-executable", //NON-NLS
"application/x-gzip", //NON-NLS
"application/zip", //NON-NLS
"application/x-zoo", //NON-NLS
"application/x-cpio", //NON-NLS
"application/x-shar", //NON-NLS
"application/x-tar", //NON-NLS
"application/x-bzip", //NON-NLS
"application/x-bzip2", //NON-NLS
"application/x-lzip", //NON-NLS
"application/x-lzma", //NON-NLS
"application/x-lzop", //NON-NLS
"application/x-z", //NON-NLS
"application/x-compress"); //NON-NLS
abstract boolean isDisabled();

/**
 * Get number of chunks resulted from extracting this AbstractFile
 * Log the given message and exception as a warning.
 *
 * @return the number of chunks produced
 * @param msg
 * @param ex
 */
int getNumChunks();
void logWarning(String msg, Exception ex) {
logger.log(Level.WARNING, msg, ex); //NON-NLS
}

/**
 * Get the source file associated with this extraction
 * Get an input stream over the content of the given source.
 *
 * @return the source AbstractFile
 */
AbstractFile getSourceFile();

/**
 * Index the Abstract File
 *
 * @param sourceFile file to index
 *
 * @return true if indexed successfully, false otherwise
 *
 * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
 */
boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException;

/**
 * Sets the scripts to use for the extraction
 *
 * @param extractScripts scripts to use
 *
 * @return true if extractor supports script - specific extraction, false
 * otherwise
 */
boolean setScripts(List<SCRIPT> extractScript);

/**
 * Get the currently used scripts for extraction
 *
 * @return scripts currently used or null if not supported
 */
List<SCRIPT> getScripts();

/**
 * Get current options
 *
 * @return currently used, extractor specific options, or null of not
 * supported
 */
Map<String, String> getOptions();

/**
 * Set extractor specific options
 *
 * @param options options to use
 */
void setOptions(Map<String, String> options);

/**
 * Determines if the extractor works only for specified types is
 * supportedTypes() or whether is a generic content extractor (such as
 * string extractor)
 * @param source
 *
 * @return
 */
boolean isContentTypeSpecific();
abstract InputStream getInputStream(TextSource source);

/**
 * Determines if the file content is supported by the extractor if
 * isContentTypeSpecific() returns true.
 * Get a reader over the text extracted from the given source.
 *
 * @param file to test if its content should be supported
 * @param detectedFormat mime-type with detected format (such as text/plain)
 * or null if not detected
 * @param stream
 * @param source
 *
 * @return true if the file content is supported, false otherwise
 * @return
 *
 * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
 */
boolean isSupported(AbstractFile file, String detectedFormat);
abstract Reader getReader(InputStream stream, TextSource source) throws Ingester.IngesterException;

/**
 * Get the 'object' id of the given source.
 *
 * @param source
 *
 * @return
 */
abstract long getID(TextSource source);

/**
 * Get a human readable name for the given source.
 *
 * @param source
 *
 * @return
 */
abstract String getName(TextSource source);
}
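The refactored TextExtractor turns a source into an InputStream and then into a Reader that the Ingester can chunk and index. The following self-contained sketch (simplified, hypothetical types, not the actual Autopsy classes) shows a minimal concrete implementation of that source-to-stream-to-reader pattern:

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

// Simplified analog of the abstract extractor: source -> InputStream -> Reader.
abstract class SimpleExtractor<S> {
    abstract boolean isDisabled();
    abstract InputStream getInputStream(S source);
    abstract Reader getReader(InputStream stream, S source);
}

// Trivial implementation whose "source" is just a String.
final class StringSourceExtractor extends SimpleExtractor<String> {

    @Override
    boolean isDisabled() {
        return false;
    }

    @Override
    InputStream getInputStream(String source) {
        return new ByteArrayInputStream(source.getBytes(StandardCharsets.UTF_8));
    }

    @Override
    Reader getReader(InputStream stream, String source) {
        return new InputStreamReader(stream, StandardCharsets.UTF_8);
    }
}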

@@ -1,7 +1,7 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012-2013 Basis Technology Corp.
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,244 +18,85 @@
 */
package org.sleuthkit.autopsy.keywordsearch;

import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.MissingResourceException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.sleuthkit.autopsy.coreutils.TextUtil;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;

/**
 * Extractor of text from TIKA supported AbstractFile content. Extracted text is
 * divided into chunks and indexed with Solr. Protects against Tika parser hangs
 * (for unexpected/corrupt content) using a timeout mechanism. If Tika
 * extraction succeeds, chunks are indexed with Solr.
 *
 * This Tika extraction/chunking utility is useful for large files of Tika
 * parsers-supported content type.
 *
 * Extracts text from Tika supported AbstractFile content. Protects against Tika
 * parser hangs (for unexpected/corrupt content) using a timeout mechanism.
 */
class TikaTextExtractor implements TextExtractor {
class TikaTextExtractor extends FileTextExtractor {

private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
private static Ingester ingester;
private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
private static final int MAX_EXTR_TEXT_CHARS = 16 * 1024;
private static final int SINGLE_READ_CHARS = 1024;
private static final int EXTRA_CHARS = 128; //for whitespace
private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
private AbstractFile sourceFile; //currently processed file
private int numChunks = 0;
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();

TikaTextExtractor() {
ingester = Ingester.getDefault();
private static final List<String> TIKA_SUPPORTED_TYPES
= new Tika().getParser().getSupportedTypes(new ParseContext())
.parallelStream()
.map(mt -> mt.getType() + "/" + mt.getSubtype())
.collect(Collectors.toList());

Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
for (MediaType mt : mediaTypes) {
TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype());
}
//logger.log(Level.INFO, "Tika supported media types: {0}", TIKA_SUPPORTED_TYPES); //NON-NLS
@Override
void logWarning(final String msg, Exception ex) {
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
super.logWarning(msg, ex);
}

@Override
public boolean setScripts(List<StringExtract.StringExtractUnicodeTable.SCRIPT> extractScripts) {
return false;
}

@Override
public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
return null;
}

@Override
public Map<String, String> getOptions() {
return null;
}

@Override
public void setOptions(Map<String, String> options) {
}

@Override
public int getNumChunks() {
return numChunks;
}

@Override
public AbstractFile getSourceFile() {
return sourceFile;
}

@Override
public boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
this.sourceFile = sourceFile;
numChunks = 0; //unknown until indexing is done

boolean success = false;
Reader reader = null;
final InputStream stream = new ReadContentInputStream(sourceFile);
Reader getReader(final InputStream stream, AbstractFile sourceFile) throws IngesterException, MissingResourceException {
Metadata metadata = new Metadata();
//Parse the file in a task, a convenient way to have a timeout...
final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
try {
Metadata meta = new Metadata();

//Parse the file in a task
Tika tika = new Tika(); //new tika instance for every file, to workaround tika memory issues
ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
final Future<?> future = tikaParseExecutor.submit(parseTask);
try {
future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
} catch (TimeoutException te) {
final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
sourceFile.getId(), sourceFile.getName());
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
logger.log(Level.WARNING, msg);
throw new IngesterException(msg);
} catch (Exception ex) {
final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
sourceFile.getId(), sourceFile.getName());
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
logger.log(Level.WARNING, msg);
throw new IngesterException(msg);
}

// get the reader with the results
reader = parseTask.getReader();
if (reader == null) {
//likely due to exception in parse()
logger.log(Level.WARNING, "No reader available from Tika parse"); //NON-NLS
return false;
}

// break the results into chunks and index
success = true;
long readSize;
long totalRead = 0;
boolean eof = false;
//we read max 1024 chars at time, this seems to max what this Reader would return
while (!eof) {
if (context.fileIngestIsCancelled()) {
ingester.ingest(this);
return true;
}
readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
if (readSize == -1) {
eof = true;
} else {
totalRead += readSize;
}
//consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
&& (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
totalRead += readSize;
}
if (readSize == -1) {
//this is the last chunk
eof = true;
} else {
//try to read char-by-char until whitespace to not break words
while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
&& !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
&& (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
totalRead += readSize;
}
if (readSize == -1) {
//this is the last chunk
eof = true;
}
}

// Sanitize by replacing non-UTF-8 characters with caret '^'
for (int i = 0; i < totalRead; ++i) {
if (!TextUtil.isValidSolrUTF8(textChunkBuf[i])) {
textChunkBuf[i] = '^';
}
}

StringBuilder sb = new StringBuilder((int) totalRead + 1000);
sb.append(textChunkBuf, 0, (int) totalRead);

//reset for next chunk
totalRead = 0;

//append meta data if last chunk
if (eof) {
//sort meta data keys
List<String> sortedKeyList = Arrays.asList(meta.names());
Collections.sort(sortedKeyList);
sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
for (String key : sortedKeyList) {
String value = meta.get(key);
sb.append(key).append(": ").append(value).append("\n");
}
}

// Encode from UTF-8 charset to bytes
byte[] encodedBytes = sb.toString().getBytes(OUTPUT_CHARSET);
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
try {
chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
++this.numChunks;
} catch (Ingester.IngesterException ingEx) {
success = false;
logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" //NON-NLS
+ sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
throw ingEx; //need to rethrow/return to signal error and move on
}
}
} catch (IOException ex) {
final String msg = "Exception: Unable to read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
logger.log(Level.WARNING, msg);
success = false;
final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
CharSource metaDataCharSource = getMetaDataCharSource(metadata);
//concatenate parsed content and meta data into a single reader.
return CharSource.concat(new ReaderCharSource(tikaReader), metaDataCharSource).openStream();
} catch (TimeoutException te) {
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
logWarning(msg, te);
throw new IngesterException(msg);
} catch (Exception ex) {
final String msg = "Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
logger.log(Level.WARNING, msg);
success = false;
} finally {
try {
stream.close();
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to close Tika content stream from " + sourceFile.getId(), ex); //NON-NLS
}
try {
if (reader != null) {
reader.close();
}
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
}
KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex.getCause()); //NON-NLS
final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName());
logWarning(msg, ex);
throw new IngesterException(msg, ex);
}
}
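Both the old index() path and the new getReader() above bound the Tika parse with a single-thread executor and Future.get(timeout) so a hung parser cannot stall ingest indefinitely. A self-contained sketch of that hang-protection pattern (hypothetical helper, not part of this commit):

import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

class TimeoutParseExample {

    private static final ExecutorService executor = Executors.newSingleThreadExecutor();

    // Run potentially slow work on a worker thread and bound it with a timeout.
    static <T> T runWithTimeout(Callable<T> work, long seconds) throws Exception {
        Future<T> future = executor.submit(work);
        try {
            return future.get(seconds, TimeUnit.SECONDS);
        } catch (TimeoutException te) {
            future.cancel(true); // interrupt the stuck task before giving up
            throw te;
        }
    }
}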

//after all chunks, ingest the parent file without content itself, and store numChunks
ingester.ingest(this);

return success;
/**
 * Gets a CharSource that wraps a formatted representation of the given
 * Metadata.
 *
 * @param metadata The Metadata to wrap as a CharSource
 *
 * @return A CharSource for the given MetaData
 */
static private CharSource getMetaDataCharSource(Metadata metadata) {
return CharSource.wrap(
new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
.append(Stream.of(metadata.names()).sorted()
.map(key -> key + ": " + metadata.get(key))
.collect(Collectors.joining("\n"))
));
}
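getMetaDataCharSource() lets the metadata block be appended lazily after the parsed content through Guava's CharSource.concat, as the getReader() code above does. A hedged sketch of that composition (hypothetical helper names, standalone):

import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.Reader;

class CharSourceConcatExample {

    // Produce a Reader that yields the parsed content followed by a metadata block.
    static Reader contentThenMetadata(Reader parsedContent, String metadataBlock) throws IOException {
        CharSource metadata = CharSource.wrap(metadataBlock);
        // Readers are single-use, so wrap the existing reader in a CharSource
        // that simply hands it back from openStream().
        CharSource content = new CharSource() {
            @Override
            public Reader openStream() throws IOException {
                return parsedContent;
            }
        };
        return CharSource.concat(content, metadata).openStream();
    }
}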

@Override
@@ -265,67 +106,64 @@ class TikaTextExtractor implements TextExtractor {

@Override
public boolean isSupported(AbstractFile file, String detectedFormat) {
if (detectedFormat == null) {
return false;
} else if (detectedFormat.equals("application/octet-stream") //NON-NLS
|| detectedFormat.equals("application/x-msdownload")) { //NON-NLS
//any binary unstructured blobs (string extraction will be used)
return false;
} else if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
return false;
} //skip video other than flv (tika supports flv only)
else if (detectedFormat.contains("video/") //NON-NLS
&& !detectedFormat.equals("video/x-flv")) { //NON-NLS
return false;
} else if (detectedFormat.contains("application/x-font-ttf")) { //NON-NLS
// Tika currently has a bug in the ttf parser in fontbox.
// It will throw an out of memory exception
if (detectedFormat == null
|| FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
|| FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
|| (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
|| detectedFormat.equals("application/x-font-ttf")) { // Tika currently has a bug in the ttf parser in fontbox; It will throw an out of memory exception//NON-NLS

return false;
}

//TODO might need to add more mime-types to ignore
//then accept all formats supported by Tika
return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
}

@Override
InputStream getInputStream(AbstractFile sourceFile1) {
return new ReadContentInputStream(sourceFile1);
}

@Override
boolean isDisabled() {
return false;
}

/**
 * Return timeout that should be used to index the content.
 *
 * @param size size of the content
 *
 * @return time in seconds to use a timeout
 */
private static int getTimeout(long size) {
if (size < 1024 * 1024L) //1MB
{
return 60;
} else if (size < 10 * 1024 * 1024L) //10MB
{
return 1200;
} else if (size < 100 * 1024 * 1024L) //100MB
{
return 3600;
} else {
return 3 * 3600;
}

}

/**
 * Runnable task that calls tika to parse the content using the input
 * stream. Provides reader for results.
 * An implementation of CharSource that just wraps an existing reader and
 * returns it in openStream().
 */
private static class ParseRequestTask implements Runnable {
private static class ReaderCharSource extends CharSource {

//in
private Tika tika;
private InputStream stream;
private Metadata meta;
private AbstractFile sourceFile;
//out
private Reader reader;
private final Reader reader;

ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
this.tika = tika;
this.stream = stream;
this.meta = meta;
this.sourceFile = sourceFile;
public ReaderCharSource(Reader reader) {
this.reader = reader;
}

@Override
public void run() {
try {
reader = tika.parse(stream, meta);
} catch (IOException ex) {
KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
tika = null;
reader = null;
} catch (Exception ex) {
KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
tika = null;
reader = null;
}
}

public Reader getReader() {
public Reader openStream() throws IOException {
return reader;
}
}

@@ -35,7 +35,6 @@ import java.util.logging.Level;
import java.util.logging.Logger;
import javax.imageio.ImageIO;
import javax.swing.JDialog;
import javax.swing.JLabel;
import javax.swing.JTextField;
import junit.framework.Test;
import junit.framework.TestCase;
@@ -50,10 +49,10 @@ import org.netbeans.jemmy.operators.JComboBoxOperator;
import org.netbeans.jemmy.operators.JDialogOperator;
import org.netbeans.jemmy.operators.JFileChooserOperator;
import org.netbeans.jemmy.operators.JLabelOperator;
import org.netbeans.jemmy.operators.JListOperator;
import org.netbeans.jemmy.operators.JTabbedPaneOperator;
import org.netbeans.jemmy.operators.JTableOperator;
import org.netbeans.jemmy.operators.JTextFieldOperator;
import org.netbeans.jemmy.operators.JListOperator;
import org.netbeans.junit.NbModuleSuite;
import org.sleuthkit.autopsy.ingest.IngestManager;

@@ -186,6 +185,8 @@ public class RegressionTest extends TestCase {
String img_path = getEscapedPath(System.getProperty("img_path"));
String imageDir = img_path;
((JTextField) jtfo0.getSource()).setText(imageDir);
JComboBoxOperator comboBoxOperator = new JComboBoxOperator(wo, 1);
comboBoxOperator.setSelectedItem("(GMT-5:00) America/New_York");
wo.btNext().clickMouse();
}