TSK-267 - Extract English strings from smallish unknown files

- first iteration, for unknown content < 10MB
adam-m 2011-12-15 13:23:05 -05:00
parent 87cc9d5476
commit 5285f1c6c8
3 changed files with 185 additions and 15 deletions

org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java

@@ -0,0 +1,110 @@
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.autopsy.datamodel.DataConversion;
import org.sleuthkit.datamodel.FsContent;
import org.sleuthkit.datamodel.TskException;

/**
 * Converts an FsContent object into a string with a specific encoding,
 * then adapts it back to a Solr ContentStream (a specialized InputStream)
 * using the same encoding.
 */
public class FsContentStringStream implements ContentStream {

    //supported encodings; enum names match Java charset names
    public static enum Encoding { ASCII };

    private static final int MIN_ASCII_CHARS = 4; //minimum run of consecutive ASCII chars to qualify as a string

    //input
    private FsContent content;
    private Encoding encoding;

    //converted
    private String convertedString;
    private InputStream convertedStream;
    private long convertedLength;

    private static Logger logger = Logger.getLogger(FsContentStringStream.class.getName());

    public FsContentStringStream(FsContent content, Encoding encoding) {
        this.content = content;
        this.encoding = encoding;
        convertedLength = 0;
    }

    public FsContent getFsContent() {
        return content;
    }

    /**
     * Does all the work of extracting strings and converting them
     * to a stream with the right encoding.
     * @throws TskException if the conversion failed for any reason
     */
    public void convert() throws TskException {
        //read the entire content and extract strings
        long contentLen = content.getSize();
        byte[] data = content.read(0, contentLen);
        convertedString = DataConversion.getString(data, MIN_ASCII_CHARS);

        //convert the extracted string back to a byte stream with the same encoding
        try {
            byte[] bytes = convertedString.getBytes(encoding.toString());
            convertedLength = bytes.length;
            convertedStream = new ByteArrayInputStream(bytes);
        } catch (UnsupportedEncodingException e) {
            logger.log(Level.SEVERE, "Unsupported encoding " + encoding, e);
            throw new TskException("Unsupported encoding " + encoding);
        }
    }

    @Override
    public String getContentType() {
        return encoding.toString();
    }

    @Override
    public String getName() {
        return content.getName();
    }

    @Override
    public Reader getReader() throws IOException {
        if (convertedStream == null) {
            throw new UnsupportedOperationException("Content has not been converted; call convert() first.");
        }
        return new InputStreamReader(convertedStream);
    }

    @Override
    public Long getSize() {
        return convertedLength;
    }

    @Override
    public String getSourceInfo() {
        return "File:" + content.getId();
    }

    @Override
    public InputStream getStream() throws IOException {
        return convertedStream;
    }
}
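
DataConversion.getString() itself is not part of this diff. As a rough standalone sketch of the kind of extraction it is assumed to perform here, collecting runs of at least MIN_ASCII_CHARS consecutive printable ASCII bytes (class and method names below are illustrative, not the actual DataConversion code):

public class AsciiRunExtractor {

    /**
     * Collects runs of at least minChars consecutive printable ASCII
     * characters, one run per line, an approximation of what
     * DataConversion.getString(data, MIN_ASCII_CHARS) is assumed to do.
     */
    public static String extract(byte[] data, int minChars) {
        StringBuilder out = new StringBuilder();
        StringBuilder run = new StringBuilder();
        for (byte b : data) {
            char c = (char) (b & 0xFF);
            if (c >= 0x20 && c <= 0x7E) { //printable ASCII range
                run.append(c);
            } else {
                if (run.length() >= minChars) {
                    out.append(run).append('\n');
                }
                run.setLength(0); //reset the current run
            }
        }
        if (run.length() >= minChars) {
            out.append(run).append('\n');
        }
        return out.toString();
    }

    public static void main(String[] args) {
        byte[] sample = {'H', 'i', 0, 1, 'h', 'e', 'l', 'l', 'o', 2};
        System.out.print(extract(sample, 4)); //prints just "hello"
    }
}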

org/sleuthkit/autopsy/keywordsearch/IndexContentFilesAction.java

@@ -42,6 +42,7 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.FsContent;
 import org.sleuthkit.datamodel.Image;
+import org.sleuthkit.datamodel.TskException;
 
 /**
  * Action adds all supported files from the given Content object and its
@@ -50,6 +51,8 @@ import org.sleuthkit.datamodel.Image;
 public class IndexContentFilesAction extends AbstractAction {
 
     private static final Logger logger = Logger.getLogger(IndexContentFilesAction.class.getName());
+    private static final int MAX_STRING_EXTRACT_SIZE = 10 * (1 << 10) * (1 << 10);
+
     private Content c;
     private String name;
     private Server.Core solrCore;
@@ -95,11 +98,11 @@ public class IndexContentFilesAction extends AbstractAction {
         // track number complete or with errors
         int fileCount = files.size();
         int finishedFiles = 0;
-        int problemFiles = 0;
+        int problemFilesCount = 0;
 
         for (FsContent f : files) {
             if (isCancelled()) {
-                return problemFiles;
+                return problemFilesCount;
             }
 
             this.publish("Indexing " + (finishedFiles + 1) + "/" + fileCount + ": " + f.getName());
@@ -108,22 +111,30 @@ public class IndexContentFilesAction extends AbstractAction {
                 ingester.ingest(f);
             } catch (IngesterException ex) {
                 logger.log(Level.INFO, "Ingester had a problem with file '" + f.getName() + "' (id: " + f.getId() + ").", ex);
-                problemFiles++;
-            }
+
+                if (f.getSize() < MAX_STRING_EXTRACT_SIZE) {
+                    logger.log(Level.INFO, "Will extract strings and re-ingest from file '" + f.getName() + "' (id: " + f.getId() + ").");
+                    if (!extractAndReingest(ingester, f)) {
+                        problemFilesCount++;
+                    }
+                } else {
+                    problemFilesCount++;
+                }
+            }
 
             setProgress(++finishedFiles * 100 / fileCount);
         }
 
         ingester.commit();
 
+        //signal a potential change in number of indexed files
         try {
             final int numIndexedFiles = KeywordSearch.getServer().getCore().queryNumIndexedFiles();
             KeywordSearch.changeSupport.firePropertyChange(KeywordSearch.NUM_FILES_CHANGE_EVT, null, new Integer(numIndexedFiles));
         } catch (SolrServerException se) {
-            logger.log(Level.SEVERE, "Error executing Solr query, " + se.getMessage());
+            logger.log(Level.SEVERE, "Error executing Solr query to check the number of indexed files", se);
         }
 
-        return problemFiles;
+        return problemFilesCount;
     }
 
     @Override
@@ -198,6 +209,21 @@ public class IndexContentFilesAction extends AbstractAction {
         popUpWindow.setVisible(true);
     }
 
+    private boolean extractAndReingest(Ingester ingester, FsContent f) {
+        boolean success = false;
+        FsContentStringStream fscs = new FsContentStringStream(f, FsContentStringStream.Encoding.ASCII);
+        try {
+            fscs.convert();
+            ingester.ingest(fscs);
+            success = true;
+        } catch (TskException tskEx) {
+            logger.log(Level.INFO, "Problem extracting strings from file '" + f.getName() + "' (id: " + f.getId() + ").", tskEx);
+        } catch (IngesterException ingEx) {
+            logger.log(Level.INFO, "Ingester had a problem with extracted strings from file '" + f.getName() + "' (id: " + f.getId() + ").", ingEx);
+        }
+        return success;
+    }
+
     private void displayProblemFilesDialog(int problemFiles) {
         final Component parentComponent = null; // Use default window frame.
         final String message = "Had trouble indexing " + problemFiles + " of the files. See the log for details.";
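
For reference, MAX_STRING_EXTRACT_SIZE above evaluates to 10 * 2^10 * 2^10 = 10,485,760 bytes (10 MiB), matching the "< 10MB" note in the commit message. A quick standalone check of the size gate (class name hypothetical, not part of the commit):

public class SizeGateCheck {

    private static final int MAX_STRING_EXTRACT_SIZE = 10 * (1 << 10) * (1 << 10);

    public static void main(String[] args) {
        System.out.println(MAX_STRING_EXTRACT_SIZE);             //10485760
        System.out.println(9000000L < MAX_STRING_EXTRACT_SIZE);  //true: eligible for string extraction
        System.out.println(20000000L < MAX_STRING_EXTRACT_SIZE); //false: counted as a problem file
    }
}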

View File

@@ -57,6 +57,19 @@ class Ingester {
         }
     }
 
+    /**
+     * Sends a file to Solr to have its content extracted and added to the
+     * index. commit() should be called once you're done ingesting files.
+     *
+     * @param fcs FsContentStringStream to ingest
+     * @throws IngesterException if there was an error processing a specific
+     * file, but the Solr server is probably fine.
+     */
+    public void ingest(FsContentStringStream fcs) throws IngesterException {
+        ingest(fcs, getFsContentFields(fcs.getFsContent()));
+    }
+
     /**
      * Sends a file to Solr to have its content extracted and added to the
      * index. commit() should be called once you're done ingesting files.
@@ -65,17 +78,38 @@ class Ingester {
      * @throws IngesterException if there was an error processing a specific
      * file, but the Solr server is probably fine.
      */
-    void ingest(FsContent f) throws IngesterException {
-        Map<String, String> fields = new HashMap<String, String>();
-        fields.put("id", Long.toString(f.getId()));
-        fields.put("file_name", f.getName());
-        fields.put("ctime", f.getCtimeAsDate());
-        fields.put("atime", f.getAtimeAsDate());
-        fields.put("mtime", f.getMtimeAsDate());
-        fields.put("crtime", f.getMtimeAsDate());
+    public void ingest(FsContent f) throws IngesterException {
+        ingest(new FscContentStream(f), getFsContentFields(f));
+    }
+
+    /**
+     * Creates a field map from FsContent that is later sent to Solr
+     * @param fsc FsContent to get fields from
+     * @return the map
+     */
+    private Map<String, String> getFsContentFields(FsContent fsc) {
+        Map<String, String> fields = new HashMap<String, String>();
+        fields.put("id", Long.toString(fsc.getId()));
+        fields.put("file_name", fsc.getName());
+        fields.put("ctime", fsc.getCtimeAsDate());
+        fields.put("atime", fsc.getAtimeAsDate());
+        fields.put("mtime", fsc.getMtimeAsDate());
+        fields.put("crtime", fsc.getCrtimeAsDate());
+        return fields;
+    }
+
+    /**
+     * Common delegate method that does the actual ingest work for objects
+     * implementing ContentStream
+     *
+     * @param cs ContentStream to ingest
+     * @param fields content-specific fields
+     * @throws IngesterException if there was an error processing a specific
+     * content, but the Solr server is probably fine.
+     */
+    private void ingest(ContentStream cs, Map<String, String> fields) throws IngesterException {
         ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
-        up.addContentStream(new FscContentStream(f));
+        up.addContentStream(cs);
         setFields(up, fields);
         up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);
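
Taken together, the intended call sequence for the new string-extraction path looks roughly like the sketch below. It mirrors extractAndReingest() above rather than adding anything new; the class name is hypothetical, it is assumed to live in the same keywordsearch package (Ingester is package-private), and the Ingester and FsContent instances are assumed to be supplied by the existing indexing code:

package org.sleuthkit.autopsy.keywordsearch;

import java.util.logging.Level;
import java.util.logging.Logger;
import org.sleuthkit.datamodel.FsContent;
import org.sleuthkit.datamodel.TskException;

class StringReingestSketch {

    private static final Logger logger = Logger.getLogger(StringReingestSketch.class.getName());

    static boolean reingestAsStrings(Ingester ingester, FsContent file) {
        FsContentStringStream fscs =
                new FsContentStringStream(file, FsContentStringStream.Encoding.ASCII);
        try {
            fscs.convert();        //read the file and extract ASCII strings
            ingester.ingest(fscs); //send the extracted text to Solr
            ingester.commit();     //flush once done ingesting
            return true;
        } catch (TskException ex) {
            logger.log(Level.INFO, "String extraction failed for " + file.getName(), ex);
        } catch (Ingester.IngesterException ex) {
            logger.log(Level.INFO, "Ingest of extracted strings failed for " + file.getName(), ex);
        }
        return false;
    }
}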