From 5285f1c6c882585fb051777536b6aaccf393c8ac Mon Sep 17 00:00:00 2001
From: adam-m
Date: Thu, 15 Dec 2011 13:23:05 -0500
Subject: [PATCH] TSK-267 - Extract English strings from smallish unknown files
 - first iteration, for unknown content < 10MB

---
 .../keywordsearch/FsContentStringStream.java  | 110 ++++++++++++++++++
 .../IndexContentFilesAction.java              |  38 +++++++-
 .../autopsy/keywordsearch/Ingester.java       |  52 +++++++--
 3 files changed, 185 insertions(+), 15 deletions(-)
 create mode 100644 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java
new file mode 100644
index 0000000000..67ae6e6b59
--- /dev/null
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java
@@ -0,0 +1,110 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.apache.solr.common.util.ContentStream;
+import org.sleuthkit.autopsy.datamodel.DataConversion;
+import org.sleuthkit.datamodel.FsContent;
+import org.sleuthkit.datamodel.TskException;
+
+/**
+ * Converter from FsContent into String with specific encoding
+ * Then, an adapter back to Solr's ContentStream (which is a specific InputStream),
+ * using the same encoding
+ */
+public class FsContentStringStream implements ContentStream {
+    //supported encoding, encoding string names match java canonical names
+    public static enum Encoding {ASCII,};
+
+    private static final int MIN_ASCII_CHARS = 4; //minimum consecutive number of ASCII chars to qualify as string
+
+    //input
+    private FsContent content;
+    private Encoding encoding;
+
+    //converted
+    private String convertedString;
+    private InputStream convertedStream;
+    private long convertedLength;
+
+    private static Logger logger = Logger.getLogger(FsContentStringStream.class.getName());
+
+    public FsContentStringStream(FsContent content, Encoding encoding) {
+        this.content = content;
+        this.encoding = encoding;
+        convertedLength = 0;
+    }
+
+    public FsContent getFsContent() {
+        return content;
+    }
+
+
+    /**
+     * Does all the work and delegation of extracting string and converting
+     * to appropriate stream with the right encoding
+     * @throws TskException if conversion failed for any reason
+     */
+    public void convert() throws TskException {
+        //read entire content and extract strings
+        long contentLen = content.getSize();
+        byte [] data = content.read(0, contentLen);
+        convertedString = DataConversion.getString(data, MIN_ASCII_CHARS);
+
+        //convert the extracted string back to byte stream with the same encoding
+        try {
+            byte [] bytes = convertedString.getBytes(encoding.toString());
+            convertedLength = bytes.length;
+            convertedStream = new ByteArrayInputStream(bytes);
+        }
+        catch (UnsupportedEncodingException e) {
+            logger.log(Level.SEVERE, "Unsupported encoding " + encoding);
+            throw new TskException("Unsupported encoding " + encoding);
+        }
+
+    }
+
+    @Override
+    public String getContentType() {
+        return encoding.toString();
+    }
+
+    @Override
+    public String getName() {
+        return content.getName();
+    }
+
+    @Override
+    public Reader getReader() throws IOException {
+        if (convertedStream == null)
+            throw new UnsupportedOperationException("Not supported yet.");
+        return new InputStreamReader(convertedStream);
+
+    }
+
+    @Override
+    public Long getSize() {
+        return convertedLength;
+    }
+
+    @Override
+    public String getSourceInfo() {
+        return "File:" + content.getId();
+    }
+
+    @Override
+    public InputStream getStream() throws IOException {
+        return convertedStream;
+    }
+
+}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexContentFilesAction.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexContentFilesAction.java
index 1a99204f6d..24d22dc805 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexContentFilesAction.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexContentFilesAction.java
@@ -42,6 +42,7 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.FsContent;
 import org.sleuthkit.datamodel.Image;
+import org.sleuthkit.datamodel.TskException;
 
 /**
  * Action adds all supported files from the given Content object and its
@@ -50,6 +51,8 @@ import org.sleuthkit.datamodel.Image;
 public class IndexContentFilesAction extends AbstractAction {
 
     private static final Logger logger = Logger.getLogger(IndexContentFilesAction.class.getName());
+    private static final int MAX_STRING_EXTRACT_SIZE = 10 * (1 << 10) * (1 << 10);
+
     private Content c;
     private String name;
     private Server.Core solrCore;
@@ -95,11 +98,11 @@ public class IndexContentFilesAction extends AbstractAction {
                 // track number complete or with errors
                 int fileCount = files.size();
                 int finishedFiles = 0;
-                int problemFiles = 0;
+                int problemFilesCount = 0;
 
                 for (FsContent f : files) {
                     if (isCancelled()) {
-                        return problemFiles;
+                        return problemFilesCount;
                     }
                     this.publish("Indexing " + (finishedFiles + 1) + "/" + fileCount + ": " + f.getName());
@@ -108,22 +111,30 @@ public class IndexContentFilesAction extends AbstractAction {
                     try {
                         ingester.ingest(f);
                     } catch (IngesterException ex) {
                         logger.log(Level.INFO, "Ingester had a problem with file '" + f.getName() + "' (id: " + f.getId() + ").", ex);
-                        problemFiles++;
-                    }
+                        if (f.getSize() < MAX_STRING_EXTRACT_SIZE) {
+                            logger.log(Level.INFO, "Will extract strings and re-ingest, from file '" + f.getName() + "' (id: " + f.getId() + ").");
+                            if (!extractAndReingest(ingester, f)) {
+                                problemFilesCount++;
+                            }
+                        } else {
+                            problemFilesCount++;
+                        }
+                    }
                     setProgress(++finishedFiles * 100 / fileCount);
                 }
 
                 ingester.commit();
 
+                //signal a potential change in number of indexed files
                 try {
                     final int numIndexedFiles = KeywordSearch.getServer().getCore().queryNumIndexedFiles();
                     KeywordSearch.changeSupport.firePropertyChange(KeywordSearch.NUM_FILES_CHANGE_EVT, null, new Integer(numIndexedFiles));
                 } catch (SolrServerException se) {
-                    logger.log(Level.SEVERE, "Error executing Solr query, " + se.getMessage());
+                    logger.log(Level.SEVERE, "Error executing Solr query to check number of indexed files: ", se);
                 }
 
-                return problemFiles;
+                return problemFilesCount;
             }
 
             @Override
@@ -198,6 +209,21 @@ public class IndexContentFilesAction extends AbstractAction {
         popUpWindow.setVisible(true);
     }
 
+    private boolean extractAndReingest(Ingester ingester, FsContent f) {
+        boolean success = false;
+        FsContentStringStream fscs = new FsContentStringStream(f, FsContentStringStream.Encoding.ASCII);
+        try {
+            fscs.convert();
+            ingester.ingest(fscs);
+            success = true;
+        } catch (TskException tskEx) {
+            logger.log(Level.INFO, "Problem extracting string from file: '" + f.getName() + "' (id: " + f.getId() + ").", tskEx);
+        } catch (IngesterException ingEx) {
+            logger.log(Level.INFO, "Ingester had a problem with extracted strings from file '" + f.getName() + "' (id: " + f.getId() + ").", ingEx);
+        }
+        return success;
+    }
+
     private void displayProblemFilesDialog(int problemFiles) {
         final Component parentComponent = null; // Use default window frame.
         final String message = "Had trouble indexing " + problemFiles + " of the files. See the log for details.";
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index 0d64d078f5..53d25016cb 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -57,6 +57,19 @@ class Ingester {
         }
     }
 
+
+    /**
+     * Sends a file to Solr to have its content extracted and added to the
+     * index. commit() should be called once you're done ingesting files.
+     *
+     * @param fcs FsContentStringStream to ingest
+     * @throws IngesterException if there was an error processing a specific
+     * file, but the Solr server is probably fine.
+     */
+    public void ingest(FsContentStringStream fcs) throws IngesterException {
+        ingest(fcs, getFsContentFields(fcs.getFsContent()));
+    }
+
     /**
      * Sends a file to Solr to have its content extracted and added to the
      * index. commit() should be called once you're done ingesting files.
@@ -65,17 +78,38 @@ class Ingester {
      * @throws IngesterException if there was an error processing a specific
      * file, but the Solr server is probably fine.
      */
-    void ingest(FsContent f) throws IngesterException {
+    public void ingest(FsContent f) throws IngesterException {
+        ingest(new FscContentStream(f), getFsContentFields(f));
+    }
+
+    /**
+     * Creates a field map from FsContent, that is later sent to Solr
+     * @param fsc FsContent to get fields from
+     * @return the map
+     */
+    private Map getFsContentFields(FsContent fsc) {
         Map fields = new HashMap();
-        fields.put("id", Long.toString(f.getId()));
-        fields.put("file_name", f.getName());
-        fields.put("ctime", f.getCtimeAsDate());
-        fields.put("atime", f.getAtimeAsDate());
-        fields.put("mtime", f.getMtimeAsDate());
-        fields.put("crtime", f.getMtimeAsDate());
-
+        fields.put("id", Long.toString(fsc.getId()));
+        fields.put("file_name", fsc.getName());
+        fields.put("ctime", fsc.getCtimeAsDate());
+        fields.put("atime", fsc.getAtimeAsDate());
+        fields.put("mtime", fsc.getMtimeAsDate());
+        fields.put("crtime", fsc.getMtimeAsDate());
+        return fields;
+    }
+
+
+    /**
+     * Common delegate method actually doing the work for objects implementing ContentStream
+     *
+     * @param cs ContentStream to ingest
+     * @param fields content specific fields
+     * @throws IngesterException if there was an error processing a specific
+     * content, but the Solr server is probably fine.
+     */
+    private void ingest(ContentStream cs, Map fields) throws IngesterException {
         ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
-        up.addContentStream(new FscContentStream(f));
+        up.addContentStream(cs);
         setFields(up, fields);
 
         up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);