From 5285f1c6c882585fb051777536b6aaccf393c8ac Mon Sep 17 00:00:00 2001
From: adam-m
Date: Thu, 15 Dec 2011 13:23:05 -0500
Subject: [PATCH] TSK-267 - Extract English strings from smallish unknown files
 - first iteration, for unknown content < 10MB

---
 .../keywordsearch/FsContentStringStream.java  | 110 ++++++++++++++++++
 .../IndexContentFilesAction.java              |  38 +++++++-
 .../autopsy/keywordsearch/Ingester.java       |  52 +++++++--
 3 files changed, 185 insertions(+), 15 deletions(-)
 create mode 100644 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java
new file mode 100644
index 0000000000..67ae6e6b59
--- /dev/null
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java
@@ -0,0 +1,110 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.apache.solr.common.util.ContentStream;
+import org.sleuthkit.autopsy.datamodel.DataConversion;
+import org.sleuthkit.datamodel.FsContent;
+import org.sleuthkit.datamodel.TskException;
+
+/**
+ * Converter from FsContent into String with specific encoding
+ * Then, an adapter back to Solr's ContentStream (which is a specific InputStream),
+ * using the same encoding
+ */
+public class FsContentStringStream implements ContentStream {
+    //supported encoding, encoding string names match java canonical names
+    public static enum Encoding {ASCII,};
+
+    private static final int MIN_ASCII_CHARS = 4; //minimum consecutive number of ASCII chars to qualify as string
+
+    //input
+    private FsContent content;
+    private Encoding encoding;
+
+    //converted
+    private String convertedString;
+    private InputStream convertedStream;
+    private long convertedLength;
+
+    private static Logger logger = Logger.getLogger(FsContentStringStream.class.getName());
+
+    public FsContentStringStream(FsContent content, Encoding encoding) {
+        this.content = content;
+        this.encoding = encoding;
+        convertedLength = 0;
+    }
+
+    public FsContent getFsContent() {
+        return content;
+    }
+
+
+    /**
+     * Does all the work and delegation of extracting string and converting
+     * to appropriate stream with the right encoding
+     * @throws TskException if conversion failed for any reason
+     */
+    public void convert() throws TskException {
+        //read entire content and extract strings
+        long contentLen = content.getSize();
+        byte [] data = content.read(0, contentLen);
+        convertedString = DataConversion.getString(data, MIN_ASCII_CHARS);
+
+        //convert the extracted string back to byte stream with the same encoding
+        try {
+            byte [] bytes = convertedString.getBytes(encoding.toString());
+            convertedLength = bytes.length;
+            convertedStream = new ByteArrayInputStream(bytes);
+        }
+        catch (UnsupportedEncodingException e) {
+            logger.log(Level.SEVERE, "Unsupported encoding " + encoding);
+            throw new TskException("Unsupported encoding " + encoding);
+        }
+
+    }
+
+    @Override
+    public String getContentType() {
+        return encoding.toString();
+    }
+
+    @Override
+    public String getName() {
+        return content.getName();
+    }
+
+    @Override
+    public Reader getReader() throws IOException {
+        if (convertedStream == null)
+            throw new UnsupportedOperationException("Not supported yet.");
+        return new InputStreamReader(convertedStream);
+
+    }
+
+    @Override
+    public Long getSize() {
+        return convertedLength;
+    }
+
+    @Override
+    public String getSourceInfo() {
+        return "File:" + content.getId();
+    }
+
+    @Override
+    public InputStream getStream() throws IOException {
+        return convertedStream;
+    }
+
+}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexContentFilesAction.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexContentFilesAction.java
index 1a99204f6d..24d22dc805 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexContentFilesAction.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexContentFilesAction.java
@@ -42,6 +42,7 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.FsContent;
 import org.sleuthkit.datamodel.Image;
+import org.sleuthkit.datamodel.TskException;
 
 /**
  * Action adds all supported files from the given Content object and its
@@ -50,6 +51,8 @@ import org.sleuthkit.datamodel.Image;
 public class IndexContentFilesAction extends AbstractAction {
 
     private static final Logger logger = Logger.getLogger(IndexContentFilesAction.class.getName());
+    private static final int MAX_STRING_EXTRACT_SIZE = 10 * (1 << 10) * (1 << 10);
+
     private Content c;
     private String name;
     private Server.Core solrCore;
@@ -95,11 +98,11 @@ public class IndexContentFilesAction extends AbstractAction {
                 // track number complete or with errors
                 int fileCount = files.size();
                 int finishedFiles = 0;
-                int problemFiles = 0;
+                int problemFilesCount = 0;
 
                 for (FsContent f : files) {
                     if (isCancelled()) {
-                        return problemFiles;
+                        return problemFilesCount;
                     }
                     this.publish("Indexing " + (finishedFiles + 1) + "/" + fileCount + ": " + f.getName());
@@ -108,22 +111,30 @@ public class IndexContentFilesAction extends AbstractAction {
                     try {
                         ingester.ingest(f);
                     } catch (IngesterException ex) {
                         logger.log(Level.INFO, "Ingester had a problem with file '" + f.getName() + "' (id: " + f.getId() + ").", ex);
-                        problemFiles++;
-                    }
+                        if (f.getSize() < MAX_STRING_EXTRACT_SIZE) {
+                            logger.log(Level.INFO, "Will extract strings and re-ingest, from file '" + f.getName() + "' (id: " + f.getId() + ").");
+                            if (!extractAndReingest(ingester, f)) {
+                                problemFilesCount++;
+                            }
+                        } else {
+                            problemFilesCount++;
+                        }
+                    }
                     setProgress(++finishedFiles * 100 / fileCount);
                 }
 
                 ingester.commit();
 
+                //signal a potential change in number of indexed files
                 try {
                     final int numIndexedFiles = KeywordSearch.getServer().getCore().queryNumIndexedFiles();
                     KeywordSearch.changeSupport.firePropertyChange(KeywordSearch.NUM_FILES_CHANGE_EVT, null, new Integer(numIndexedFiles));
                 } catch (SolrServerException se) {
-                    logger.log(Level.SEVERE, "Error executing Solr query, " + se.getMessage());
+                    logger.log(Level.SEVERE, "Error executing Solr query to check number of indexed files: ", se);
                 }
 
-                return problemFiles;
+                return problemFilesCount;
             }
 
             @Override
@@ -198,6 +209,21 @@ public class IndexContentFilesAction extends AbstractAction {
         popUpWindow.setVisible(true);
     }
 
+    private boolean extractAndReingest(Ingester ingester, FsContent f) {
+        boolean success = false;
+        FsContentStringStream fscs = new FsContentStringStream(f, FsContentStringStream.Encoding.ASCII);
+        try {
+            fscs.convert();
+            ingester.ingest(fscs);
+            success = true;
+        } catch (TskException tskEx) {
+            logger.log(Level.INFO, "Problem extracting string from file: '" + f.getName() + "' (id: " + f.getId() + ").", tskEx);
+        } catch (IngesterException ingEx) {
+            logger.log(Level.INFO, "Ingester had a problem with extracted strings from file '" + f.getName() + "' (id: " + f.getId() + ").", ingEx);
+        }
+        return success;
+    }
+
     private void displayProblemFilesDialog(int problemFiles) {
         final Component parentComponent = null; // Use default window frame.
         final String message = "Had trouble indexing " + problemFiles + " of the files. See the log for details.";
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
index 0d64d078f5..53d25016cb 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java
@@ -57,6 +57,19 @@ class Ingester {
         }
     }
 
+
+    /**
+     * Sends a file to Solr to have its content extracted and added to the
+     * index. commit() should be called once you're done ingesting files.
+     *
+     * @param fcs FsContentStringStream to ingest
+     * @throws IngesterException if there was an error processing a specific
+     * file, but the Solr server is probably fine.
+     */
+    public void ingest(FsContentStringStream fcs) throws IngesterException {
+        ingest(fcs, getFsContentFields(fcs.getFsContent()));
+    }
+
     /**
      * Sends a file to Solr to have its content extracted and added to the
      * index. commit() should be called once you're done ingesting files.
@@ -65,17 +78,38 @@ class Ingester {
      * @throws IngesterException if there was an error processing a specific
      * file, but the Solr server is probably fine.
      */
-    void ingest(FsContent f) throws IngesterException {
+    public void ingest(FsContent f) throws IngesterException {
+        ingest(new FscContentStream(f), getFsContentFields(f));
+    }
+
+    /**
+     * Creates a field map from FsContent, that is later sent to Solr
+     * @param fsc FsContent to get fields from
+     * @return the map
+     */
+    private Map getFsContentFields(FsContent fsc) {
         Map fields = new HashMap();
-        fields.put("id", Long.toString(f.getId()));
-        fields.put("file_name", f.getName());
-        fields.put("ctime", f.getCtimeAsDate());
-        fields.put("atime", f.getAtimeAsDate());
-        fields.put("mtime", f.getMtimeAsDate());
-        fields.put("crtime", f.getMtimeAsDate());
-
+        fields.put("id", Long.toString(fsc.getId()));
+        fields.put("file_name", fsc.getName());
+        fields.put("ctime", fsc.getCtimeAsDate());
+        fields.put("atime", fsc.getAtimeAsDate());
+        fields.put("mtime", fsc.getMtimeAsDate());
+        fields.put("crtime", fsc.getMtimeAsDate());
+        return fields;
+    }
+
+
+    /**
+     * Common delegate method actually doing the work for objects implementing ContentStream
+     *
+     * @param cs ContentStream to ingest
+     * @param fields content specific fields
+     * @throws IngesterException if there was an error processing a specific
+     * content, but the Solr server is probably fine.
+     */
+    private void ingest(ContentStream cs, Map fields) throws IngesterException {
         ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
-        up.addContentStream(new FscContentStream(f));
+        up.addContentStream(cs);
         setFields(up, fields);
 
         up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);