TSK-267 - Extract English strings from smallish unknown files

- first iteration, for unknown content < 10MB
adam-m 2011-12-15 13:23:05 -05:00
parent 87cc9d5476
commit 5285f1c6c8
3 changed files with 185 additions and 15 deletions

org/sleuthkit/autopsy/keywordsearch/FsContentStringStream.java

@@ -0,0 +1,110 @@
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.autopsy.datamodel.DataConversion;
import org.sleuthkit.datamodel.FsContent;
import org.sleuthkit.datamodel.TskException;

/**
 * Converts an FsContent object into a string with a specific encoding,
 * then adapts it back to a Solr ContentStream (a specialized InputStream)
 * using the same encoding.
 */
public class FsContentStringStream implements ContentStream {

    //supported encodings; enum names match Java charset names
    public static enum Encoding { ASCII };

    private static final int MIN_ASCII_CHARS = 4; //minimum run of consecutive ASCII chars to qualify as a string

    //input
    private FsContent content;
    private Encoding encoding;

    //converted
    private String convertedString;
    private InputStream convertedStream;
    private long convertedLength;

    private static Logger logger = Logger.getLogger(FsContentStringStream.class.getName());

    public FsContentStringStream(FsContent content, Encoding encoding) {
        this.content = content;
        this.encoding = encoding;
        convertedLength = 0;
    }

    public FsContent getFsContent() {
        return content;
    }

    /**
     * Does all the work of extracting strings and converting them
     * to a stream with the right encoding.
     * @throws TskException if the conversion failed for any reason
     */
    public void convert() throws TskException {
        //read the entire content and extract strings
        long contentLen = content.getSize();
        byte[] data = content.read(0, contentLen);
        convertedString = DataConversion.getString(data, MIN_ASCII_CHARS);

        //convert the extracted string back to a byte stream with the same encoding
        try {
            byte[] bytes = convertedString.getBytes(encoding.toString());
            convertedLength = bytes.length;
            convertedStream = new ByteArrayInputStream(bytes);
        } catch (UnsupportedEncodingException e) {
            logger.log(Level.SEVERE, "Unsupported encoding " + encoding, e);
            throw new TskException("Unsupported encoding " + encoding);
        }
    }

    @Override
    public String getContentType() {
        return encoding.toString();
    }

    @Override
    public String getName() {
        return content.getName();
    }

    @Override
    public Reader getReader() throws IOException {
        if (convertedStream == null) {
            throw new UnsupportedOperationException("Content has not been converted; call convert() first.");
        }
        return new InputStreamReader(convertedStream);
    }

    @Override
    public Long getSize() {
        return convertedLength;
    }

    @Override
    public String getSourceInfo() {
        return "File:" + content.getId();
    }

    @Override
    public InputStream getStream() throws IOException {
        return convertedStream;
    }
}
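
DataConversion.getString() itself is not part of this diff. As a rough standalone sketch of the kind of extraction it is assumed to perform here, collecting runs of at least MIN_ASCII_CHARS consecutive printable ASCII bytes (class and method names below are illustrative, not the actual DataConversion code):

public class AsciiRunExtractor {

    /**
     * Collects runs of at least minChars consecutive printable ASCII
     * characters, one run per line, an approximation of what
     * DataConversion.getString(data, MIN_ASCII_CHARS) is assumed to do.
     */
    public static String extract(byte[] data, int minChars) {
        StringBuilder out = new StringBuilder();
        StringBuilder run = new StringBuilder();
        for (byte b : data) {
            char c = (char) (b & 0xFF);
            if (c >= 0x20 && c <= 0x7E) { //printable ASCII range
                run.append(c);
            } else {
                if (run.length() >= minChars) {
                    out.append(run).append('\n');
                }
                run.setLength(0); //reset the current run
            }
        }
        if (run.length() >= minChars) {
            out.append(run).append('\n');
        }
        return out.toString();
    }

    public static void main(String[] args) {
        byte[] sample = {'H', 'i', 0, 1, 'h', 'e', 'l', 'l', 'o', 2};
        System.out.print(extract(sample, 4)); //prints just "hello"
    }
}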

org/sleuthkit/autopsy/keywordsearch/IndexContentFilesAction.java

@@ -42,6 +42,7 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.FsContent;
 import org.sleuthkit.datamodel.Image;
+import org.sleuthkit.datamodel.TskException;
 
 /**
  * Action adds all supported files from the given Content object and its
@@ -50,6 +51,8 @@ import org.sleuthkit.datamodel.Image;
 public class IndexContentFilesAction extends AbstractAction {
 
     private static final Logger logger = Logger.getLogger(IndexContentFilesAction.class.getName());
+    private static final int MAX_STRING_EXTRACT_SIZE = 10 * (1 << 10) * (1 << 10);
+
     private Content c;
     private String name;
     private Server.Core solrCore;
@@ -95,11 +98,11 @@ public class IndexContentFilesAction extends AbstractAction {
         // track number complete or with errors
         int fileCount = files.size();
         int finishedFiles = 0;
-        int problemFiles = 0;
+        int problemFilesCount = 0;
 
         for (FsContent f : files) {
             if (isCancelled()) {
-                return problemFiles;
+                return problemFilesCount;
             }
 
             this.publish("Indexing " + (finishedFiles + 1) + "/" + fileCount + ": " + f.getName());
@@ -108,22 +111,30 @@ public class IndexContentFilesAction extends AbstractAction {
                 ingester.ingest(f);
             } catch (IngesterException ex) {
                 logger.log(Level.INFO, "Ingester had a problem with file '" + f.getName() + "' (id: " + f.getId() + ").", ex);
-                problemFiles++;
-            }
+
+                if (f.getSize() < MAX_STRING_EXTRACT_SIZE) {
+                    logger.log(Level.INFO, "Will extract strings and re-ingest from file '" + f.getName() + "' (id: " + f.getId() + ").");
+                    if (!extractAndReingest(ingester, f)) {
+                        problemFilesCount++;
+                    }
+                } else {
+                    problemFilesCount++;
+                }
+            }
 
             setProgress(++finishedFiles * 100 / fileCount);
         }
 
         ingester.commit();
 
+        //signal a potential change in number of indexed files
         try {
             final int numIndexedFiles = KeywordSearch.getServer().getCore().queryNumIndexedFiles();
             KeywordSearch.changeSupport.firePropertyChange(KeywordSearch.NUM_FILES_CHANGE_EVT, null, new Integer(numIndexedFiles));
         } catch (SolrServerException se) {
-            logger.log(Level.SEVERE, "Error executing Solr query, " + se.getMessage());
+            logger.log(Level.SEVERE, "Error executing Solr query to check the number of indexed files", se);
         }
 
-        return problemFiles;
+        return problemFilesCount;
     }
 
     @Override
@@ -198,6 +209,21 @@ public class IndexContentFilesAction extends AbstractAction {
         popUpWindow.setVisible(true);
     }
 
+    private boolean extractAndReingest(Ingester ingester, FsContent f) {
+        boolean success = false;
+        FsContentStringStream fscs = new FsContentStringStream(f, FsContentStringStream.Encoding.ASCII);
+        try {
+            fscs.convert();
+            ingester.ingest(fscs);
+            success = true;
+        } catch (TskException tskEx) {
+            logger.log(Level.INFO, "Problem extracting strings from file '" + f.getName() + "' (id: " + f.getId() + ").", tskEx);
+        } catch (IngesterException ingEx) {
+            logger.log(Level.INFO, "Ingester had a problem with extracted strings from file '" + f.getName() + "' (id: " + f.getId() + ").", ingEx);
+        }
+        return success;
+    }
+
     private void displayProblemFilesDialog(int problemFiles) {
         final Component parentComponent = null; // Use default window frame.
         final String message = "Had trouble indexing " + problemFiles + " of the files. See the log for details.";
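
For reference, MAX_STRING_EXTRACT_SIZE above evaluates to 10 * 2^10 * 2^10 = 10,485,760 bytes (10 MiB), matching the "< 10MB" note in the commit message. A quick standalone check of the size gate (class name hypothetical, not part of the commit):

public class SizeGateCheck {

    private static final int MAX_STRING_EXTRACT_SIZE = 10 * (1 << 10) * (1 << 10);

    public static void main(String[] args) {
        System.out.println(MAX_STRING_EXTRACT_SIZE);             //10485760
        System.out.println(9000000L < MAX_STRING_EXTRACT_SIZE);  //true: eligible for string extraction
        System.out.println(20000000L < MAX_STRING_EXTRACT_SIZE); //false: counted as a problem file
    }
}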

View File

@@ -57,6 +57,19 @@ class Ingester {
         }
     }
 
+    /**
+     * Sends a file to Solr to have its content extracted and added to the
+     * index. commit() should be called once you're done ingesting files.
+     *
+     * @param fcs FsContentStringStream to ingest
+     * @throws IngesterException if there was an error processing a specific
+     * file, but the Solr server is probably fine.
+     */
+    public void ingest(FsContentStringStream fcs) throws IngesterException {
+        ingest(fcs, getFsContentFields(fcs.getFsContent()));
+    }
+
     /**
      * Sends a file to Solr to have its content extracted and added to the
      * index. commit() should be called once you're done ingesting files.
@@ -65,17 +78,38 @@ class Ingester {
      * @throws IngesterException if there was an error processing a specific
      * file, but the Solr server is probably fine.
      */
-    void ingest(FsContent f) throws IngesterException {
-        Map<String, String> fields = new HashMap<String, String>();
-        fields.put("id", Long.toString(f.getId()));
-        fields.put("file_name", f.getName());
-        fields.put("ctime", f.getCtimeAsDate());
-        fields.put("atime", f.getAtimeAsDate());
-        fields.put("mtime", f.getMtimeAsDate());
-        fields.put("crtime", f.getMtimeAsDate());
+    public void ingest(FsContent f) throws IngesterException {
+        ingest(new FscContentStream(f), getFsContentFields(f));
+    }
+
+    /**
+     * Creates a field map from FsContent that is later sent to Solr
+     * @param fsc FsContent to get fields from
+     * @return the map
+     */
+    private Map<String, String> getFsContentFields(FsContent fsc) {
+        Map<String, String> fields = new HashMap<String, String>();
+        fields.put("id", Long.toString(fsc.getId()));
+        fields.put("file_name", fsc.getName());
+        fields.put("ctime", fsc.getCtimeAsDate());
+        fields.put("atime", fsc.getAtimeAsDate());
+        fields.put("mtime", fsc.getMtimeAsDate());
+        fields.put("crtime", fsc.getCrtimeAsDate());
+        return fields;
+    }
+
+    /**
+     * Common delegate method that does the actual ingest work for objects
+     * implementing ContentStream
+     *
+     * @param cs ContentStream to ingest
+     * @param fields content-specific fields
+     * @throws IngesterException if there was an error processing a specific
+     * content, but the Solr server is probably fine.
+     */
+    private void ingest(ContentStream cs, Map<String, String> fields) throws IngesterException {
         ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
-        up.addContentStream(new FscContentStream(f));
+        up.addContentStream(cs);
         setFields(up, fields);
         up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);
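
Taken together, the intended call sequence for the new string-extraction path looks roughly like the sketch below. It mirrors extractAndReingest() above rather than adding anything new; the class name is hypothetical, it is assumed to live in the same keywordsearch package (Ingester is package-private), and the Ingester and FsContent instances are assumed to be supplied by the existing indexing code:

package org.sleuthkit.autopsy.keywordsearch;

import java.util.logging.Level;
import java.util.logging.Logger;
import org.sleuthkit.datamodel.FsContent;
import org.sleuthkit.datamodel.TskException;

class StringReingestSketch {

    private static final Logger logger = Logger.getLogger(StringReingestSketch.class.getName());

    static boolean reingestAsStrings(Ingester ingester, FsContent file) {
        FsContentStringStream fscs =
                new FsContentStringStream(file, FsContentStringStream.Encoding.ASCII);
        try {
            fscs.convert();        //read the file and extract ASCII strings
            ingester.ingest(fscs); //send the extracted text to Solr
            ingester.commit();     //flush once done ingesting
            return true;
        } catch (TskException ex) {
            logger.log(Level.INFO, "String extraction failed for " + file.getName(), ex);
        } catch (Ingester.IngesterException ex) {
            logger.log(Level.INFO, "Ingest of extracted strings failed for " + file.getName(), ex);
        }
        return false;
    }
}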