mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-06 21:00:22 +00:00
TSK-267 - Extract English strings from smallish unknown files
- first iteration, for unknown content < 10MB
This commit is contained in:
parent
87cc9d5476
commit
5285f1c6c8
@ -0,0 +1,110 @@
|
|||||||
|
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.solr.common.util.ContentStream;
import org.sleuthkit.autopsy.datamodel.DataConversion;
import org.sleuthkit.datamodel.FsContent;
import org.sleuthkit.datamodel.TskException;

/**
 * Converter from FsContent into String with a specific encoding.
 * Then, an adapter back to Solr's ContentStream (which wraps an InputStream),
 * using the same encoding.
 *
 * Usage: construct, call {@link #convert()}, then hand to Solr as a
 * ContentStream. The stream accessors return nothing useful until
 * convert() has completed successfully.
 */
public class FsContentStringStream implements ContentStream {

    /**
     * Supported encodings; enum constant names match Java canonical
     * charset names so {@code toString()} can be passed to charset APIs.
     */
    public enum Encoding {

        ASCII
    };

    //minimum consecutive number of ASCII chars to qualify as a string
    private static final int MIN_ASCII_CHARS = 4;

    //input
    private final FsContent content;
    private final Encoding encoding;

    //converted state, populated by convert()
    private String convertedString;
    private InputStream convertedStream;
    private long convertedLength;

    private static final Logger logger = Logger.getLogger(FsContentStringStream.class.getName());

    /**
     * @param content  file content to extract strings from
     * @param encoding target encoding for the extracted strings
     */
    public FsContentStringStream(FsContent content, Encoding encoding) {
        this.content = content;
        this.encoding = encoding;
        convertedLength = 0;
    }

    /**
     * @return the FsContent this stream was built from
     */
    public FsContent getFsContent() {
        return content;
    }

    /**
     * Does all the work and delegation of extracting strings and converting
     * to an appropriate stream with the right encoding.
     *
     * NOTE(review): reads the entire file into memory; callers are expected
     * to have bounded the file size beforehand — confirm at call sites.
     *
     * @throws TskException if conversion failed for any reason
     */
    public void convert() throws TskException {
        //read entire content and extract strings
        long contentLen = content.getSize();
        byte[] data = content.read(0, contentLen);
        convertedString = DataConversion.getString(data, MIN_ASCII_CHARS);

        //convert the extracted string back to a byte stream with the same encoding
        try {
            byte[] bytes = convertedString.getBytes(encoding.toString());
            convertedLength = bytes.length;
            convertedStream = new ByteArrayInputStream(bytes);
        } catch (UnsupportedEncodingException e) {
            //log the exception object itself so the stack trace is not lost
            logger.log(Level.SEVERE, "Unsupported encoding " + encoding, e);
            throw new TskException("Unsupported encoding " + encoding);
        }
    }

    @Override
    public String getContentType() {
        return encoding.toString();
    }

    @Override
    public String getName() {
        return content.getName();
    }

    @Override
    public Reader getReader() throws IOException {
        if (convertedStream == null) {
            throw new UnsupportedOperationException("Not supported yet.");
        }
        //decode with the same charset used in convert(); the no-arg
        //InputStreamReader would use the platform default charset instead
        return new InputStreamReader(convertedStream, encoding.toString());
    }

    @Override
    public Long getSize() {
        return convertedLength;
    }

    @Override
    public String getSourceInfo() {
        return "File:" + content.getId();
    }

    @Override
    public InputStream getStream() throws IOException {
        return convertedStream;
    }
}
|
@ -42,6 +42,7 @@ import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
|||||||
import org.sleuthkit.datamodel.Content;
|
import org.sleuthkit.datamodel.Content;
|
||||||
import org.sleuthkit.datamodel.FsContent;
|
import org.sleuthkit.datamodel.FsContent;
|
||||||
import org.sleuthkit.datamodel.Image;
|
import org.sleuthkit.datamodel.Image;
|
||||||
|
import org.sleuthkit.datamodel.TskException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Action adds all supported files from the given Content object and its
|
* Action adds all supported files from the given Content object and its
|
||||||
@ -50,6 +51,8 @@ import org.sleuthkit.datamodel.Image;
|
|||||||
public class IndexContentFilesAction extends AbstractAction {
|
public class IndexContentFilesAction extends AbstractAction {
|
||||||
|
|
||||||
private static final Logger logger = Logger.getLogger(IndexContentFilesAction.class.getName());
|
private static final Logger logger = Logger.getLogger(IndexContentFilesAction.class.getName());
|
||||||
|
private static final int MAX_STRING_EXTRACT_SIZE = 10 * (1 << 10) * (1 << 10);
|
||||||
|
|
||||||
private Content c;
|
private Content c;
|
||||||
private String name;
|
private String name;
|
||||||
private Server.Core solrCore;
|
private Server.Core solrCore;
|
||||||
@ -95,11 +98,11 @@ public class IndexContentFilesAction extends AbstractAction {
|
|||||||
// track number complete or with errors
|
// track number complete or with errors
|
||||||
int fileCount = files.size();
|
int fileCount = files.size();
|
||||||
int finishedFiles = 0;
|
int finishedFiles = 0;
|
||||||
int problemFiles = 0;
|
int problemFilesCount = 0;
|
||||||
|
|
||||||
for (FsContent f : files) {
|
for (FsContent f : files) {
|
||||||
if (isCancelled()) {
|
if (isCancelled()) {
|
||||||
return problemFiles;
|
return problemFilesCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
this.publish("Indexing " + (finishedFiles + 1) + "/" + fileCount + ": " + f.getName());
|
this.publish("Indexing " + (finishedFiles + 1) + "/" + fileCount + ": " + f.getName());
|
||||||
@ -108,22 +111,30 @@ public class IndexContentFilesAction extends AbstractAction {
|
|||||||
ingester.ingest(f);
|
ingester.ingest(f);
|
||||||
} catch (IngesterException ex) {
|
} catch (IngesterException ex) {
|
||||||
logger.log(Level.INFO, "Ingester had a problem with file '" + f.getName() + "' (id: " + f.getId() + ").", ex);
|
logger.log(Level.INFO, "Ingester had a problem with file '" + f.getName() + "' (id: " + f.getId() + ").", ex);
|
||||||
problemFiles++;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
if (f.getSize() < MAX_STRING_EXTRACT_SIZE) {
|
||||||
|
logger.log(Level.INFO, "Will extract strings and re-ingest, from file '" + f.getName() + "' (id: " + f.getId() + ").");
|
||||||
|
if (!extractAndReingest(ingester, f)) {
|
||||||
|
problemFilesCount++;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
problemFilesCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
setProgress(++finishedFiles * 100 / fileCount);
|
setProgress(++finishedFiles * 100 / fileCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
ingester.commit();
|
ingester.commit();
|
||||||
|
|
||||||
|
//signal a potential change in number of indexed files
|
||||||
try {
|
try {
|
||||||
final int numIndexedFiles = KeywordSearch.getServer().getCore().queryNumIndexedFiles();
|
final int numIndexedFiles = KeywordSearch.getServer().getCore().queryNumIndexedFiles();
|
||||||
KeywordSearch.changeSupport.firePropertyChange(KeywordSearch.NUM_FILES_CHANGE_EVT, null, new Integer(numIndexedFiles));
|
KeywordSearch.changeSupport.firePropertyChange(KeywordSearch.NUM_FILES_CHANGE_EVT, null, new Integer(numIndexedFiles));
|
||||||
} catch (SolrServerException se) {
|
} catch (SolrServerException se) {
|
||||||
logger.log(Level.SEVERE, "Error executing Solr query, " + se.getMessage());
|
logger.log(Level.SEVERE, "Error executing Solr query to check number of indexed files: ", se);
|
||||||
}
|
}
|
||||||
|
|
||||||
return problemFiles;
|
return problemFilesCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -198,6 +209,21 @@ public class IndexContentFilesAction extends AbstractAction {
|
|||||||
popUpWindow.setVisible(true);
|
popUpWindow.setVisible(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean extractAndReingest(Ingester ingester, FsContent f) {
|
||||||
|
boolean success = false;
|
||||||
|
FsContentStringStream fscs = new FsContentStringStream(f, FsContentStringStream.Encoding.ASCII);
|
||||||
|
try {
|
||||||
|
fscs.convert();
|
||||||
|
ingester.ingest(fscs);
|
||||||
|
success = true;
|
||||||
|
} catch (TskException tskEx) {
|
||||||
|
logger.log(Level.INFO, "Problem extracting string from file: '" + f.getName() + "' (id: " + f.getId() + ").", tskEx);
|
||||||
|
} catch (IngesterException ingEx) {
|
||||||
|
logger.log(Level.INFO, "Ingester had a problem with extracted strings from file '" + f.getName() + "' (id: " + f.getId() + ").", ingEx);
|
||||||
|
}
|
||||||
|
return success;
|
||||||
|
}
|
||||||
|
|
||||||
private void displayProblemFilesDialog(int problemFiles) {
|
private void displayProblemFilesDialog(int problemFiles) {
|
||||||
final Component parentComponent = null; // Use default window frame.
|
final Component parentComponent = null; // Use default window frame.
|
||||||
final String message = "Had trouble indexing " + problemFiles + " of the files. See the log for details.";
|
final String message = "Had trouble indexing " + problemFiles + " of the files. See the log for details.";
|
||||||
|
@ -57,6 +57,19 @@ class Ingester {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sends a file to Solr to have its content extracted and added to the
|
||||||
|
* index. commit() should be called once you're done ingesting files.
|
||||||
|
*
|
||||||
|
* @param fcs File FsContentStringStream to ingest
|
||||||
|
* @throws IngesterException if there was an error processing a specific
|
||||||
|
* file, but the Solr server is probably fine.
|
||||||
|
*/
|
||||||
|
public void ingest(FsContentStringStream fcs) throws IngesterException {
|
||||||
|
ingest(fcs, getFsContentFields(fcs.getFsContent()));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sends a file to Solr to have its content extracted and added to the
|
* Sends a file to Solr to have its content extracted and added to the
|
||||||
* index. commit() should be called once you're done ingesting files.
|
* index. commit() should be called once you're done ingesting files.
|
||||||
@ -65,17 +78,38 @@ class Ingester {
|
|||||||
* @throws IngesterException if there was an error processing a specific
|
* @throws IngesterException if there was an error processing a specific
|
||||||
* file, but the Solr server is probably fine.
|
* file, but the Solr server is probably fine.
|
||||||
*/
|
*/
|
||||||
void ingest(FsContent f) throws IngesterException {
|
public void ingest(FsContent f) throws IngesterException {
|
||||||
Map<String, String> fields = new HashMap<String, String>();
|
ingest(new FscContentStream(f), getFsContentFields(f));
|
||||||
fields.put("id", Long.toString(f.getId()));
|
}
|
||||||
fields.put("file_name", f.getName());
|
|
||||||
fields.put("ctime", f.getCtimeAsDate());
|
|
||||||
fields.put("atime", f.getAtimeAsDate());
|
|
||||||
fields.put("mtime", f.getMtimeAsDate());
|
|
||||||
fields.put("crtime", f.getMtimeAsDate());
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a field map from FsContent, that is later sent to Solr
|
||||||
|
* @param fsc FsContent to get fields from
|
||||||
|
* @return the map
|
||||||
|
*/
|
||||||
|
private Map<String, String> getFsContentFields(FsContent fsc) {
|
||||||
|
Map<String, String> fields = new HashMap<String, String>();
|
||||||
|
fields.put("id", Long.toString(fsc.getId()));
|
||||||
|
fields.put("file_name", fsc.getName());
|
||||||
|
fields.put("ctime", fsc.getCtimeAsDate());
|
||||||
|
fields.put("atime", fsc.getAtimeAsDate());
|
||||||
|
fields.put("mtime", fsc.getMtimeAsDate());
|
||||||
|
fields.put("crtime", fsc.getMtimeAsDate());
|
||||||
|
return fields;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Common delegate method actually doing the work for objects implementing ContentStream
|
||||||
|
*
|
||||||
|
* @param ContentStream to ingest
|
||||||
|
* @param fields content specific fields
|
||||||
|
* @throws IngesterException if there was an error processing a specific
|
||||||
|
* content, but the Solr server is probably fine.
|
||||||
|
*/
|
||||||
|
private void ingest(ContentStream cs, Map<String, String> fields) throws IngesterException {
|
||||||
ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
|
ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
|
||||||
up.addContentStream(new FscContentStream(f));
|
up.addContentStream(cs);
|
||||||
setFields(up, fields);
|
setFields(up, fields);
|
||||||
up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);
|
up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user