From c3a2f3a13ceae5514a672ed0fb9ffe4fb15a8b53 Mon Sep 17 00:00:00 2001 From: adam-m Date: Mon, 14 May 2012 17:37:27 -0400 Subject: [PATCH] Preliminary version of large files string extraction support - no full hit navigation yet. Also, some refactoring and cleanup of old keyword search code. --- .../datamodel/FsContentStringStream.java | 26 +- .../release/solr/solr/conf/schema.xml | 5 +- .../autopsy/keywordsearch/Bundle.properties | 8 + .../keywordsearch/ByteContentStream.java | 102 ++++++++ .../autopsy/keywordsearch/ContentHit.java | 113 ++++++++ .../keywordsearch/ExtractedContentPaging.java | 147 +++++++++++ .../keywordsearch/ExtractedContentPanel.form | 137 +++++++++- .../keywordsearch/ExtractedContentPanel.java | 174 +++++++++++-- .../keywordsearch/ExtractedContentViewer.java | 243 ++++++++++++++---- .../autopsy/keywordsearch/FileExtract.java | 157 +++++++++++ .../FsContentStringContentStream.java | 12 +- .../HighlightedMatchesSource.java | 38 ++- .../autopsy/keywordsearch/Ingester.java | 100 +++++-- .../KeywordSearchFilterNode.java | 6 +- .../KeywordSearchIngestService.java | 144 ++++++----- .../keywordsearch/KeywordSearchNode.java | 11 +- .../keywordsearch/KeywordSearchQuery.java | 11 +- .../KeywordSearchQueryManager.java | 11 +- .../KeywordSearchResultFactory.java | 97 ++++--- .../autopsy/keywordsearch/LuceneQuery.java | 140 ++++++---- .../autopsy/keywordsearch/MarkupSource.java | 10 +- .../autopsy/keywordsearch/Server.java | 161 +++++++++++- .../keywordsearch/TermComponentQuery.java | 33 +-- 23 files changed, 1582 insertions(+), 304 deletions(-) create mode 100644 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteContentStream.java create mode 100644 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ContentHit.java create mode 100644 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPaging.java create mode 100644 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileExtract.java diff --git 
a/DataModel/src/org/sleuthkit/autopsy/datamodel/FsContentStringStream.java b/DataModel/src/org/sleuthkit/autopsy/datamodel/FsContentStringStream.java index f8c336b0b2..7ba0a269a2 100644 --- a/DataModel/src/org/sleuthkit/autopsy/datamodel/FsContentStringStream.java +++ b/DataModel/src/org/sleuthkit/autopsy/datamodel/FsContentStringStream.java @@ -43,8 +43,13 @@ public class FsContentStringStream extends InputStream { } }, }; + + //args private FsContent content; private String encoding; + private boolean preserveOnBuffBoundary; + + //internal data private long contentOffset = 0; //offset in fscontent read into curReadBuf private static final int READ_BUF_SIZE = 256; private static final byte[] curReadBuf = new byte[READ_BUF_SIZE]; @@ -64,15 +69,28 @@ public class FsContentStringStream extends InputStream { private static final Logger logger = Logger.getLogger(FsContentStringStream.class.getName()); /** - * + * Construct new string stream from FsContent * @param content to extract strings from - * @param encoding target encoding, current only ASCII supported + * @param encoding target encoding, currently UTF-8 + * @param preserveOnBuffBoundary whether to preserve or split string on a buffer boundary. If false, will pack into read buffer up to max. possible, potentially splitting a string. If true, the string will be preserved for next read. 
*/ - public FsContentStringStream(FsContent content, Encoding encoding) { + public FsContentStringStream(FsContent content, Encoding encoding, boolean preserveOnBuffBoundary) { this.content = content; this.encoding = encoding.toString(); + this.preserveOnBuffBoundary = preserveOnBuffBoundary; //logger.log(Level.INFO, "FILE: " + content.getParentPath() + "/" + content.getName()); } + + /** + * Construct new string stream from FsContent + * Fills the read buffer as fully as possible, even if that splits a string on a buffer boundary (preserveOnBuffBoundary is false) + * + * @param content to extract strings from + * @param encoding target encoding, currently UTF-8 + */ + public FsContentStringStream(FsContent content, Encoding encoding) { + this(content, encoding, false); + } @Override public int read(byte[] b, int off, int len) throws IOException { @@ -190,7 +208,7 @@ public class FsContentStringStream extends InputStream { //check if temp still has chars to qualify as a string //we might need to break up temp into 2 parts for next read() call //consume as many as possible to fill entire user buffer - if (tempStringLen >= MIN_PRINTABLE_CHARS) { + if (!this.preserveOnBuffBoundary && tempStringLen >= MIN_PRINTABLE_CHARS) { if (newCurLen > len) { int appendChars = len - curStringLen; //save part for next user read(), need to break up temp string diff --git a/KeywordSearch/release/solr/solr/conf/schema.xml b/KeywordSearch/release/solr/solr/conf/schema.xml index 0436a567a9..5c5c910eb9 100644 --- a/KeywordSearch/release/solr/solr/conf/schema.xml +++ b/KeywordSearch/release/solr/solr/conf/schema.xml @@ -504,7 +504,10 @@ - + + + +