From d5e7c520c969d294fddc6a48ecfe05ae1c0c7787 Mon Sep 17 00:00:00 2001
From: millmanorama
Date: Wed, 30 Nov 2016 10:48:30 +0100
Subject: [PATCH 01/21] RegressionTest sets datasource time zone to "America/New_York"

---
 .../src/org/sleuthkit/autopsy/testing/RegressionTest.java | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Testing/test/qa-functional/src/org/sleuthkit/autopsy/testing/RegressionTest.java b/Testing/test/qa-functional/src/org/sleuthkit/autopsy/testing/RegressionTest.java
index 12e04aad94..33d73c2044 100755
--- a/Testing/test/qa-functional/src/org/sleuthkit/autopsy/testing/RegressionTest.java
+++ b/Testing/test/qa-functional/src/org/sleuthkit/autopsy/testing/RegressionTest.java
@@ -35,7 +35,6 @@ import java.util.logging.Level;
 import java.util.logging.Logger;
 import javax.imageio.ImageIO;
 import javax.swing.JDialog;
-import javax.swing.JLabel;
 import javax.swing.JTextField;
 import junit.framework.Test;
 import junit.framework.TestCase;
@@ -50,10 +49,10 @@ import org.netbeans.jemmy.operators.JComboBoxOperator;
 import org.netbeans.jemmy.operators.JDialogOperator;
 import org.netbeans.jemmy.operators.JFileChooserOperator;
 import org.netbeans.jemmy.operators.JLabelOperator;
+import org.netbeans.jemmy.operators.JListOperator;
 import org.netbeans.jemmy.operators.JTabbedPaneOperator;
 import org.netbeans.jemmy.operators.JTableOperator;
 import org.netbeans.jemmy.operators.JTextFieldOperator;
-import org.netbeans.jemmy.operators.JListOperator;
 import org.netbeans.junit.NbModuleSuite;
 import org.sleuthkit.autopsy.ingest.IngestManager;
 
@@ -186,6 +185,8 @@ public class RegressionTest extends TestCase {
         String img_path = getEscapedPath(System.getProperty("img_path"));
         String imageDir = img_path;
         ((JTextField) jtfo0.getSource()).setText(imageDir);
+        JComboBoxOperator comboBoxOperator = new JComboBoxOperator(wo, 1);
+        comboBoxOperator.setSelectedItem("(GMT-5:00) America/New_York");
         wo.btNext().clickMouse();
     }
 

From 0f1f8b22116a1e388d3fa69b8cce350a06497169 Mon Sep 17 00:00:00 2001
From: millmanorama
Date: Mon, 12 Dec 2016 15:41:24 +0100
Subject: [PATCH 02/21] refactor common chunking algorithm into TextExtractorBase, remove AbstractFileChunk

---
 .../keywordsearch/AbstractFileChunk.java      |  91 ---
 .../keywordsearch/ByteContentStream.java      |  18 +-
 .../keywordsearch/HtmlTextExtractor.java      | 295 +++++-----
 .../autopsy/keywordsearch/Ingester.java       | 171 ++----
 .../KeywordSearchIngestModule.java            |   4 +-
 .../keywordsearch/StringsTextExtractor.java   | 553 ++++++++++++++----
 .../autopsy/keywordsearch/TextExtractor.java  |  63 +-
 .../keywordsearch/TextExtractorBase.java      | 149 +++++
 .../keywordsearch/TikaTextExtractor.java      | 296 ++--------
 9 files changed, 850 insertions(+), 790 deletions(-)
 delete mode 100644 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java
 create mode 100644 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractorBase.java

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java
deleted file mode 100644
index 5253e5e240..0000000000
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileChunk.java
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Autopsy Forensic Browser
- *
- * Copyright 2011-2016 Basis Technology Corp.
- * Contact: carrier sleuthkit org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.sleuthkit.autopsy.keywordsearch;
-
-import java.nio.charset.Charset;
-import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
-
-/**
- * A representation of a chunk of text from a file that can be used, when
- * supplied with an Ingester, to index the chunk for search.
- */
-final class AbstractFileChunk {
-
-    private final int chunkNumber;
-    private final TextExtractor textExtractor;
-
-    /**
-     * Constructs a representation of a chunk of text from a file that can be
-     * used, when supplied with an Ingester, to index the chunk for search.
-     *
-     * @param textExtractor A TextExtractor for the file.
-     * @param chunkNumber A sequence number for the chunk.
-     */
-    AbstractFileChunk(TextExtractor textExtractor, int chunkNumber) {
-        this.textExtractor = textExtractor;
-        this.chunkNumber = chunkNumber;
-    }
-
-    /**
-     * Gets the TextExtractor for the source file of the text chunk.
-     *
-     * @return A reference to the TextExtractor.
-     */
-    TextExtractor getTextExtractor() {
-        return textExtractor;
-    }
-
-    /**
-     * Gets the sequence number of the text chunk.
-     *
-     * @return The chunk number.
-     */
-    int getChunkNumber() {
-        return chunkNumber;
-    }
-
-    /**
-     * Gets the id of the text chunk.
-     *
-     * @return An id of the form [source file object id]_[chunk number]
-     */
-    String getChunkId() {
-        return Server.getChunkIdString(this.textExtractor.getSourceFile().getId(), this.chunkNumber);
-    }
-
-    /**
-     * Indexes the text chunk.
-     *
-     * @param ingester An Ingester to do the indexing.
-     * @param chunkBytes The raw bytes of the text chunk.
-     * @param chunkSize The size of the text chunk in bytes.
-     * @param charSet The char set to use during indexing.
- * - * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException - */ - void index(Ingester ingester, byte[] chunkBytes, long chunkSize, Charset charSet) throws IngesterException { - ByteContentStream bcs = new ByteContentStream(chunkBytes, chunkSize, textExtractor.getSourceFile(), charSet); - try { - ingester.ingest(this, bcs, chunkBytes.length); - } catch (Exception ex) { - throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", getChunkId()), ex); - } - } - -} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteContentStream.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteContentStream.java index d5a19712c0..c39e9b7bb5 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteContentStream.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ByteContentStream.java @@ -23,11 +23,9 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; -import java.nio.charset.Charset; - +import org.apache.solr.common.util.ContentStream; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; -import org.apache.solr.common.util.ContentStream; import org.sleuthkit.datamodel.AbstractContent; /** @@ -37,19 +35,17 @@ import org.sleuthkit.datamodel.AbstractContent; class ByteContentStream implements ContentStream { //input - private byte[] content; //extracted subcontent + private final byte[] content; //extracted subcontent private long contentSize; - private AbstractContent aContent; //origin - private Charset charset; //output byte stream charset of encoded strings + private final AbstractContent aContent; //origin - private InputStream stream; + private final InputStream stream; - private static Logger logger = Logger.getLogger(ByteContentStream.class.getName()); + private static final Logger logger = Logger.getLogger(ByteContentStream.class.getName()); - public ByteContentStream(byte[] content, long contentSize, AbstractContent aContent, Charset charset) { + public ByteContentStream(byte[] content, long contentSize, AbstractContent aContent) { this.content = content; this.aContent = aContent; - this.charset = charset; stream = new ByteArrayInputStream(content, 0, (int) contentSize); } @@ -63,7 +59,7 @@ class ByteContentStream implements ContentStream { @Override public String getContentType() { - return "text/plain;charset=" + charset.name(); //NON-NLS + return "text/plain;charset=" + Server.DEFAULT_INDEXED_TEXT_CHARSET.name(); //NON-NLS } @Override diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java index 27e9ccd637..6e8a57e258 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2012-2013 Basis Technology Corp. + * Copyright 2011-2016 Basis Technology Corp. 
* Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,15 +21,16 @@ package org.sleuthkit.autopsy.keywordsearch; import java.io.IOException; import java.io.InputStream; import java.io.Reader; -import java.nio.charset.Charset; +import java.io.StringReader; import java.util.Arrays; import java.util.List; -import java.util.Map; import java.util.logging.Level; +import net.htmlparser.jericho.Attributes; +import net.htmlparser.jericho.Renderer; +import net.htmlparser.jericho.Source; +import net.htmlparser.jericho.StartTag; +import net.htmlparser.jericho.StartTagType; import org.sleuthkit.autopsy.coreutils.Logger; -import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; -import org.sleuthkit.autopsy.ingest.IngestJobContext; -import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.ReadContentInputStream; @@ -38,19 +39,12 @@ import org.sleuthkit.datamodel.ReadContentInputStream; * divided into chunks and indexed with Solr. If HTML extraction succeeds, * chunks are indexed with Solr. */ -class HtmlTextExtractor implements TextExtractor { +class HtmlTextExtractor extends TextExtractorBase { private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName()); - private static Ingester ingester; - static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET; + static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; - private static final int SINGLE_READ_CHARS = 1024; - private static final int EXTRA_CHARS = 128; //for whitespace private static final int MAX_SIZE = 50000000; - //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM - private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS]; - private AbstractFile sourceFile; - private int numChunks = 0; static final List WEB_MIME_TYPES = Arrays.asList( "application/javascript", //NON-NLS @@ -59,154 +53,14 @@ class HtmlTextExtractor implements TextExtractor { "text/css", //NON-NLS "text/html", //NON-NLS NON-NLS "text/javascript" //NON-NLS - //"application/xml", - //"application/xml-dtd", ); HtmlTextExtractor() { - ingester = Server.getIngester(); } @Override - public boolean setScripts(List tags + scripts.append(tag.getElement().getContent()).append("\n"); + + } else if (tag.getName().equals("a")) { + //NON-NLS + numLinks++; + links.append(numLinks).append(") "); + links.append(tag.getTagContent()).append("\n"); + + } else if (tag.getName().equals("img")) { + //NON-NLS + numImages++; + images.append(numImages).append(") "); + images.append(tag.getTagContent()).append("\n"); + + } else if (tag.getTagType().equals(StartTagType.COMMENT)) { + numComments++; + comments.append(numComments).append(") "); + comments.append(tag.getTagContent()).append("\n"); + + } else { + // Make sure it has an attribute + Attributes atts = tag.getAttributes(); + if (atts != null && atts.length() > 0) { + numOthers++; + others.append(numOthers).append(") "); + others.append(tag.getName()).append(":"); + others.append(tag.getTagContent()).append("\n"); + + } + } + } + stringBuilder.append(text).append("\n\n"); + stringBuilder.append("----------NONVISIBLE TEXT----------\n\n"); //NON-NLS + if (numScripts > 0) { + stringBuilder.append("---Scripts---\n"); //NON-NLS + stringBuilder.append(scripts).append("\n"); + } + if (numLinks > 0) { + stringBuilder.append("---Links---\n"); //NON-NLS + stringBuilder.append(links).append("\n"); + } + if (numImages > 0) { + 
stringBuilder.append("---Images---\n"); //NON-NLS + stringBuilder.append(images).append("\n"); + } + if (numComments > 0) { + stringBuilder.append("---Comments---\n"); //NON-NLS + stringBuilder.append(comments).append("\n"); + } + if (numOthers > 0) { + stringBuilder.append("---Others---\n"); //NON-NLS + stringBuilder.append(others).append("\n"); + } + // All done, now make it a reader + return new StringReader(stringBuilder.toString()); + } catch (IOException ex) { + throw new Ingester.IngesterException("Error extracting HTML from content.", ex); + } + } + + @Override + Void newAppendixProvider() { + return null; + } + + InputStream getInputStream(AbstractFile sourceFile1) { + return new ReadContentInputStream(sourceFile1); + } + + @Override + boolean noExtractionOptionsAreEnabled() { + return false; } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java index 256d4508f2..3305a28a8d 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java @@ -22,13 +22,13 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.Reader; -import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; import java.util.logging.Level; import org.apache.solr.client.solrj.SolrServerException; -import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.util.ContentStream; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.TextUtil; @@ -42,7 +42,6 @@ import org.sleuthkit.datamodel.Directory; import org.sleuthkit.datamodel.File; import org.sleuthkit.datamodel.LayoutFile; import org.sleuthkit.datamodel.LocalFile; -import org.sleuthkit.datamodel.ReadContentInputStream; import org.sleuthkit.datamodel.SlackFile; import org.sleuthkit.datamodel.TskCoreException; @@ -54,13 +53,12 @@ class Ingester { private static final Logger logger = Logger.getLogger(Ingester.class.getName()); private volatile boolean uncommitedIngests = false; private final Server solrServer = KeywordSearch.getServer(); - private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV(); + private static final GetContentFieldsV getContentFieldsV = new GetContentFieldsV(); private static Ingester instance; //for ingesting chunk as SolrInputDocument (non-content-streaming, by-pass tika) //TODO use a streaming way to add content to /update handler private static final int MAX_DOC_CHUNK_SIZE = 1024 * 1024; - private static final String ENCODING = "UTF-8"; //NON-NLS private Ingester() { } @@ -84,60 +82,22 @@ class Ingester { } /** - * Sends a stream to Solr to have its content extracted and added to the - * index. commit() should be called once you're done ingesting files. + * Indexes the text chunk. * - * @param afscs File AbstractFileStringContentStream to ingest + * @param ingester An Ingester to do the indexing. + * @param chunkBytes The raw bytes of the text chunk. + * @param chunkSize The size of the text chunk in bytes. * - * @throws IngesterException if there was an error processing a specific - * file, but the Solr server is probably fine. 
+ * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException */ - void ingest(AbstractFileStringContentStream afscs) throws IngesterException { - Map params = getContentFields(afscs.getSourceContent()); - ingest(afscs, params, afscs.getSourceContent().getSize()); - } - - /** - * Sends a TextExtractor to Solr to have its content extracted and added to - * the index. commit() should be called once you're done ingesting files. - * FileExtract represents a parent of extracted file with actual content. - * The parent itself has no content, only meta data and is used to associate - * the extracted AbstractFileChunk - * - * @param fe TextExtractor to ingest - * - * @throws IngesterException if there was an error processing a specific - * file, but the Solr server is probably fine. - */ - void ingest(TextExtractor fe) throws IngesterException { - Map params = getContentFields(fe.getSourceFile()); - - params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(fe.getNumChunks())); - - ingest(new NullContentStream(fe.getSourceFile()), params, 0); - } - - /** - * Sends a AbstractFileChunk to Solr and its extracted content stream to be - * added to the index. commit() should be called once you're done ingesting - * files. AbstractFileChunk represents a file chunk and its chunk content. - * - * @param fec AbstractFileChunk to ingest - * @param size approx. size of the stream in bytes, used for timeout - * estimation - * - * @throws IngesterException if there was an error processing a specific - * file, but the Solr server is probably fine. - */ - void ingest(AbstractFileChunk fec, ByteContentStream bcs, int size) throws IngesterException { - AbstractContent sourceContent = bcs.getSourceContent(); - Map params = getContentFields(sourceContent); - - //overwrite id with the chunk id - params.put(Server.Schema.ID.toString(), - Server.getChunkIdString(sourceContent.getId(), fec.getChunkNumber())); - - ingest(bcs, params, size); + void indexChunk(AbstractFile chunkSource, byte[] chunkBytes, long chunkSize, String chunkID) throws IngesterException { + ByteContentStream bcs = new ByteContentStream(chunkBytes, chunkSize, chunkSource); + Map fields = getContentFields(chunkSource); + try { + ingest(bcs, fields, chunkBytes.length); + } catch (Exception ex) { + throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkID), ex); + } } /** @@ -153,12 +113,25 @@ class Ingester { * @throws IngesterException if there was an error processing a specific * file, but the Solr server is probably fine. */ - void ingest(AbstractFile file, boolean ingestContent) throws IngesterException { - if (ingestContent == false || file.isDir()) { - ingest(new NullContentStream(file), getContentFields(file), 0); - } else { - ingest(new FscContentStream(file), getContentFields(file), file.getSize()); - } + void indexMetaDataOnly(AbstractFile file) throws IngesterException { + ingest(new NullContentStream(file), getContentFields(file), 0); + } + /** + * Sends a TextExtractor to Solr to have its content extracted and added to + * the index. commit() should be called once you're done ingesting files. + * FileExtract represents a parent of extracted file with actual content. + * The parent itself has no content, only meta data and is used to associate + * the extracted AbstractFileChunk + * + * @param fe TextExtractor to ingest + * + * @throws IngesterException if there was an error processing a specific + * file, but the Solr server is probably fine. 
+ */ + void recordNumberOfChunks(AbstractFile file, int numChunks) throws IngesterException { + Map params = getContentFields(file); + params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks)); + ingest(new NullContentStream(file), params, 0); } /** @@ -168,14 +141,14 @@ class Ingester { * * @return the map */ - private Map getContentFields(AbstractContent fsc) { + Map getContentFields(AbstractContent fsc) { return fsc.accept(getContentFieldsV); } /** * Visitor used to create param list to send to SOLR index. */ - private class GetContentFieldsV extends ContentVisitor.Default> { + static private class GetContentFieldsV extends ContentVisitor.Default> { @Override protected Map defaultVisit(Content cntnt) { @@ -304,25 +277,19 @@ class Ingester { if (read != 0) { String s = ""; - try { - s = new String(docChunkContentBuf, 0, read, ENCODING); - // Sanitize by replacing non-UTF-8 characters with caret '^' before adding to index - char[] chars = null; - for (int i = 0; i < s.length(); i++) { - if (!TextUtil.isValidSolrUTF8(s.charAt(i))) { - // only convert string to char[] if there is a non-UTF8 character - if (chars == null) { - chars = s.toCharArray(); - } - chars[i] = '^'; + s = new String(docChunkContentBuf, 0, read, StandardCharsets.UTF_8); + char[] chars = null; + for (int i = 0; i < s.length(); i++) { + if (!TextUtil.isValidSolrUTF8(s.charAt(i))) { + // only convert string to char[] if there is a non-UTF8 character + if (chars == null) { + chars = s.toCharArray(); } + chars[i] = '^'; } - // check if the string was modified (i.e. there was a non-UTF8 character found) - if (chars != null) { - s = new String(chars); - } - } catch (UnsupportedEncodingException ex) { - logger.log(Level.SEVERE, "Unsupported encoding", ex); //NON-NLS + } + if (chars != null) { + s = new String(chars); } updateDoc.addField(Server.Schema.CONTENT.toString(), s); } else { @@ -380,48 +347,6 @@ class Ingester { } } - /** - * ContentStream to read() the data from a FsContent object - */ - private static class FscContentStream implements ContentStream { - - private AbstractFile f; - - FscContentStream(AbstractFile f) { - this.f = f; - } - - @Override - public String getName() { - return f.getName(); - } - - @Override - public String getSourceInfo() { - return NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getSrcInfo", f.getId()); - } - - @Override - public String getContentType() { - return null; - } - - @Override - public Long getSize() { - return f.getSize(); - } - - @Override - public InputStream getStream() throws IOException { - return new ReadContentInputStream(f); - } - - @Override - public Reader getReader() throws IOException { - throw new UnsupportedOperationException( - NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getReader")); - } - } /** * ContentStream associated with FsContent, but forced with no content diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java index c0ced07107..c3b997fa58 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java @@ -512,7 +512,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule { if (context.fileIngestIsCancelled()) { return; } - ingester.ingest(aFile, false); //meta-data only + ingester.indexMetaDataOnly(aFile); putIngestStatus(jobId, 
aFile.getId(), IngestStatus.METADATA_INGESTED); } catch (IngesterException ex) { putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING); @@ -539,7 +539,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule { if (context.fileIngestIsCancelled()) { return; } - ingester.ingest(aFile, false); //meta-data only + ingester.indexMetaDataOnly(aFile); putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED); } catch (IngesterException ex) { putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java index 3bbc97dcfc..93c6c786fa 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2014 Basis Technology Corp. + * Copyright 2011-2016 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,155 +20,119 @@ package org.sleuthkit.autopsy.keywordsearch; import java.io.IOException; import java.io.InputStream; -import java.nio.charset.Charset; +import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.logging.Level; import org.sleuthkit.autopsy.coreutils.Logger; +import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT; -import org.sleuthkit.autopsy.ingest.IngestJobContext; -import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; import org.sleuthkit.datamodel.AbstractFile; +import org.sleuthkit.datamodel.TskCoreException; +import org.sleuthkit.datamodel.TskException; /** - * Takes an AbstractFile, extract strings, converts into chunks (associated with - * the original source file) up to 1MB then and indexes chunks as text with Solr + * Takes an AbstractFile, extracts strings, converts into chunks (associated + * with the original source file) up to 1MB then and indexes chunks as text with + * Solr. */ -class StringsTextExtractor implements TextExtractor { +class StringsTextExtractor extends TextExtractorBase { - private static Ingester ingester; private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName()); private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L; - //private static final int BOM_LEN = 3; - private static final int BOM_LEN = 0; //disabled prepending of BOM - private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET; - private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2; - private AbstractFile sourceFile; - private int numChunks = 0; private final List