From 0068d3acfdefb46793d3d5a5cffe2ac73e161c44 Mon Sep 17 00:00:00 2001
From: "eugene.livis"
Date: Mon, 5 Jun 2023 17:23:56 -0400
Subject: [PATCH] First cut

---
 .../autopsy/keywordsearch/ExtractedText.java  | 270 ++++++++++++++++++
 .../keywordsearch/ExtractedTextViewer.java    | 122 +++++++-
 .../KeywordSearchIngestModule.java            |   4 +-
 .../{RawText.java => SolrIndexedText.java}    |  24 +-
 4 files changed, 393 insertions(+), 27 deletions(-)
 create mode 100755 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java
 rename KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/{RawText.java => SolrIndexedText.java} (92%)

diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java
new file mode 100755
index 0000000000..c12b34e93b
--- /dev/null
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java
@@ -0,0 +1,270 @@
+/*
+ * Autopsy Forensic Browser
+ *
+ * Copyright 2023 Basis Technology Corp.
+ * Contact: carrier <at> sleuthkit <dot> org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import com.google.common.io.CharSource;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.logging.Level;
+import org.openide.util.NbBundle;
+import org.sleuthkit.autopsy.coreutils.EscapeUtil;
+import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
+import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
+import org.sleuthkit.datamodel.AbstractFile;
+
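For orientation while reviewing, and not part of the patch itself: the new class plugs into the text content viewer as another IndexedText "source". The sketch below reconstructs that interface surface from the methods this class (and SolrIndexedText, further down) overrides; the real IndexedText interface in the codebase may differ in documentation, defaults, and ordering.

    // Sketch only: inferred from the @Override methods appearing in this patch.
    interface IndexedText {
        String getText();           // text of the current page/chunk
        String getAnchorPrefix();   // prefix for hit anchors; "" when not searchable
        boolean isSearchable();     // true when keyword-hit navigation is supported
        int getNumberHits();
        int getNumberPages();
        int getCurrentPage();
        boolean hasNextPage();
        int nextPage();
        boolean hasPreviousPage();
        int previousPage();
        boolean hasNextItem();
        int nextItem();
        boolean hasPreviousItem();
        int previousItem();
        int currentItem();
    }
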
+/**
+ * A "source" for the extracted content viewer that displays "raw" (not
+ * highlighted) text extracted from a file, rather than text retrieved from
+ * the Solr index.
+ */
+class ExtractedText implements IndexedText { // ELTODO
+
+    private int numPages = 0;
+    private int currentPage = 0;
+    private final AbstractFile abstractFile;
+    private final long objectId;
+    //keep the last extracted chunk cached
+    private String cachedString;
+    private int cachedChunk;
+    private Chunker chunker = null;
+    private static final Logger logger = Logger.getLogger(ExtractedText.class.getName());
+
+    /**
+     * Construct a new ExtractedText object for the given content and object id.
+     * This constructor needs both a content object and an object id because the
+     * ExtractedText implementation attempts to provide useful messages in the
+     * text content viewer for (a) the case where a file has not been indexed
+     * because known files are being skipped and (b) the case where the file
+     * content has not yet been indexed.
+     *
+     * @param file     Abstract file.
+     * @param objectId Either a file id or an artifact id.
+     */
+    ExtractedText(AbstractFile file, long objectId) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
+        this.abstractFile = file;
+        this.objectId = objectId;
+        this.currentPage = 0; // ELTODO
+        this.numPages = 1;
+        initialize();
+    }
+
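A minimal usage sketch, not part of the patch: how a caller might page through this source once it is constructed. The helper name and the two-chunk limit are invented for illustration; only methods defined in this class are used.

    // Hypothetical helper: pull the first two chunks of extracted text from a file.
    private static String readFirstTwoChunks(AbstractFile file)
            throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
        IndexedText source = new ExtractedText(file, file.getId());
        StringBuilder text = new StringBuilder(source.getText()); // extracts chunk 1 on demand
        if (source.hasNextPage()) {
            source.nextPage();
            text.append(source.getText());                        // extracts chunk 2
        }
        return text.toString();
    }
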
+    /**
+     * Return the ID that this object is associated with -- to help with caching
+     *
+     * @return
+     */
+    public long getObjectId() {
+        return this.objectId;
+    }
+
+    @Override
+    public int getCurrentPage() {
+        return this.currentPage;
+    }
+
+    @Override
+    public boolean hasNextPage() {
+        return true;
+    }
+
+    @Override
+    public boolean hasPreviousPage() {
+        return false;
+    }
+
+    @Override
+    public int nextPage() {
+        if (!hasNextPage()) {
+            throw new IllegalStateException(
+                    NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextPage.exception.msg"));
+        }
+        ++currentPage;
+        return currentPage;
+    }
+
+    @Override
+    public int previousPage() {
+        if (!hasPreviousPage()) {
+            throw new IllegalStateException(
+                    NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousPage.exception.msg"));
+        }
+        --currentPage;
+        return currentPage;
+    }
+
+    @Override
+    public boolean hasNextItem() {
+        throw new UnsupportedOperationException(
+                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasNextItem.exception.msg"));
+    }
+
+    @Override
+    public boolean hasPreviousItem() {
+        throw new UnsupportedOperationException(
+                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasPreviousItem.exception.msg"));
+    }
+
+    @Override
+    public int nextItem() {
+        throw new UnsupportedOperationException(
+                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextItem.exception.msg"));
+    }
+
+    @Override
+    public int previousItem() {
+        throw new UnsupportedOperationException(
+                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousItem.exception.msg"));
+    }
+
+    @Override
+    public int currentItem() {
+        throw new UnsupportedOperationException(
+                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.currentItem.exception.msg"));
+    }
+
+    @Override
+    public String getText() {
+        try {
+            return getContentText(currentPage + 1); // ELTODO
+        } catch (Exception ex) {
+            logger.log(Level.SEVERE, "Couldn't get extracted text", ex); //NON-NLS
+        }
+        return Bundle.IndexedText_errorMessage_errorGettingText();
+    }
+
+    @NbBundle.Messages({
+        "ExtractedText.FileText=File Text"})
+    @Override
+    public String toString() {
+        return Bundle.ExtractedText_FileText();
+    }
+
+    @Override
+    public boolean isSearchable() {
+        return false;
+    }
+
+    @Override
+    public String getAnchorPrefix() {
+        return "";
+    }
+
+    @Override
+    public int getNumberHits() {
+        return 0;
+    }
+
+    @Override
+    public int getNumberPages() {
+        return numPages;
+    }
+
+    /**
+     * Set the internal values, such as pages
+     */
+    private void initialize() throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
+        TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null);
+
+        Map<String, String> extractedMetadata = new HashMap<>();
+        Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata);
+
+        //Get a reader for the content of the given source
+        BufferedReader reader = new BufferedReader(sourceReader);
+        chunker = new Chunker(reader);
+    }
+
+    /**
+     * Extract text from abstractFile
+     *
+     * @param currentPage currently used page
+     *
+     * @return the extracted text
+     */
+    private String getContentText(int currentPage) throws TextExtractor.InitReaderException, IOException, Exception {
+
+        // ELTODO
+        //check if cached
+        if (cachedString != null) {
+            if (cachedChunk == currentPage) {
+                return cachedString;
+            }
+        }
+
+        String indexedText;
+        if (chunker.hasNext()) {
+            Chunker.Chunk chunk = chunker.next();
+            chunk.setChunkId(currentPage);
+
+            if (chunker.hasException()) {
+                logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException());
+                throw chunker.getException();
+            }
+
+            indexedText = chunk.toString();
+        } else {
+            return Bundle.IndexedText_errorMessage_errorGettingText();
+        }
+
+        cachedString = EscapeUtil.escapeHtml(indexedText).trim();
+        StringBuilder sb = new StringBuilder(cachedString.length() + 20);
+        sb.append("<pre>").append(cachedString).append("</pre>"); //NON-NLS
+        cachedString = sb.toString();
+        cachedChunk = currentPage;
+
+        return cachedString;
+    }
+
+    private Reader getTikaOrTextExtractor(TextExtractor extractor, AbstractFile aFile,
+            Map<String, String> extractedMetadata) throws TextExtractor.InitReaderException {
+
+        Reader fileText = extractor.getReader();
+        Reader finalReader;
+        try {
+            Map<String, String> metadata = extractor.getMetadata();
+            if (!metadata.isEmpty()) {
+                // Creating the metadata artifact here causes occasional problems
+                // when indexing the text, so we save the metadata map to
+                // use after this method is complete.
+                extractedMetadata.putAll(metadata);
+            }
+            CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
+            //Append the metadata to end of the file text
+            finalReader = CharSource.concat(new CharSource() {
+                //Wrap fileText reader for concatenation
+                @Override
+                public Reader openStream() throws IOException {
+                    return fileText;
+                }
+            }, formattedMetadata).openStream();
+        } catch (IOException ex) {
+            logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
+                    aFile.getName(), aFile.getId()), ex);
+            //Just send file text.
+            finalReader = fileText;
+        }
+        //divide into chunks and index
+        return finalReader;
+
+    }
+
+}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java
index aac5757fc0..a33105e4fc 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java
@@ -26,7 +26,9 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.logging.Level;
+import org.apache.tika.mime.MimeTypes;
 import org.openide.nodes.Node;
+import org.openide.util.Exceptions;
 import org.openide.util.Lookup;
 import org.openide.util.NbBundle;
 import org.openide.util.lookup.ServiceProvider;
@@ -34,7 +36,11 @@ import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
 import org.sleuthkit.autopsy.corecomponentinterfaces.TextViewer;
 import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.autopsy.ingest.IngestModule;
 import org.sleuthkit.autopsy.keywordsearch.AdHocSearchChildFactory.AdHocQueryResult;
+import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
+import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.Account;
 import org.sleuthkit.datamodel.BlackboardArtifact;
@@ -45,6 +51,7 @@ import static org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASS
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.Report;
 import org.sleuthkit.datamodel.TskCoreException;
+import org.sleuthkit.datamodel.TskData;
 
 /**
  * A text viewer that displays the indexed text associated with a file or an
@@ -61,14 +68,20 @@ public class ExtractedTextViewer implements TextViewer {
     private ExtractedContentPanel panel;
     private volatile Node currentNode = null;
     private IndexedText currentSource = null;
+    private FileTypeDetector fileTypeDetector = null;
 
     /**
      * Constructs a text viewer that displays the indexed text associated with a
     * file or an artifact, possibly marked up with HTML to highlight keyword
-     * hits.
+     * hits. If the text for the Content has not been fully indexed by Solr,
+     * the viewer attempts to extract the text using one of the text extractors.
      */
     public ExtractedTextViewer() {
-        // This constructor is intentionally empty.
+        try {
+            fileTypeDetector = new FileTypeDetector();
+        } catch (FileTypeDetector.FileTypeDetectorInitException ex) {
+            logger.log(Level.SEVERE, "Failed to initialize FileTypeDetector", ex); //NON-NLS
+        }
     }
 
     /**
@@ -155,8 +168,23 @@
          */
         IndexedText rawContentText = null;
         if (file != null) {
-            rawContentText = new RawText(file, file.getId());
-            sources.add(rawContentText);
+
+            // see if Solr has fully indexed this file
+            if (solrHasFullyIndexedContent(file.getId())) {
+                rawContentText = new SolrIndexedText(file, file.getId());
+                sources.add(rawContentText);
+            }
+
+            // Solr does not have fully indexed content.
+            // see if it's a file type for which we can extract text
+            if (ableToExtractTextFromFile(file)) {
+                try {
+                    rawContentText = new ExtractedText(file, file.getId());
+                    sources.add(rawContentText);
+                } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
+                    // do nothing
+                }
+            }
         }
 
         /*
@@ -164,7 +192,7 @@
          * associated with the node.
          */
         if (report != null) {
-            rawContentText = new RawText(report, report.getId());
+            rawContentText = new SolrIndexedText(report, report.getId());
             sources.add(rawContentText);
         }
 
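Taken together, the getSources() changes above implement a simple fallback: prefer text that Solr has already indexed, otherwise try on-the-fly extraction. The sketch below shows only that preference order as a standalone helper; unlike the patch, it returns a single source instead of appending each match to the sources list, and the helper name is invented.

    // Illustrative only: preference order when picking a text source for a file.
    private IndexedText pickFileSource(AbstractFile file) {
        if (solrHasFullyIndexedContent(file.getId())) {
            return new SolrIndexedText(file, file.getId());   // fully indexed: read it back from Solr
        }
        if (ableToExtractTextFromFile(file)) {
            try {
                return new ExtractedText(file, file.getId()); // not indexed: extract text on demand
            } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
                // fall through: no extractor available for this file type
            }
        }
        return null; // no text source available
    }
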
@@ -222,12 +250,11 @@
             if (attribute != null) {
                 long artifactId = attribute.getValueLong();
                 BlackboardArtifact associatedArtifact = Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboardArtifact(artifactId);
-                rawArtifactText = new RawText(associatedArtifact, associatedArtifact.getArtifactID());
-
+                rawArtifactText = new SolrIndexedText(associatedArtifact, associatedArtifact.getArtifactID());
             }
         } else {
-            rawArtifactText = new RawText(artifact, artifact.getArtifactID());
+            rawArtifactText = new SolrIndexedText(artifact, artifact.getArtifactID());
         }
     }
     return rawArtifactText;
@@ -340,8 +367,18 @@
          * data source instead of a file.
          */
         AbstractFile file = node.getLookup().lookup(AbstractFile.class);
-        if (file != null && solrHasContent(file.getId())) {
-            return true;
+        if (file != null) {
+
+            // see if Solr has fully indexed this file
+            if (solrHasFullyIndexedContent(file.getId())) {
+                return true;
+            }
+
+            // Solr does not have fully indexed content.
+            // see if it's a file type for which we can extract text
+            if (ableToExtractTextFromFile(file)) {
+                return true;
+            }
         }
 
         /*
@@ -351,7 +388,7 @@
          * indexed text for the artifact.
          */
         if (artifact != null) {
-            return solrHasContent(artifact.getArtifactID());
+            return solrHasFullyIndexedContent(artifact.getArtifactID());
         }
 
         /*
@@ -361,7 +398,7 @@
          */
         Report report = node.getLookup().lookup(Report.class);
         if (report != null) {
-            return solrHasContent(report.getId());
+            return solrHasFullyIndexedContent(report.getId());
         }
 
         /*
@@ -397,12 +434,14 @@
      *
      * @return true if Solr has content, else false
      */
-    private boolean solrHasContent(Long objectId) {
+    private boolean solrHasFullyIndexedContent(Long objectId) {
         final Server solrServer = KeywordSearch.getServer();
         if (solrServer.coreIsOpen() == false) {
             return false;
         }
 
+        // ELTODO get total number of chunks in the file, and verify that
+        // all of the chunks have been indexed.
         try {
            return solrServer.queryIsIndexed(objectId);
         } catch (NoOpenCoreException | KeywordSearchModuleException ex) {
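The ELTODO in this hunk asks for a stronger check than queryIsIndexed(). One possible shape for it, shown only as a sketch: it reuses Server.queryNumFileChunks(), which the existing RawText/SolrIndexedText code already calls for paging, and assumes a hypothetical expectedChunkCount supplied by the caller (for example, recorded when the file was indexed).

    // Hypothetical sketch of the ELTODO check: are all expected chunks in the index?
    private boolean solrHasAllChunks(long objectId, int expectedChunkCount) {
        final Server solrServer = KeywordSearch.getServer();
        if (solrServer.coreIsOpen() == false) {
            return false;
        }
        try {
            return solrServer.queryNumFileChunks(objectId) >= expectedChunkCount;
        } catch (NoOpenCoreException | KeywordSearchModuleException ex) {
            logger.log(Level.SEVERE, "Error querying Solr for chunk count, objectId = " + objectId, ex); //NON-NLS
            return false;
        }
    }
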
@@ -411,6 +450,63 @@
         }
     }
 
+    /**
+     * Check if we can extract text for this file type.
+     *
+     * @param file Abstract File
+     *
+     * @return true if text can be extracted from file, else false
+     */
+    private boolean ableToExtractTextFromFile(AbstractFile file) {
+
+        TskData.TSK_DB_FILES_TYPE_ENUM fileType = file.getType();
+
+        if (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
+            return false;
+        }
+
+        /**
+         * Unicode strings are extracted from unallocated and unused blocks and
+         * carved text files at ingest time, because they may contain multiple
+         * encodings which can cause text to be missed by the more specialized
+         * text extractors. Skip them here.
+         */
+        if ((fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
+                || fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
+                || (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED))) {
+            return false;
+        }
+
+        final long size = file.getSize();
+        //if not to index content, or a dir, or 0 content, index meta data only
+
+        if (file.isDir() || size == 0) {
+            return false;
+        }
+
+        // ELTODO do we need to skip text files here? probably not.
+        if (file.getNameExtension().equalsIgnoreCase("txt")) {
+            return false;
+        }
+
+        // ELTODO do we need to skip known files here? probably not.
+        if (KeywordSearchSettings.getSkipKnown() && file.getKnown().equals(TskData.FileKnown.KNOWN)) {
+            return false;
+        }
+
+        String mimeType = fileTypeDetector.getMIMEType(file).trim().toLowerCase();
+
+        if (KeywordSearchIngestModule.ARCHIVE_MIME_TYPES.contains(mimeType)) {
+            return false;
+        }
+
+        if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
+            return false;
+        }
+
+        return true;
+    }
+
     /**
      * Listener to select the next match found in the text
      */
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
index e3f9582fdf..782f966616 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@@ -96,7 +96,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * generally text extractors should ignore archives and let unpacking
      * modules take care of them
      */
-    private static final List<String> ARCHIVE_MIME_TYPES
+    static final List<String> ARCHIVE_MIME_TYPES
             = ImmutableList.of(
                     //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                     "application/x-7z-compressed", //NON-NLS
@@ -683,7 +683,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     @NbBundle.Messages({
         "KeywordSearchIngestModule.metadataTitle=METADATA"
     })
-    private CharSource getMetaDataCharSource(Map<String, String> metadata) {
+    static CharSource getMetaDataCharSource(Map<String, String> metadata) {
         return CharSource.wrap(new StringBuilder(
                 String.format("\n\n------------------------------%s------------------------------\n\n",
                         Bundle.KeywordSearchIngestModule_metadataTitle()))
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RawText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java
similarity index 92%
rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RawText.java
rename to KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java
index 789de3fd50..6745e0c5d7 100644
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RawText.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java
@@ -1,7 +1,7 @@
 /*
  * Autopsy Forensic Browser
  *
- * Copyright 2011-2018 Basis Technology Corp.
+ * Copyright 2011-2023 Basis Technology Corp.
  * Contact: carrier <at> sleuthkit <dot> org
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -30,9 +30,9 @@ import org.sleuthkit.datamodel.TskData;
 
 /**
  * A "source" for the extracted content viewer that displays "raw" (not
- * highlighted) indexed text for a file or an artifact.
+ * highlighted) Solr indexed text for a file or an artifact.
  */
-class RawText implements IndexedText {
+class SolrIndexedText implements IndexedText {
 
     private int numPages = 0;
     private int currentPage = 0;
@@ -43,12 +43,12 @@ class RawText implements IndexedText {
     //keep last content cached
     private String cachedString;
     private int cachedChunk;
-    private static final Logger logger = Logger.getLogger(RawText.class.getName());
+    private static final Logger logger = Logger.getLogger(SolrIndexedText.class.getName());
 
     /**
-     * Construct a new RawText object for the given content and object id. This
+     * Construct a new SolrIndexedText object for the given content and object id. This
      * constructor needs both a content object and an object id because the
-     * RawText implementation attempts to provide useful messages in the text
+     * SolrIndexedText implementation attempts to provide useful messages in the text
      * content viewer for (a) the case where a file has not been indexed because
      * known files are being skipped and (b) the case where the file content has
      * not yet been indexed.
@@ -56,14 +56,14 @@ class RawText implements IndexedText {
      * @param content  Used to get access to file names and "known" status.
      * @param objectId Either a file id or an artifact id.
      */
-    RawText(Content content, long objectId) {
+    SolrIndexedText(Content content, long objectId) {
         this.content = content;
         this.blackboardArtifact = null;
         this.objectId = objectId;
         initialize();
     }
 
-    RawText(BlackboardArtifact bba, long objectId) {
+    SolrIndexedText(BlackboardArtifact bba, long objectId) {
         this.content = null;
         this.blackboardArtifact = bba;
         this.objectId = objectId;
@@ -159,14 +159,14 @@ class RawText implements IndexedText {
     }
 
     @NbBundle.Messages({
-        "RawText.FileText=File Text",
-        "RawText.ResultText=Result Text"})
+        "SolrIndexedText.FileText=File Text",
+        "SolrIndexedText.ResultText=Result Text"})
     @Override
     public String toString() {
         if (null != content) {
-            return Bundle.RawText_FileText();
+            return Bundle.SolrIndexedText_FileText();
         } else {
-            return Bundle.RawText_ResultText();
+            return Bundle.SolrIndexedText_ResultText();
         }
     }