First cut

Commit 0068d3acfd by eugene.livis, 2023-06-05 17:23:56 -04:00
Parent: 99e08e8dbe
4 changed files with 393 additions and 27 deletions

ExtractedText.java (new file)

@@ -0,0 +1,270 @@
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2023 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import com.google.common.io.CharSource;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.datamodel.AbstractFile;

/**
 * A "source" for the extracted content viewer that displays "raw" (not
 * highlighted) text extracted on the fly from a file, as opposed to text
 * read back from the Solr index.
 */
class ExtractedText implements IndexedText { // ELTODO

    private int numPages = 0;
    private int currentPage = 0;
    private final AbstractFile abstractFile;
    private final long objectId;
    // keep the last extracted chunk cached
    private String cachedString;
    private int cachedChunk;
    private Chunker chunker = null;
    private static final Logger logger = Logger.getLogger(ExtractedText.class.getName());

    /**
     * Construct a new ExtractedText object for the given content and object
     * id. This constructor needs both a content object and an object id
     * because the ExtractedText implementation attempts to provide useful
     * messages in the text content viewer for (a) the case where a file has
     * not been indexed because known files are being skipped and (b) the case
     * where the file content has not yet been indexed.
     *
     * @param file     Abstract file.
     * @param objectId Either a file id or an artifact id.
     */
    ExtractedText(AbstractFile file, long objectId) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
        this.abstractFile = file;
        this.objectId = objectId;
        this.currentPage = 0; // ELTODO
        this.numPages = 1;
        initialize();
    }

    /**
     * Return the ID that this object is associated with -- to help with
     * caching.
     *
     * @return The object id (either a file id or an artifact id).
     */
    public long getObjectId() {
        return this.objectId;
    }

    @Override
    public int getCurrentPage() {
        return this.currentPage;
    }

    @Override
    public boolean hasNextPage() {
        return true;
    }

    @Override
    public boolean hasPreviousPage() {
        return false;
    }

    @Override
    public int nextPage() {
        if (!hasNextPage()) {
            throw new IllegalStateException(
                    NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextPage.exception.msg"));
        }
        ++currentPage;
        return currentPage;
    }

    @Override
    public int previousPage() {
        if (!hasPreviousPage()) {
            throw new IllegalStateException(
                    NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousPage.exception.msg"));
        }
        --currentPage;
        return currentPage;
    }

    @Override
    public boolean hasNextItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasNextItem.exception.msg"));
    }

    @Override
    public boolean hasPreviousItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasPreviousItem.exception.msg"));
    }

    @Override
    public int nextItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextItem.exception.msg"));
    }

    @Override
    public int previousItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousItem.exception.msg"));
    }

    @Override
    public int currentItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.currentItem.exception.msg"));
    }

    @Override
    public String getText() {
        try {
            return getContentText(currentPage + 1); // ELTODO
        } catch (Exception ex) {
            logger.log(Level.SEVERE, "Couldn't get extracted text", ex); //NON-NLS
        }
        return Bundle.IndexedText_errorMessage_errorGettingText();
    }

    @NbBundle.Messages({
        "ExtractedText.FileText=File Text"})
    @Override
    public String toString() {
        return Bundle.ExtractedText_FileText();
    }

    @Override
    public boolean isSearchable() {
        return false;
    }

    @Override
    public String getAnchorPrefix() {
        return "";
    }

    @Override
    public int getNumberHits() {
        return 0;
    }

    @Override
    public int getNumberPages() {
        return numPages;
    }

    /**
     * Set the internal values, such as the text extractor and the chunker
     * for the file.
     */
    private void initialize() throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
        TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null);
        Map<String, String> extractedMetadata = new HashMap<>();
        Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata);

        // get a reader for the content of the given source
        BufferedReader reader = new BufferedReader(sourceReader);
        this.chunker = new Chunker(reader);
    }

    /**
     * Extract a page (chunk) of text from the abstract file.
     *
     * @param currentPage the chunk number to extract
     *
     * @return the extracted text
     */
    private String getContentText(int currentPage) throws TextExtractor.InitReaderException, IOException, Exception {
        // ELTODO
        // check if the chunk is cached
        if (cachedString != null && cachedChunk == currentPage) {
            return cachedString;
        }

        String indexedText;
        if (chunker.hasNext()) {
            Chunker.Chunk chunk = chunker.next();
            chunk.setChunkId(currentPage);

            if (chunker.hasException()) {
                logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException());
                throw chunker.getException();
            }
            indexedText = chunk.toString();
        } else {
            return Bundle.IndexedText_errorMessage_errorGettingText();
        }

        cachedString = EscapeUtil.escapeHtml(indexedText).trim();
        StringBuilder sb = new StringBuilder(cachedString.length() + 20);
        sb.append("<pre>").append(cachedString).append("</pre>"); //NON-NLS
        cachedString = sb.toString();
        cachedChunk = currentPage;
        return cachedString;
    }

    private Reader getTikaOrTextExtractor(TextExtractor extractor, AbstractFile aFile,
            Map<String, String> extractedMetadata) throws TextExtractor.InitReaderException {

        Reader fileText = extractor.getReader();
        Reader finalReader;
        try {
            Map<String, String> metadata = extractor.getMetadata();
            if (!metadata.isEmpty()) {
                // Creating the metadata artifact here causes occasional problems
                // when indexing the text, so we save the metadata map to
                // use after this method is complete.
                extractedMetadata.putAll(metadata);
            }
            CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
            // append the metadata to the end of the file text
            finalReader = CharSource.concat(new CharSource() {
                // wrap the fileText reader for concatenation
                @Override
                public Reader openStream() throws IOException {
                    return fileText;
                }
            }, formattedMetadata).openStream();
        } catch (IOException ex) {
            logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
                    aFile.getName(), aFile.getId()), ex);
            // just send the file text
            finalReader = fileText;
        }
        // the caller divides this reader's content into chunks
        return finalReader;
    }
}
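
For orientation, here is a minimal sketch (not part of the commit) of how a caller such as the text viewer's panel might drive ExtractedText: construct it, render the first chunk, then advance to the next one. The helper name readFirstTwoChunks is hypothetical; only the ExtractedText API above is assumed.

    // Sketch only: paging through an ExtractedText source. getText() extracts
    // and caches chunk currentPage + 1, and nextPage() advances the counter.
    static String readFirstTwoChunks(AbstractFile file) {
        try {
            ExtractedText source = new ExtractedText(file, file.getId());
            StringBuilder text = new StringBuilder(source.getText()); // chunk 1
            if (source.hasNextPage()) {
                source.nextPage();                  // advance to chunk 2
                text.append(source.getText());
            }
            return text.toString();
        } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
            return Bundle.IndexedText_errorMessage_errorGettingText();
        }
    }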

ExtractedTextViewer.java

@@ -26,7 +26,9 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.logging.Level;
+import org.apache.tika.mime.MimeTypes;
 import org.openide.nodes.Node;
+import org.openide.util.Exceptions;
 import org.openide.util.Lookup;
 import org.openide.util.NbBundle;
 import org.openide.util.lookup.ServiceProvider;
@@ -34,7 +36,11 @@ import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
 import org.sleuthkit.autopsy.corecomponentinterfaces.TextViewer;
 import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.autopsy.ingest.IngestModule;
 import org.sleuthkit.autopsy.keywordsearch.AdHocSearchChildFactory.AdHocQueryResult;
+import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
+import org.sleuthkit.autopsy.textextractors.TextExtractor;
+import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.Account;
 import org.sleuthkit.datamodel.BlackboardArtifact;
@@ -45,6 +51,7 @@ import static org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT;
 import org.sleuthkit.datamodel.Content;
 import org.sleuthkit.datamodel.Report;
 import org.sleuthkit.datamodel.TskCoreException;
+import org.sleuthkit.datamodel.TskData;

 /**
  * A text viewer that displays the indexed text associated with a file or an
@@ -61,14 +68,20 @@ public class ExtractedTextViewer implements TextViewer {
     private ExtractedContentPanel panel;
     private volatile Node currentNode = null;
     private IndexedText currentSource = null;
+    private FileTypeDetector fileTypeDetector = null;

     /**
      * Constructs a text viewer that displays the indexed text associated with a
      * file or an artifact, possibly marked up with HTML to highlight keyword
-     * hits.
+     * hits. If text for the Content has not been fully indexed by Solr, then
+     * attempt to extract text using one of the text extractors.
      */
     public ExtractedTextViewer() {
-        // This constructor is intentionally empty.
+        try {
+            fileTypeDetector = new FileTypeDetector();
+        } catch (FileTypeDetector.FileTypeDetectorInitException ex) {
+            logger.log(Level.SEVERE, "Failed to initialize FileTypeDetector", ex); //NON-NLS
+        }
     }
@@ -155,16 +168,31 @@ public class ExtractedTextViewer implements TextViewer {
          */
         IndexedText rawContentText = null;
         if (file != null) {
-            rawContentText = new RawText(file, file.getId());
-            sources.add(rawContentText);
+            // see if Solr has fully indexed this file
+            if (solrHasFullyIndexedContent(file.getId())) {
+                rawContentText = new SolrIndexedText(file, file.getId());
+                sources.add(rawContentText);
+            }
+
+            // Solr does not have fully indexed content;
+            // see if it's a file type for which we can extract text
+            if (ableToExtractTextFromFile(file)) {
+                try {
+                    rawContentText = new ExtractedText(file, file.getId());
+                    sources.add(rawContentText);
+                } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
+                    // do nothing
+                }
+            }
         }

         /*
          * Add the "raw" (not highlighted) text, if any, for any report
          * associated with the node.
          */
         if (report != null) {
-            rawContentText = new RawText(report, report.getId());
+            rawContentText = new SolrIndexedText(report, report.getId());
             sources.add(rawContentText);
         }
@@ -222,12 +250,11 @@ public class ExtractedTextViewer implements TextViewer {
                 if (attribute != null) {
                     long artifactId = attribute.getValueLong();
                     BlackboardArtifact associatedArtifact = Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboardArtifact(artifactId);
-                    rawArtifactText = new RawText(associatedArtifact, associatedArtifact.getArtifactID());
+                    rawArtifactText = new SolrIndexedText(associatedArtifact, associatedArtifact.getArtifactID());
                 }
             } else {
-                rawArtifactText = new RawText(artifact, artifact.getArtifactID());
+                rawArtifactText = new SolrIndexedText(artifact, artifact.getArtifactID());
             }
         }
         return rawArtifactText;
@@ -340,10 +367,20 @@ public class ExtractedTextViewer implements TextViewer {
          * data source instead of a file.
          */
         AbstractFile file = node.getLookup().lookup(AbstractFile.class);
-        if (file != null && solrHasContent(file.getId())) {
-            return true;
+        if (file != null) {
+            // see if Solr has fully indexed this file
+            if (solrHasFullyIndexedContent(file.getId())) {
+                return true;
+            }
+
+            // Solr does not have fully indexed content;
+            // see if it's a file type for which we can extract text
+            if (ableToExtractTextFromFile(file)) {
+                return true;
+            }
         }

         /*
          * If the lookup of the node contains an artifact that is neither a
          * keyword hit artifact nor a credit card account artifact, and the
@@ -351,7 +388,7 @@ public class ExtractedTextViewer implements TextViewer {
          * indexed text for the artifact.
          */
         if (artifact != null) {
-            return solrHasContent(artifact.getArtifactID());
+            return solrHasFullyIndexedContent(artifact.getArtifactID());
         }

         /*
@@ -361,7 +398,7 @@ public class ExtractedTextViewer implements TextViewer {
          */
         Report report = node.getLookup().lookup(Report.class);
         if (report != null) {
-            return solrHasContent(report.getId());
+            return solrHasFullyIndexedContent(report.getId());
         }

         /*
@@ -397,12 +434,14 @@ public class ExtractedTextViewer implements TextViewer {
      *
      * @return true if Solr has content, else false
      */
-    private boolean solrHasContent(Long objectId) {
+    private boolean solrHasFullyIndexedContent(Long objectId) {
         final Server solrServer = KeywordSearch.getServer();
         if (solrServer.coreIsOpen() == false) {
             return false;
         }

+        // ELTODO get the total number of chunks in the file, and verify that
+        // all of the chunks have been indexed.
         try {
             return solrServer.queryIsIndexed(objectId);
         } catch (NoOpenCoreException | KeywordSearchModuleException ex) {
@@ -411,6 +450,63 @@ public class ExtractedTextViewer implements TextViewer {
         }
     }

+    /**
+     * Check whether we can extract text for this file type.
+     *
+     * @param file Abstract file.
+     *
+     * @return true if text can be extracted from the file, else false
+     */
+    private boolean ableToExtractTextFromFile(AbstractFile file) {
+        TskData.TSK_DB_FILES_TYPE_ENUM fileType = file.getType();
+        if (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
+            return false;
+        }
+        /*
+         * Unallocated and unused blocks, as well as carved files, may contain
+         * multiple encodings, so during ingest they get Unicode string
+         * extraction rather than one of the more specialized text extractors.
+         * Skip them here.
+         */
+        if ((fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
+                || fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
+                || (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED))) {
+            return false;
+        }
+        // skip directories and zero-length files
+        final long size = file.getSize();
+        if (file.isDir() || size == 0) {
+            return false;
+        }
+        // ELTODO do we need to skip text files here? probably not.
+        if (file.getNameExtension().equalsIgnoreCase("txt")) {
+            return false;
+        }
+        // ELTODO do we need to skip known files here? probably not.
+        if (KeywordSearchSettings.getSkipKnown() && file.getKnown().equals(TskData.FileKnown.KNOWN)) {
+            return false;
+        }
+        String mimeType = fileTypeDetector.getMIMEType(file).trim().toLowerCase();
+        if (KeywordSearchIngestModule.ARCHIVE_MIME_TYPES.contains(mimeType)) {
+            return false;
+        }
+        if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
+            return false;
+        }
+        return true;
+    }

     /**
      * Listener to select the next match found in the text
      */
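
The ELTODO inside solrHasFullyIndexedContent() flags the known gap: queryIsIndexed() only proves that some document with the given id exists in the index, not that every chunk made it in. One possible shape for the stricter check is sketched below. It assumes a per-file chunk count query on Server (Autopsy's Server class does expose queryNumFileChunks()); the expectedChunkCount parameter is hypothetical, since deriving it is exactly what the ELTODO leaves open.

    // Sketch only: a stricter "fully indexed" test for the ELTODO above.
    // expectedChunkCount is a hypothetical input -- computing it (e.g. from
    // the file size and the chunker's chunk size) is the unresolved part.
    private boolean solrHasAllChunks(long objectId, int expectedChunkCount) {
        final Server solrServer = KeywordSearch.getServer();
        if (solrServer.coreIsOpen() == false) {
            return false;
        }
        try {
            int indexedChunks = solrServer.queryNumFileChunks(objectId);
            return indexedChunks > 0 && indexedChunks >= expectedChunkCount;
        } catch (NoOpenCoreException | KeywordSearchModuleException ex) {
            logger.log(Level.SEVERE, "Error querying chunk count for object " + objectId, ex); //NON-NLS
            return false;
        }
    }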

KeywordSearchIngestModule.java

@@ -96,7 +96,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
      * generally text extractors should ignore archives and let unpacking
      * modules take care of them
      */
-    private static final List<String> ARCHIVE_MIME_TYPES
+    static final List<String> ARCHIVE_MIME_TYPES
             = ImmutableList.of(
                     //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                     "application/x-7z-compressed", //NON-NLS
@@ -683,7 +683,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
     @NbBundle.Messages({
         "KeywordSearchIngestModule.metadataTitle=METADATA"
     })
-    private CharSource getMetaDataCharSource(Map<String, String> metadata) {
+    static CharSource getMetaDataCharSource(Map<String, String> metadata) {
         return CharSource.wrap(new StringBuilder(
                 String.format("\n\n------------------------------%s------------------------------\n\n",
                         Bundle.KeywordSearchIngestModule_metadataTitle()))
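
Widening these two members from private to package-private is what lets the new code above reuse them: ableToExtractTextFromFile() consults KeywordSearchIngestModule.ARCHIVE_MIME_TYPES, and ExtractedText.getTikaOrTextExtractor() calls getMetaDataCharSource() to append a metadata section to the extracted text. A quick illustration, with invented metadata values:

    // Sketch only: calling the now package-visible helper from another class
    // in org.sleuthkit.autopsy.keywordsearch. The map contents are invented.
    Map<String, String> metadata = new HashMap<>();
    metadata.put("Author", "jdoe"); // hypothetical value
    CharSource header = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
    // the returned source begins with the banner shown above:
    //
    // ------------------------------METADATA------------------------------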

RawText.java → SolrIndexedText.java

@@ -1,7 +1,7 @@
 /*
  * Autopsy Forensic Browser
  *
- * Copyright 2011-2018 Basis Technology Corp.
+ * Copyright 2011-2023 Basis Technology Corp.
  * Contact: carrier <at> sleuthkit <dot> org
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -30,9 +30,9 @@ import org.sleuthkit.datamodel.TskData;
 /**
  * A "source" for the extracted content viewer that displays "raw" (not
- * highlighted) indexed text for a file or an artifact.
+ * highlighted) Solr indexed text for a file or an artifact.
  */
-class RawText implements IndexedText {
+class SolrIndexedText implements IndexedText {

     private int numPages = 0;
     private int currentPage = 0;
@@ -43,12 +43,12 @@ class RawText implements IndexedText {
     //keep last content cached
     private String cachedString;
     private int cachedChunk;
-    private static final Logger logger = Logger.getLogger(RawText.class.getName());
+    private static final Logger logger = Logger.getLogger(SolrIndexedText.class.getName());

     /**
-     * Construct a new RawText object for the given content and object id. This
+     * Construct a new SolrIndexedText object for the given content and object id. This
      * constructor needs both a content object and an object id because the
-     * RawText implementation attempts to provide useful messages in the text
+     * SolrIndexedText implementation attempts to provide useful messages in the text
      * content viewer for (a) the case where a file has not been indexed because
      * known files are being skipped and (b) the case where the file content has
      * not yet been indexed.
@@ -56,14 +56,14 @@ class RawText implements IndexedText {
      * @param content  Used to get access to file names and "known" status.
      * @param objectId Either a file id or an artifact id.
      */
-    RawText(Content content, long objectId) {
+    SolrIndexedText(Content content, long objectId) {
         this.content = content;
         this.blackboardArtifact = null;
         this.objectId = objectId;
         initialize();
     }

-    RawText(BlackboardArtifact bba, long objectId) {
+    SolrIndexedText(BlackboardArtifact bba, long objectId) {
         this.content = null;
         this.blackboardArtifact = bba;
         this.objectId = objectId;
@@ -159,14 +159,14 @@ class RawText implements IndexedText {
     }

     @NbBundle.Messages({
-        "RawText.FileText=File Text",
-        "RawText.ResultText=Result Text"})
+        "SolrIndexedText.FileText=File Text",
+        "SolrIndexedText.ResultText=Result Text"})
     @Override
     public String toString() {
         if (null != content) {
-            return Bundle.RawText_FileText();
+            return Bundle.SolrIndexedText_FileText();
         } else {
-            return Bundle.RawText_ResultText();
+            return Bundle.SolrIndexedText_ResultText();
         }
     }