mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-06 21:00:22 +00:00
First cut
This commit is contained in:
parent
99e08e8dbe
commit
0068d3acfd
270
KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java
Executable file
270
KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java
Executable file
@ -0,0 +1,270 @@
|
|||||||
|
/*
|
||||||
|
* Autopsy Forensic Browser
|
||||||
|
*
|
||||||
|
* Copyright 2023 Basis Technology Corp.
|
||||||
|
* Contact: carrier <at> sleuthkit <dot> org
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.sleuthkit.autopsy.keywordsearch;
|
||||||
|
|
||||||
|
import com.google.common.io.CharSource;
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.logging.Level;
|
||||||
|
import org.openide.util.NbBundle;
|
||||||
|
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
|
||||||
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.TextExtractor;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
|
||||||
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
|
|
||||||
|
/**
 * A "source" for the extracted abstractFile viewer that displays "raw" (not
 * highlighted) indexed text for a file or an artifact.
 *
 * Unlike the Solr-backed implementation, this source extracts text on the fly
 * from the file itself (via a TextExtractor) and serves it page-by-page, one
 * Chunker chunk per page. Because the underlying Chunker is a one-way stream,
 * pages can only be produced in forward order; see the reviewer notes on the
 * paging methods below.
 */
class ExtractedText implements IndexedText { // ELTODO

    // Total number of pages; initialized to 1 in the constructor and never
    // updated afterwards (the true chunk count is not known up front).
    private int numPages = 0;
    // Zero-based index of the page currently shown.
    private int currentPage = 0;
    private final AbstractFile abstractFile;
    private final long objectId;
    //keep last abstractFile cached
    // cachedString holds the HTML-escaped, <pre>-wrapped text of the most
    // recently produced chunk; cachedChunk is the page number it belongs to.
    private String cachedString;
    private int cachedChunk;
    // Streaming chunker over the extracted text; created in initialize().
    private Chunker chunker = null;
    private static final Logger logger = Logger.getLogger(ExtractedText.class.getName());

    /**
     * Construct a new ExtractedText object for the given content and object id.
     * This constructor needs both a content object and an object id because the
     * ExtractedText implementation attempts to provide useful messages in the
     * text content viewer for (a) the case where a file has not been indexed
     * because known files are being skipped and (b) the case where the file
     * content has not yet been indexed.
     *
     * @param file     Abstract file.
     * @param objectId Either a file id or an artifact id.
     *
     * @throws TextExtractorFactory.NoTextExtractorFound if no extractor
     *                                                   supports this file.
     * @throws TextExtractor.InitReaderException         if the extractor's
     *                                                   reader cannot be opened.
     */
    ExtractedText(AbstractFile file, long objectId) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
        this.abstractFile = file;
        this.objectId = objectId;
        this.currentPage = 0; // ELTODO
        this.numPages = 1;
        initialize();
    }

    /**
     * Return the ID that this object is associated with -- to help with caching
     *
     * @return the file or artifact id passed at construction
     */
    public long getObjectId() {
        return this.objectId;
    }

    @Override
    public int getCurrentPage() {
        return this.currentPage;
    }

    /**
     * Always reports a next page.
     *
     * NOTE(review): this is unconditionally true even after the chunker is
     * exhausted, in which case advancing leads getText() to return the generic
     * error message from getContentText(). Presumably a first-cut placeholder
     * (the class is marked ELTODO) — confirm whether this should consult
     * chunker.hasNext().
     */
    @Override
    public boolean hasNextPage() {
        return true;
    }

    /**
     * Always reports no previous page. NOTE(review): consistent with the
     * chunker being a forward-only stream — earlier chunks cannot be re-read.
     */
    @Override
    public boolean hasPreviousPage() {
        return false;
    }

    /**
     * Advance to the next page.
     *
     * @return the new (zero-based) current page index
     *
     * @throws IllegalStateException if there is no next page (currently
     *                               unreachable since hasNextPage() is
     *                               always true).
     */
    @Override
    public int nextPage() {
        if (!hasNextPage()) {
            throw new IllegalStateException(
                    NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextPage.exception.msg"));
        }
        ++currentPage;
        return currentPage;
    }

    /**
     * Move to the previous page.
     *
     * @return the new (zero-based) current page index
     *
     * @throws IllegalStateException always, in practice, since
     *                               hasPreviousPage() is always false.
     */
    @Override
    public int previousPage() {
        if (!hasPreviousPage()) {
            throw new IllegalStateException(
                    NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousPage.exception.msg"));
        }
        --currentPage;
        return currentPage;
    }

    // Item (hit) navigation is not supported by this source: it carries raw
    // extracted text with no search hits to step through (getNumberHits() == 0).

    @Override
    public boolean hasNextItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasNextItem.exception.msg"));
    }

    @Override
    public boolean hasPreviousItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasPreviousItem.exception.msg"));
    }

    @Override
    public int nextItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextItem.exception.msg"));
    }

    @Override
    public int previousItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousItem.exception.msg"));
    }

    @Override
    public int currentItem() {
        throw new UnsupportedOperationException(
                NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.currentItem.exception.msg"));
    }

    /**
     * Get the HTML-wrapped text for the current page, or a localized error
     * message if extraction fails.
     *
     * NOTE(review): this passes currentPage + 1 while the field is advanced by
     * nextPage() and the cache compares against the passed value — looks like a
     * 0-based/1-based mismatch waiting to happen (marked ELTODO); confirm the
     * intended page numbering.
     */
    @Override
    public String getText() {
        try {
            return getContentText(currentPage + 1); // ELTODO
        } catch (Exception ex) {
            logger.log(Level.SEVERE, "Couldn't get extracted text", ex); //NON-NLS
        }
        return Bundle.IndexedText_errorMessage_errorGettingText();
    }

    @NbBundle.Messages({
        "ExtractedText.FileText=File Text"})
    @Override
    public String toString() {
        return Bundle.ExtractedText_FileText();
    }

    /**
     * This source is not backed by the index, so it is not searchable.
     */
    @Override
    public boolean isSearchable() {
        return false;
    }

    @Override
    public String getAnchorPrefix() {
        return "";
    }

    /**
     * No keyword hits are associated with raw extracted text.
     */
    @Override
    public int getNumberHits() {
        return 0;
    }

    @Override
    public int getNumberPages() {
        return numPages;
    }

    /**
     * Set the internal values, such as pages
     *
     * Obtains a text extractor for the file, opens a reader over the extracted
     * text (with metadata appended), and wraps it in a Chunker for paging.
     *
     * @throws TextExtractorFactory.NoTextExtractorFound if no extractor
     *                                                   supports this file.
     * @throws TextExtractor.InitReaderException         if the extractor's
     *                                                   reader cannot be opened.
     */
    private void initialize() throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
        TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null);

        // Collected metadata is not used further here; see the comment in
        // getTikaOrTextExtractor() for why it is captured anyway.
        Map<String, String> extractedMetadata = new HashMap<>();
        Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata);

        //Get a reader for the content of the given source
        BufferedReader reader = new BufferedReader(sourceReader);
        chunker = new Chunker(reader);
    }

    /**
     * Extract text from abstractFile
     *
     * Returns the cached page if the requested page is the one last produced;
     * otherwise pulls the NEXT chunk from the streaming chunker, escapes it for
     * HTML, wraps it in a pre element, and caches it. Because the chunker is
     * forward-only, requesting any page other than the cached one simply
     * yields the next sequential chunk, whatever number was asked for.
     *
     * NOTE(review): the parameter deliberately(?) shadows the currentPage
     * field — only the parameter is consulted in this method.
     *
     * @param currentPage currently used page
     *
     * @return the extracted text, HTML-escaped and wrapped in pre tags, or a
     *         localized error message when the chunker is exhausted
     *
     * @throws TextExtractor.InitReaderException if the extractor reader fails
     * @throws IOException                       on read errors
     * @throws Exception                         rethrown from the chunker when
     *                                           chunking fails mid-stream
     */
    private String getContentText(int currentPage) throws TextExtractor.InitReaderException, IOException, Exception {

        // ELTODO
        //check if cached
        if (cachedString != null) {
            if (cachedChunk == currentPage) {
                return cachedString;
            }
        }

        String indexedText;
        if (chunker.hasNext()) {
            Chunker.Chunk chunk = chunker.next();
            chunk.setChunkId(currentPage);

            // The chunker reports read failures out-of-band; surface them as
            // an exception after logging which file was being read.
            if (chunker.hasException()) {
                logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException());
                throw chunker.getException();
            }

            indexedText = chunk.toString();
        } else {
            return Bundle.IndexedText_errorMessage_errorGettingText();
        }

        // Escape for display in the HTML-based viewer, then wrap in <pre> to
        // preserve the extracted text's whitespace; cache for repeat requests.
        cachedString = EscapeUtil.escapeHtml(indexedText).trim();
        StringBuilder sb = new StringBuilder(cachedString.length() + 20);
        sb.append("<pre>").append(cachedString).append("</pre>"); //NON-NLS
        cachedString = sb.toString();
        cachedChunk = currentPage;

        return cachedString;
    }

    /**
     * Open a reader over the file's extracted text with its metadata appended
     * at the end.
     *
     * @param extractor         text extractor already configured for the file
     * @param aFile             the file being read (used for log messages)
     * @param extractedMetadata out-parameter: receives a copy of the
     *                          extractor's metadata map for use by the caller
     *                          after this method completes
     *
     * @return a reader over file text followed by formatted metadata; falls
     *         back to the bare file-text reader if metadata formatting fails
     *
     * @throws TextExtractor.InitReaderException if the extractor's reader
     *                                           cannot be opened
     */
    private Reader getTikaOrTextExtractor(TextExtractor extractor, AbstractFile aFile,
            Map<String, String> extractedMetadata) throws TextExtractor.InitReaderException {

        Reader fileText = extractor.getReader();
        Reader finalReader;
        try {
            Map<String, String> metadata = extractor.getMetadata();
            if (!metadata.isEmpty()) {
                // Creating the metadata artifact here causes occasional problems
                // when indexing the text, so we save the metadata map to
                // use after this method is complete.
                extractedMetadata.putAll(metadata);
            }
            CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
            //Append the metadata to end of the file text
            finalReader = CharSource.concat(new CharSource() {
                //Wrap fileText reader for concatenation
                @Override
                public Reader openStream() throws IOException {
                    return fileText;
                }
            }, formattedMetadata).openStream();
        } catch (IOException ex) {
            logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
                    aFile.getName(), aFile.getId()), ex);
            //Just send file text.
            finalReader = fileText;
        }
        //divide into chunks and index
        return finalReader;
    }

}
|
@ -26,7 +26,9 @@ import java.util.ArrayList;
|
|||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
|
import org.apache.tika.mime.MimeTypes;
|
||||||
import org.openide.nodes.Node;
|
import org.openide.nodes.Node;
|
||||||
|
import org.openide.util.Exceptions;
|
||||||
import org.openide.util.Lookup;
|
import org.openide.util.Lookup;
|
||||||
import org.openide.util.NbBundle;
|
import org.openide.util.NbBundle;
|
||||||
import org.openide.util.lookup.ServiceProvider;
|
import org.openide.util.lookup.ServiceProvider;
|
||||||
@ -34,7 +36,11 @@ import org.sleuthkit.autopsy.casemodule.Case;
|
|||||||
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
|
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
|
||||||
import org.sleuthkit.autopsy.corecomponentinterfaces.TextViewer;
|
import org.sleuthkit.autopsy.corecomponentinterfaces.TextViewer;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
|
import org.sleuthkit.autopsy.ingest.IngestModule;
|
||||||
import org.sleuthkit.autopsy.keywordsearch.AdHocSearchChildFactory.AdHocQueryResult;
|
import org.sleuthkit.autopsy.keywordsearch.AdHocSearchChildFactory.AdHocQueryResult;
|
||||||
|
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.TextExtractor;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
import org.sleuthkit.datamodel.Account;
|
import org.sleuthkit.datamodel.Account;
|
||||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||||
@ -45,6 +51,7 @@ import static org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASS
|
|||||||
import org.sleuthkit.datamodel.Content;
|
import org.sleuthkit.datamodel.Content;
|
||||||
import org.sleuthkit.datamodel.Report;
|
import org.sleuthkit.datamodel.Report;
|
||||||
import org.sleuthkit.datamodel.TskCoreException;
|
import org.sleuthkit.datamodel.TskCoreException;
|
||||||
|
import org.sleuthkit.datamodel.TskData;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A text viewer that displays the indexed text associated with a file or an
|
* A text viewer that displays the indexed text associated with a file or an
|
||||||
@ -61,14 +68,20 @@ public class ExtractedTextViewer implements TextViewer {
|
|||||||
private ExtractedContentPanel panel;
|
private ExtractedContentPanel panel;
|
||||||
private volatile Node currentNode = null;
|
private volatile Node currentNode = null;
|
||||||
private IndexedText currentSource = null;
|
private IndexedText currentSource = null;
|
||||||
|
private FileTypeDetector fileTypeDetector = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructs a text viewer that displays the indexed text associated with a
|
* Constructs a text viewer that displays the indexed text associated with a
|
||||||
* file or an artifact, possibly marked up with HTML to highlight keyword
|
* file or an artifact, possibly marked up with HTML to highlight keyword
|
||||||
* hits.
|
* hits. If text for the Content has not been fully indexed by Solr then
|
||||||
|
* attempt to extract text using one of text extractors.
|
||||||
*/
|
*/
|
||||||
public ExtractedTextViewer() {
|
public ExtractedTextViewer() {
|
||||||
// This constructor is intentionally empty.
|
try {
|
||||||
|
fileTypeDetector = new FileTypeDetector();
|
||||||
|
} catch (FileTypeDetector.FileTypeDetectorInitException ex) {
|
||||||
|
logger.log(Level.SEVERE, "Failed to initialize FileTypeDetector", ex); //NON-NLS
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -155,16 +168,31 @@ public class ExtractedTextViewer implements TextViewer {
|
|||||||
*/
|
*/
|
||||||
IndexedText rawContentText = null;
|
IndexedText rawContentText = null;
|
||||||
if (file != null) {
|
if (file != null) {
|
||||||
rawContentText = new RawText(file, file.getId());
|
|
||||||
|
// see if Solr has fully indexed this file
|
||||||
|
if (solrHasFullyIndexedContent(file.getId())) {
|
||||||
|
rawContentText = new SolrIndexedText(file, file.getId());
|
||||||
sources.add(rawContentText);
|
sources.add(rawContentText);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Solr does not have fully indexed content.
|
||||||
|
// see if it's a file type for which we can extract text
|
||||||
|
if (ableToExtractTextFromFile(file)) {
|
||||||
|
try {
|
||||||
|
rawContentText = new ExtractedText(file, file.getId());
|
||||||
|
sources.add(rawContentText);
|
||||||
|
} catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
|
||||||
|
// do nothing
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Add the "raw" (not highlighted) text, if any, for any report
|
* Add the "raw" (not highlighted) text, if any, for any report
|
||||||
* associated with the node.
|
* associated with the node.
|
||||||
*/
|
*/
|
||||||
if (report != null) {
|
if (report != null) {
|
||||||
rawContentText = new RawText(report, report.getId());
|
rawContentText = new SolrIndexedText(report, report.getId());
|
||||||
sources.add(rawContentText);
|
sources.add(rawContentText);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -222,12 +250,11 @@ public class ExtractedTextViewer implements TextViewer {
|
|||||||
if (attribute != null) {
|
if (attribute != null) {
|
||||||
long artifactId = attribute.getValueLong();
|
long artifactId = attribute.getValueLong();
|
||||||
BlackboardArtifact associatedArtifact = Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboardArtifact(artifactId);
|
BlackboardArtifact associatedArtifact = Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboardArtifact(artifactId);
|
||||||
rawArtifactText = new RawText(associatedArtifact, associatedArtifact.getArtifactID());
|
rawArtifactText = new SolrIndexedText(associatedArtifact, associatedArtifact.getArtifactID());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
rawArtifactText = new RawText(artifact, artifact.getArtifactID());
|
rawArtifactText = new SolrIndexedText(artifact, artifact.getArtifactID());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return rawArtifactText;
|
return rawArtifactText;
|
||||||
@ -340,10 +367,20 @@ public class ExtractedTextViewer implements TextViewer {
|
|||||||
* data source instead of a file.
|
* data source instead of a file.
|
||||||
*/
|
*/
|
||||||
AbstractFile file = node.getLookup().lookup(AbstractFile.class);
|
AbstractFile file = node.getLookup().lookup(AbstractFile.class);
|
||||||
if (file != null && solrHasContent(file.getId())) {
|
if (file != null) {
|
||||||
|
|
||||||
|
// see if Solr has fully indexed this file
|
||||||
|
if (solrHasFullyIndexedContent(file.getId())) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Solr does not have fully indexed content.
|
||||||
|
// see if it's a file type for which we can extract text
|
||||||
|
if (ableToExtractTextFromFile(file)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the lookup of the node contains an artifact that is neither a
|
* If the lookup of the node contains an artifact that is neither a
|
||||||
* keyword hit artifact nor a credit card account artifact, and the
|
* keyword hit artifact nor a credit card account artifact, and the
|
||||||
@ -351,7 +388,7 @@ public class ExtractedTextViewer implements TextViewer {
|
|||||||
* indexed text for the artifact.
|
* indexed text for the artifact.
|
||||||
*/
|
*/
|
||||||
if (artifact != null) {
|
if (artifact != null) {
|
||||||
return solrHasContent(artifact.getArtifactID());
|
return solrHasFullyIndexedContent(artifact.getArtifactID());
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -361,7 +398,7 @@ public class ExtractedTextViewer implements TextViewer {
|
|||||||
*/
|
*/
|
||||||
Report report = node.getLookup().lookup(Report.class);
|
Report report = node.getLookup().lookup(Report.class);
|
||||||
if (report != null) {
|
if (report != null) {
|
||||||
return solrHasContent(report.getId());
|
return solrHasFullyIndexedContent(report.getId());
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -397,12 +434,14 @@ public class ExtractedTextViewer implements TextViewer {
|
|||||||
*
|
*
|
||||||
* @return true if Solr has content, else false
|
* @return true if Solr has content, else false
|
||||||
*/
|
*/
|
||||||
private boolean solrHasContent(Long objectId) {
|
private boolean solrHasFullyIndexedContent(Long objectId) {
|
||||||
final Server solrServer = KeywordSearch.getServer();
|
final Server solrServer = KeywordSearch.getServer();
|
||||||
if (solrServer.coreIsOpen() == false) {
|
if (solrServer.coreIsOpen() == false) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ELTODO get total number of chunks in the file, and verify that
|
||||||
|
// all of the chunks have been indexed.
|
||||||
try {
|
try {
|
||||||
return solrServer.queryIsIndexed(objectId);
|
return solrServer.queryIsIndexed(objectId);
|
||||||
} catch (NoOpenCoreException | KeywordSearchModuleException ex) {
|
} catch (NoOpenCoreException | KeywordSearchModuleException ex) {
|
||||||
@ -411,6 +450,63 @@ public class ExtractedTextViewer implements TextViewer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if we can extract text for this file type.
|
||||||
|
*
|
||||||
|
* @param file Abstract File
|
||||||
|
*
|
||||||
|
* @return true if text can be extracted from file, else false
|
||||||
|
*/
|
||||||
|
private boolean ableToExtractTextFromFile(AbstractFile file) {
|
||||||
|
|
||||||
|
TskData.TSK_DB_FILES_TYPE_ENUM fileType = file.getType();
|
||||||
|
|
||||||
|
if (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract unicode strings from unallocated and unused blocks and carved
|
||||||
|
* text files. The reason for performing string extraction on these is
|
||||||
|
* because they all may contain multiple encodings which can cause text
|
||||||
|
* to be missed by the more specialized text extractors.
|
||||||
|
*/
|
||||||
|
if ((fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
|
||||||
|
|| fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
|
||||||
|
|| (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
final long size = file.getSize();
|
||||||
|
//if not to index content, or a dir, or 0 content, index meta data only
|
||||||
|
|
||||||
|
if (file.isDir() || size == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ELTODO do we need to skip text files here? probably not.
|
||||||
|
if (file.getNameExtension().equalsIgnoreCase("txt")) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ELTODO do we need to skip known files here? probably not.
|
||||||
|
if (KeywordSearchSettings.getSkipKnown() && file.getKnown().equals(TskData.FileKnown.KNOWN)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
String mimeType = fileTypeDetector.getMIMEType(file).trim().toLowerCase();
|
||||||
|
|
||||||
|
if (KeywordSearchIngestModule.ARCHIVE_MIME_TYPES.contains(mimeType)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Listener to select the next match found in the text
|
* Listener to select the next match found in the text
|
||||||
*/
|
*/
|
||||||
|
@ -96,7 +96,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
* generally text extractors should ignore archives and let unpacking
|
* generally text extractors should ignore archives and let unpacking
|
||||||
* modules take care of them
|
* modules take care of them
|
||||||
*/
|
*/
|
||||||
private static final List<String> ARCHIVE_MIME_TYPES
|
static final List<String> ARCHIVE_MIME_TYPES
|
||||||
= ImmutableList.of(
|
= ImmutableList.of(
|
||||||
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
|
//ignore unstructured binary and compressed data, for which string extraction or unzipper works better
|
||||||
"application/x-7z-compressed", //NON-NLS
|
"application/x-7z-compressed", //NON-NLS
|
||||||
@ -683,7 +683,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
@NbBundle.Messages({
|
@NbBundle.Messages({
|
||||||
"KeywordSearchIngestModule.metadataTitle=METADATA"
|
"KeywordSearchIngestModule.metadataTitle=METADATA"
|
||||||
})
|
})
|
||||||
private CharSource getMetaDataCharSource(Map<String, String> metadata) {
|
static CharSource getMetaDataCharSource(Map<String, String> metadata) {
|
||||||
return CharSource.wrap(new StringBuilder(
|
return CharSource.wrap(new StringBuilder(
|
||||||
String.format("\n\n------------------------------%s------------------------------\n\n",
|
String.format("\n\n------------------------------%s------------------------------\n\n",
|
||||||
Bundle.KeywordSearchIngestModule_metadataTitle()))
|
Bundle.KeywordSearchIngestModule_metadataTitle()))
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* Autopsy Forensic Browser
|
* Autopsy Forensic Browser
|
||||||
*
|
*
|
||||||
* Copyright 2011-2018 Basis Technology Corp.
|
* Copyright 2011-2023 Basis Technology Corp.
|
||||||
* Contact: carrier <at> sleuthkit <dot> org
|
* Contact: carrier <at> sleuthkit <dot> org
|
||||||
*
|
*
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
@ -30,9 +30,9 @@ import org.sleuthkit.datamodel.TskData;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* A "source" for the extracted content viewer that displays "raw" (not
|
* A "source" for the extracted content viewer that displays "raw" (not
|
||||||
* highlighted) indexed text for a file or an artifact.
|
* highlighted) Solr indexed text for a file or an artifact.
|
||||||
*/
|
*/
|
||||||
class RawText implements IndexedText {
|
class SolrIndexedText implements IndexedText {
|
||||||
|
|
||||||
private int numPages = 0;
|
private int numPages = 0;
|
||||||
private int currentPage = 0;
|
private int currentPage = 0;
|
||||||
@ -43,12 +43,12 @@ class RawText implements IndexedText {
|
|||||||
//keep last content cached
|
//keep last content cached
|
||||||
private String cachedString;
|
private String cachedString;
|
||||||
private int cachedChunk;
|
private int cachedChunk;
|
||||||
private static final Logger logger = Logger.getLogger(RawText.class.getName());
|
private static final Logger logger = Logger.getLogger(SolrIndexedText.class.getName());
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Construct a new RawText object for the given content and object id. This
|
* Construct a new SolrIndexedText object for the given content and object id. This
|
||||||
* constructor needs both a content object and an object id because the
|
* constructor needs both a content object and an object id because the
|
||||||
* RawText implementation attempts to provide useful messages in the text
|
* SolrIndexedText implementation attempts to provide useful messages in the text
|
||||||
* content viewer for (a) the case where a file has not been indexed because
|
* content viewer for (a) the case where a file has not been indexed because
|
||||||
* known files are being skipped and (b) the case where the file content has
|
* known files are being skipped and (b) the case where the file content has
|
||||||
* not yet been indexed.
|
* not yet been indexed.
|
||||||
@ -56,14 +56,14 @@ class RawText implements IndexedText {
|
|||||||
* @param content Used to get access to file names and "known" status.
|
* @param content Used to get access to file names and "known" status.
|
||||||
* @param objectId Either a file id or an artifact id.
|
* @param objectId Either a file id or an artifact id.
|
||||||
*/
|
*/
|
||||||
RawText(Content content, long objectId) {
|
SolrIndexedText(Content content, long objectId) {
|
||||||
this.content = content;
|
this.content = content;
|
||||||
this.blackboardArtifact = null;
|
this.blackboardArtifact = null;
|
||||||
this.objectId = objectId;
|
this.objectId = objectId;
|
||||||
initialize();
|
initialize();
|
||||||
}
|
}
|
||||||
|
|
||||||
RawText(BlackboardArtifact bba, long objectId) {
|
SolrIndexedText(BlackboardArtifact bba, long objectId) {
|
||||||
this.content = null;
|
this.content = null;
|
||||||
this.blackboardArtifact = bba;
|
this.blackboardArtifact = bba;
|
||||||
this.objectId = objectId;
|
this.objectId = objectId;
|
||||||
@ -159,14 +159,14 @@ class RawText implements IndexedText {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@NbBundle.Messages({
|
@NbBundle.Messages({
|
||||||
"RawText.FileText=File Text",
|
"SolrIndexedText.FileText=File Text",
|
||||||
"RawText.ResultText=Result Text"})
|
"SolrIndexedText.ResultText=Result Text"})
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
if (null != content) {
|
if (null != content) {
|
||||||
return Bundle.RawText_FileText();
|
return Bundle.SolrIndexedText_FileText();
|
||||||
} else {
|
} else {
|
||||||
return Bundle.RawText_ResultText();
|
return Bundle.SolrIndexedText_ResultText();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user