More work

This commit is contained in:
eugene.livis 2023-06-06 14:14:37 -04:00
parent 8494453a09
commit e3da0cae14
4 changed files with 22 additions and 67 deletions

View File

@ -1,7 +1,7 @@
/* /*
* Autopsy Forensic Browser * Autopsy Forensic Browser
* *
* Copyright 2011-2021 Basis Technology Corp. * Copyright 2011-2023 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org * Contact: carrier <at> sleuthkit <dot> org
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
@ -659,14 +659,8 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP
int totalPages = source.getNumberPages(); int totalPages = source.getNumberPages();
updateTotalPagesDisplay(totalPages); updateTotalPagesDisplay(totalPages);
// ELTODO enableNextPageControl(source.hasNextPage());
//if (totalPages < 2) { enablePrevPageControl(source.hasPreviousPage());
// enableNextPageControl(false);
// enablePrevPageControl(false);
//} else {
enableNextPageControl(source.hasNextPage());
enablePrevPageControl(source.hasPreviousPage());
//}
} }
/** /**

View File

@ -32,46 +32,33 @@ import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory; import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
/** ELTODO /**
* A "source" for the extracted abstractFile viewer that displays "raw" (not * A "source" for abstractFile viewer that displays "raw" extracted text for a
* highlighted) indexed text for a file or an artifact. * file. Only supports file types for which there are text extractors. Uses
* the chunking algorithm used by KeywordSearchIngestModule. The readers used in
* chunking don't have the ability to go backwards or to fast-forward to a specific
* offset. Therefore, there is no way to page backwards, or to determine in advance
* how many total pages there are.
*/ */
class ExtractedText implements IndexedText { class ExtractedText implements IndexedText {
private int numPages = 0; private int numPages = 0;
private int currentPage = 0; private int currentPage = 0;
private final AbstractFile abstractFile; private final AbstractFile abstractFile;
private final long objectId;
private Chunker chunker = null; private Chunker chunker = null;
private static final Logger logger = Logger.getLogger(ExtractedText.class.getName()); private static final Logger logger = Logger.getLogger(ExtractedText.class.getName());
/** /**
* Construct a new ExtractedText object for the given content and object id. * Construct a new ExtractedText object for the given abstract file.
* This constructor needs both a content object and an object id because the
* ExtractedText implementation attempts to provide useful messages in the
* text content viewer for (a) the case where a file has not been indexed
* because known files are being skipped and (b) the case where the file
* content has not yet been indexed.
* *
* @param file Abstract file. * @param file Abstract file.
* @param objectId Either a file id or an artifact id.
*/ */
ExtractedText(AbstractFile file, long objectId) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException { ExtractedText(AbstractFile file) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
this.abstractFile = file; this.abstractFile = file;
this.objectId = objectId;
this.numPages = -1; // We don't know how many pages there are until we reach end of the document this.numPages = -1; // We don't know how many pages there are until we reach end of the document
initialize(); initialize();
} }
/**
* Return the ID that this object is associated with -- to help with caching
*
* @return
*/
public long getObjectId() {
return this.objectId;
}
@Override @Override
public int getCurrentPage() { public int getCurrentPage() {
return this.currentPage; return this.currentPage;
@ -177,9 +164,6 @@ class ExtractedText implements IndexedText {
return numPages; return numPages;
} }
/**
* Set the internal values, such as pages
*/
private void initialize() throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException { private void initialize() throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null); TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null);
@ -194,7 +178,6 @@ class ExtractedText implements IndexedText {
/** /**
* Extract text from abstractFile * Extract text from abstractFile
* *
* @param node a node that has extracted abstractFile
* @param currentPage currently used page * @param currentPage currently used page
* *
* @return the extracted text * @return the extracted text
@ -209,7 +192,7 @@ class ExtractedText implements IndexedText {
logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException()); logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException());
throw chunker.getException(); throw chunker.getException();
} }
indexedText = chunk.toString(); indexedText = chunk.toString();
} else { } else {
return Bundle.IndexedText_errorMessage_errorGettingText(); return Bundle.IndexedText_errorMessage_errorGettingText();
@ -229,9 +212,7 @@ class ExtractedText implements IndexedText {
try { try {
Map<String, String> metadata = extractor.getMetadata(); Map<String, String> metadata = extractor.getMetadata();
if (!metadata.isEmpty()) { if (!metadata.isEmpty()) {
// Creating the metadata artifact here causes occasional problems // save the metadata map to use after this method is complete.
// when indexing the text, so we save the metadata map to
// use after this method is complete.
extractedMetadata.putAll(metadata); extractedMetadata.putAll(metadata);
} }
CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata); CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
@ -249,9 +230,8 @@ class ExtractedText implements IndexedText {
//Just send file text. //Just send file text.
finalReader = fileText; finalReader = fileText;
} }
//divide into chunks and index //divide into chunks
return finalReader; return finalReader;
} }
} }

View File

@ -1,7 +1,7 @@
/* /*
* Autopsy Forensic Browser * Autopsy Forensic Browser
* *
* Copyright 2011-2019 Basis Technology Corp. * Copyright 2011-2023 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org * Contact: carrier <at> sleuthkit <dot> org
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
@ -28,7 +28,6 @@ import java.util.List;
import java.util.logging.Level; import java.util.logging.Level;
import org.apache.tika.mime.MimeTypes; import org.apache.tika.mime.MimeTypes;
import org.openide.nodes.Node; import org.openide.nodes.Node;
import org.openide.util.Exceptions;
import org.openide.util.Lookup; import org.openide.util.Lookup;
import org.openide.util.NbBundle; import org.openide.util.NbBundle;
import org.openide.util.lookup.ServiceProvider; import org.openide.util.lookup.ServiceProvider;
@ -36,7 +35,6 @@ import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException; import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.corecomponentinterfaces.TextViewer; import org.sleuthkit.autopsy.corecomponentinterfaces.TextViewer;
import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.ingest.IngestModule;
import org.sleuthkit.autopsy.keywordsearch.AdHocSearchChildFactory.AdHocQueryResult; import org.sleuthkit.autopsy.keywordsearch.AdHocSearchChildFactory.AdHocQueryResult;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector; import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textextractors.TextExtractor; import org.sleuthkit.autopsy.textextractors.TextExtractor;
@ -178,7 +176,7 @@ public class ExtractedTextViewer implements TextViewer {
// see if it's a file type for which we can extract text // see if it's a file type for which we can extract text
if (ableToExtractTextFromFile(file)) { if (ableToExtractTextFromFile(file)) {
try { try {
rawContentText = new ExtractedText(file, file.getId()); rawContentText = new ExtractedText(file);
sources.add(rawContentText); sources.add(rawContentText);
} catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) { } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
// do nothing // do nothing
@ -451,7 +449,9 @@ public class ExtractedTextViewer implements TextViewer {
} }
/** /**
* Check if we can extract text for this file type. * Check if we can extract text for this file type using one of our text extractors.
* NOTE: the logic in this method should mirror the logic that
* KeywordSearchIngestModule uses to decide which files to index.
* *
* @param file Abstract File * @param file Abstract File
* *
@ -465,12 +465,6 @@ public class ExtractedTextViewer implements TextViewer {
return false; return false;
} }
/**
* Extract unicode strings from unallocated and unused blocks and carved
* text files. The reason for performing string extraction on these is
* because they all may contain multiple encodings which can cause text
* to be missed by the more specialized text extractors.
*/
if ((fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) if ((fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
|| fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS)) || fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
|| (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED))) { || (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED))) {
@ -478,22 +472,10 @@ public class ExtractedTextViewer implements TextViewer {
} }
final long size = file.getSize(); final long size = file.getSize();
//if not to index content, or a dir, or 0 content, index meta data only
if (file.isDir() || size == 0) { if (file.isDir() || size == 0) {
return false; return false;
} }
// ELTODO do we need to skip text files here? probably not.
if (file.getNameExtension().equalsIgnoreCase("txt")) {
return false;
}
// ELTODO do we need to skip known files here? probably not.
if (KeywordSearchSettings.getSkipKnown() && file.getKnown().equals(TskData.FileKnown.KNOWN)) {
return false;
}
String mimeType = fileTypeDetector.getMIMEType(file).trim().toLowerCase(); String mimeType = fileTypeDetector.getMIMEType(file).trim().toLowerCase();
if (KeywordSearchIngestModule.ARCHIVE_MIME_TYPES.contains(mimeType)) { if (KeywordSearchIngestModule.ARCHIVE_MIME_TYPES.contains(mimeType)) {
@ -501,7 +483,7 @@ public class ExtractedTextViewer implements TextViewer {
} }
if (MimeTypes.OCTET_STREAM.equals(mimeType)) { if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
// ELTODO return false; return false;
} }
return true; return true;

View File

@ -38,7 +38,6 @@ import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.tika.mime.MimeTypes; import org.apache.tika.mime.MimeTypes;
import org.openide.util.Exceptions;
import org.openide.util.Lookup; import org.openide.util.Lookup;
import org.openide.util.NbBundle; import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages; import org.openide.util.NbBundle.Messages;