More work

This commit is contained in:
eugene.livis 2023-06-06 14:14:37 -04:00
parent 8494453a09
commit e3da0cae14
4 changed files with 22 additions and 67 deletions

View File

@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2021 Basis Technology Corp.
* Copyright 2011-2023 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -659,14 +659,8 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP
int totalPages = source.getNumberPages();
updateTotalPagesDisplay(totalPages);
// ELTODO
//if (totalPages < 2) {
// enableNextPageControl(false);
// enablePrevPageControl(false);
//} else {
enableNextPageControl(source.hasNextPage());
enablePrevPageControl(source.hasPreviousPage());
//}
enableNextPageControl(source.hasNextPage());
enablePrevPageControl(source.hasPreviousPage());
}
/**

View File

@ -32,46 +32,33 @@ import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.datamodel.AbstractFile;
/** ELTODO
* A "source" for the extracted abstractFile viewer that displays "raw" (not
* highlighted) indexed text for a file or an artifact.
/**
* A "source" for abstractFile viewer that displays "raw" extracted text for a
* file. Only supports file types for which there are text extractors. Uses
the chunking algorithm used by KeywordSearchIngestModule. The readers used in
chunking don't have the ability to go backwards or to fast-forward to a specific
offset. Therefore there is no way to scroll pages back, or to determine how
* many total pages there are.
*/
class ExtractedText implements IndexedText {
private int numPages = 0;
private int currentPage = 0;
private final AbstractFile abstractFile;
private final long objectId;
private Chunker chunker = null;
private static final Logger logger = Logger.getLogger(ExtractedText.class.getName());
/**
* Construct a new ExtractedText object for the given content and object id.
* This constructor needs both a content object and an object id because the
* ExtractedText implementation attempts to provide useful messages in the
* text content viewer for (a) the case where a file has not been indexed
* because known files are being skipped and (b) the case where the file
* content has not yet been indexed.
* Construct a new ExtractedText object for the given abstract file.
*
* @param file Abstract file.
* @param objectId Either a file id or an artifact id.
* @param file Abstract file.
*/
ExtractedText(AbstractFile file, long objectId) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
ExtractedText(AbstractFile file) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
this.abstractFile = file;
this.objectId = objectId;
this.numPages = -1; // We don't know how many pages there are until we reach end of the document
initialize();
}
/**
* Return the ID that this object is associated with -- to help with caching
*
* @return
*/
public long getObjectId() {
return this.objectId;
}
@Override
public int getCurrentPage() {
return this.currentPage;
@ -177,9 +164,6 @@ class ExtractedText implements IndexedText {
return numPages;
}
/**
* Set the internal values, such as pages
*/
private void initialize() throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null);
@ -194,7 +178,6 @@ class ExtractedText implements IndexedText {
/**
* Extract text from abstractFile
*
* @param node a node that has extracted abstractFile
* @param currentPage currently used page
*
* @return the extracted text
@ -209,7 +192,7 @@ class ExtractedText implements IndexedText {
logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException());
throw chunker.getException();
}
indexedText = chunk.toString();
} else {
return Bundle.IndexedText_errorMessage_errorGettingText();
@ -229,9 +212,7 @@ class ExtractedText implements IndexedText {
try {
Map<String, String> metadata = extractor.getMetadata();
if (!metadata.isEmpty()) {
// Creating the metadata artifact here causes occasional problems
// when indexing the text, so we save the metadata map to
// use after this method is complete.
// save the metadata map to use after this method is complete.
extractedMetadata.putAll(metadata);
}
CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
@ -249,9 +230,8 @@ class ExtractedText implements IndexedText {
//Just send file text.
finalReader = fileText;
}
//divide into chunks and index
//divide into chunks
return finalReader;
}
}

View File

@ -1,7 +1,7 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2011-2019 Basis Technology Corp.
* Copyright 2011-2023 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -28,7 +28,6 @@ import java.util.List;
import java.util.logging.Level;
import org.apache.tika.mime.MimeTypes;
import org.openide.nodes.Node;
import org.openide.util.Exceptions;
import org.openide.util.Lookup;
import org.openide.util.NbBundle;
import org.openide.util.lookup.ServiceProvider;
@ -36,7 +35,6 @@ import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.corecomponentinterfaces.TextViewer;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.ingest.IngestModule;
import org.sleuthkit.autopsy.keywordsearch.AdHocSearchChildFactory.AdHocQueryResult;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
@ -178,7 +176,7 @@ public class ExtractedTextViewer implements TextViewer {
// see if it's a file type for which we can extract text
if (ableToExtractTextFromFile(file)) {
try {
rawContentText = new ExtractedText(file, file.getId());
rawContentText = new ExtractedText(file);
sources.add(rawContentText);
} catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
// do nothing
@ -451,7 +449,9 @@ public class ExtractedTextViewer implements TextViewer {
}
/**
* Check if we can extract text for this file type.
* Check if we can extract text for this file type using one of our text extractors.
* NOTE: the logic in this method should be similar to, and based on, the
* logic of how KeywordSearchIngestModule decides which files to index.
*
* @param file Abstract File
*
@ -465,12 +465,6 @@ public class ExtractedTextViewer implements TextViewer {
return false;
}
/**
* Extract unicode strings from unallocated and unused blocks and carved
* text files. The reason for performing string extraction on these is
* because they all may contain multiple encodings which can cause text
* to be missed by the more specialized text extractors.
*/
if ((fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
|| fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
|| (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED))) {
@ -478,22 +472,10 @@ public class ExtractedTextViewer implements TextViewer {
}
final long size = file.getSize();
//if not to index content, or a dir, or 0 content, index meta data only
if (file.isDir() || size == 0) {
return false;
}
// ELTODO do we need to skip text files here? probably not.
if (file.getNameExtension().equalsIgnoreCase("txt")) {
return false;
}
// ELTODO do we need to skip known files here? probably not.
if (KeywordSearchSettings.getSkipKnown() && file.getKnown().equals(TskData.FileKnown.KNOWN)) {
return false;
}
String mimeType = fileTypeDetector.getMIMEType(file).trim().toLowerCase();
if (KeywordSearchIngestModule.ARCHIVE_MIME_TYPES.contains(mimeType)) {
@ -501,7 +483,7 @@ public class ExtractedTextViewer implements TextViewer {
}
if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
// ELTODO return false;
return false;
}
return true;

View File

@ -38,7 +38,6 @@ import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.stream.Collectors;
import org.apache.tika.mime.MimeTypes;
import org.openide.util.Exceptions;
import org.openide.util.Lookup;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;