From 0068d3acfdefb46793d3d5a5cffe2ac73e161c44 Mon Sep 17 00:00:00 2001 From: "eugene.livis" Date: Mon, 5 Jun 2023 17:23:56 -0400 Subject: [PATCH 1/9] First cut --- .../autopsy/keywordsearch/ExtractedText.java | 270 ++++++++++++++++++ .../keywordsearch/ExtractedTextViewer.java | 122 +++++++- .../KeywordSearchIngestModule.java | 4 +- .../{RawText.java => SolrIndexedText.java} | 24 +- 4 files changed, 393 insertions(+), 27 deletions(-) create mode 100755 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java rename KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/{RawText.java => SolrIndexedText.java} (92%) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java new file mode 100755 index 0000000000..c12b34e93b --- /dev/null +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java @@ -0,0 +1,270 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2023 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.sleuthkit.autopsy.keywordsearch; + +import com.google.common.io.CharSource; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Reader; +import java.util.HashMap; +import java.util.Map; +import java.util.logging.Level; +import org.openide.util.NbBundle; +import org.sleuthkit.autopsy.coreutils.EscapeUtil; +import org.sleuthkit.autopsy.coreutils.Logger; +import org.sleuthkit.autopsy.textextractors.TextExtractor; +import org.sleuthkit.autopsy.textextractors.TextExtractorFactory; +import org.sleuthkit.datamodel.AbstractFile; + +/** + * A "source" for the extracted abstractFile viewer that displays "raw" (not + * highlighted) indexed text for a file or an artifact. + */ +class ExtractedText implements IndexedText { // ELTODO + + private int numPages = 0; + private int currentPage = 0; + private final AbstractFile abstractFile; + private final long objectId; + //keep last abstractFile cached + private String cachedString; + private int cachedChunk; + private Chunker chunker = null; + private static final Logger logger = Logger.getLogger(ExtractedText.class.getName()); + + /** + * Construct a new ExtractedText object for the given content and object id. + * This constructor needs both a content object and an object id because the + * ExtractedText implementation attempts to provide useful messages in the + * text content viewer for (a) the case where a file has not been indexed + * because known files are being skipped and (b) the case where the file + * content has not yet been indexed. + * + * @param file Abstract file. + * @param objectId Either a file id or an artifact id. 
+ */ + ExtractedText(AbstractFile file, long objectId) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException { + this.abstractFile = file; + this.objectId = objectId; + this.currentPage = 0; // ELTODO + this.numPages = 1; + initialize(); + } + + /** + * Return the ID that this object is associated with -- to help with caching + * + * @return + */ + public long getObjectId() { + return this.objectId; + } + + @Override + public int getCurrentPage() { + return this.currentPage; + } + + @Override + public boolean hasNextPage() { + return true; + } + + @Override + public boolean hasPreviousPage() { + return false; + } + + @Override + public int nextPage() { + if (!hasNextPage()) { + throw new IllegalStateException( + NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextPage.exception.msg")); + } + ++currentPage; + return currentPage; + } + + @Override + public int previousPage() { + if (!hasPreviousPage()) { + throw new IllegalStateException( + NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousPage.exception.msg")); + } + --currentPage; + return currentPage; + } + + @Override + public boolean hasNextItem() { + throw new UnsupportedOperationException( + NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasNextItem.exception.msg")); + } + + @Override + public boolean hasPreviousItem() { + throw new UnsupportedOperationException( + NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasPreviousItem.exception.msg")); + } + + @Override + public int nextItem() { + throw new UnsupportedOperationException( + NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextItem.exception.msg")); + } + + @Override + public int previousItem() { + throw new UnsupportedOperationException( + NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousItem.exception.msg")); + } + + @Override + public int currentItem() { + throw new UnsupportedOperationException( + 
NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.currentItem.exception.msg")); + } + + @Override + public String getText() { + try { + return getContentText(currentPage + 1); // ELTODO + } catch (Exception ex) { + logger.log(Level.SEVERE, "Couldn't get extracted text", ex); //NON-NLS + } + return Bundle.IndexedText_errorMessage_errorGettingText(); + } + + @NbBundle.Messages({ + "ExtractedText.FileText=File Text"}) + @Override + public String toString() { + return Bundle.ExtractedText_FileText(); + } + + @Override + public boolean isSearchable() { + return false; + } + + @Override + public String getAnchorPrefix() { + return ""; + } + + @Override + public int getNumberHits() { + return 0; + } + + @Override + public int getNumberPages() { + return numPages; + } + + /** + * Set the internal values, such as pages + */ + private void initialize() throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException { + TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null); + + Map extractedMetadata = new HashMap<>(); + Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata); + + //Get a reader for the content of the given source + BufferedReader reader = new BufferedReader(sourceReader); + chunker = new Chunker(reader); + } + + /** + * Extract text from abstractFile + * + * @param node a node that has extracted abstractFile + * @param currentPage currently used page + * + * @return the extracted text + */ + private String getContentText(int currentPage) throws TextExtractor.InitReaderException, IOException, Exception { + + // ELTODO + //check if cached + if (cachedString != null) { + if (cachedChunk == currentPage) { + return cachedString; + } + } + + String indexedText; + if (chunker.hasNext()) { + Chunker.Chunk chunk = chunker.next(); + chunk.setChunkId(currentPage); + + if (chunker.hasException()) { + logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + 
": " + abstractFile.getName(), chunker.getException()); + throw chunker.getException(); + } + + indexedText = chunk.toString(); + } else { + return Bundle.IndexedText_errorMessage_errorGettingText(); + } + + cachedString = EscapeUtil.escapeHtml(indexedText).trim(); + StringBuilder sb = new StringBuilder(cachedString.length() + 20); + sb.append("
").append(cachedString).append("
"); //NON-NLS + cachedString = sb.toString(); + cachedChunk = currentPage; + + return cachedString; + } + + private Reader getTikaOrTextExtractor(TextExtractor extractor, AbstractFile aFile, + Map extractedMetadata) throws TextExtractor.InitReaderException { + + Reader fileText = extractor.getReader(); + Reader finalReader; + try { + Map metadata = extractor.getMetadata(); + if (!metadata.isEmpty()) { + // Creating the metadata artifact here causes occasional problems + // when indexing the text, so we save the metadata map to + // use after this method is complete. + extractedMetadata.putAll(metadata); + } + CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata); + //Append the metadata to end of the file text + finalReader = CharSource.concat(new CharSource() { + //Wrap fileText reader for concatenation + @Override + public Reader openStream() throws IOException { + return fileText; + } + }, formattedMetadata).openStream(); + } catch (IOException ex) { + logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]", + aFile.getName(), aFile.getId()), ex); + //Just send file text. 
+ finalReader = fileText; + } + //divide into chunks and index + return finalReader; + + } + +} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java index aac5757fc0..a33105e4fc 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java @@ -26,7 +26,9 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.logging.Level; +import org.apache.tika.mime.MimeTypes; import org.openide.nodes.Node; +import org.openide.util.Exceptions; import org.openide.util.Lookup; import org.openide.util.NbBundle; import org.openide.util.lookup.ServiceProvider; @@ -34,7 +36,11 @@ import org.sleuthkit.autopsy.casemodule.Case; import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException; import org.sleuthkit.autopsy.corecomponentinterfaces.TextViewer; import org.sleuthkit.autopsy.coreutils.Logger; +import org.sleuthkit.autopsy.ingest.IngestModule; import org.sleuthkit.autopsy.keywordsearch.AdHocSearchChildFactory.AdHocQueryResult; +import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector; +import org.sleuthkit.autopsy.textextractors.TextExtractor; +import org.sleuthkit.autopsy.textextractors.TextExtractorFactory; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.Account; import org.sleuthkit.datamodel.BlackboardArtifact; @@ -45,6 +51,7 @@ import static org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASS import org.sleuthkit.datamodel.Content; import org.sleuthkit.datamodel.Report; import org.sleuthkit.datamodel.TskCoreException; +import org.sleuthkit.datamodel.TskData; /** * A text viewer that displays the indexed text associated with a file or an @@ -61,14 +68,20 @@ public class ExtractedTextViewer implements TextViewer { private ExtractedContentPanel panel; 
private volatile Node currentNode = null; private IndexedText currentSource = null; + private FileTypeDetector fileTypeDetector = null; /** * Constructs a text viewer that displays the indexed text associated with a * file or an artifact, possibly marked up with HTML to highlight keyword - * hits. + * hits. If text for the Content has not been fully indexed by Solr then + * attempt to extract text using one of text extractors. */ public ExtractedTextViewer() { - // This constructor is intentionally empty. + try { + fileTypeDetector = new FileTypeDetector(); + } catch (FileTypeDetector.FileTypeDetectorInitException ex) { + logger.log(Level.SEVERE, "Failed to initialize FileTypeDetector", ex); //NON-NLS + } } /** @@ -155,8 +168,23 @@ public class ExtractedTextViewer implements TextViewer { */ IndexedText rawContentText = null; if (file != null) { - rawContentText = new RawText(file, file.getId()); - sources.add(rawContentText); + + // see if Solr has fully indexed this file + if (solrHasFullyIndexedContent(file.getId())) { + rawContentText = new SolrIndexedText(file, file.getId()); + sources.add(rawContentText); + } + + // Solr does not have fully indexed content. + // see if it's a file type for which we can extract text + if (ableToExtractTextFromFile(file)) { + try { + rawContentText = new ExtractedText(file, file.getId()); + sources.add(rawContentText); + } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) { + // do nothing + } + } } /* @@ -164,7 +192,7 @@ public class ExtractedTextViewer implements TextViewer { * associated with the node. 
*/ if (report != null) { - rawContentText = new RawText(report, report.getId()); + rawContentText = new SolrIndexedText(report, report.getId()); sources.add(rawContentText); } @@ -222,12 +250,11 @@ public class ExtractedTextViewer implements TextViewer { if (attribute != null) { long artifactId = attribute.getValueLong(); BlackboardArtifact associatedArtifact = Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboardArtifact(artifactId); - rawArtifactText = new RawText(associatedArtifact, associatedArtifact.getArtifactID()); - + rawArtifactText = new SolrIndexedText(associatedArtifact, associatedArtifact.getArtifactID()); } } else { - rawArtifactText = new RawText(artifact, artifact.getArtifactID()); + rawArtifactText = new SolrIndexedText(artifact, artifact.getArtifactID()); } } return rawArtifactText; @@ -340,8 +367,18 @@ public class ExtractedTextViewer implements TextViewer { * data source instead of a file. */ AbstractFile file = node.getLookup().lookup(AbstractFile.class); - if (file != null && solrHasContent(file.getId())) { - return true; + if (file != null) { + + // see if Solr has fully indexed this file + if (solrHasFullyIndexedContent(file.getId())) { + return true; + } + + // Solr does not have fully indexed content. + // see if it's a file type for which we can extract text + if (ableToExtractTextFromFile(file)) { + return true; + } } /* @@ -351,7 +388,7 @@ public class ExtractedTextViewer implements TextViewer { * indexed text for the artifact. 
*/ if (artifact != null) { - return solrHasContent(artifact.getArtifactID()); + return solrHasFullyIndexedContent(artifact.getArtifactID()); } /* @@ -361,7 +398,7 @@ public class ExtractedTextViewer implements TextViewer { */ Report report = node.getLookup().lookup(Report.class); if (report != null) { - return solrHasContent(report.getId()); + return solrHasFullyIndexedContent(report.getId()); } /* @@ -397,12 +434,14 @@ public class ExtractedTextViewer implements TextViewer { * * @return true if Solr has content, else false */ - private boolean solrHasContent(Long objectId) { + private boolean solrHasFullyIndexedContent(Long objectId) { final Server solrServer = KeywordSearch.getServer(); if (solrServer.coreIsOpen() == false) { return false; } + // ELTODO get total number of chunks in the file, and verify that + // all of the chunks have been indexed. try { return solrServer.queryIsIndexed(objectId); } catch (NoOpenCoreException | KeywordSearchModuleException ex) { @@ -411,6 +450,63 @@ public class ExtractedTextViewer implements TextViewer { } } + /** + * Check if we can extract text for this file type. + * + * @param file Abstract File + * + * @return true if text can be extracted from file, else false + */ + private boolean ableToExtractTextFromFile(AbstractFile file) { + + TskData.TSK_DB_FILES_TYPE_ENUM fileType = file.getType(); + + if (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) { + return false; + } + + /** + * Extract unicode strings from unallocated and unused blocks and carved + * text files. The reason for performing string extraction on these is + * because they all may contain multiple encodings which can cause text + * to be missed by the more specialized text extractors. 
+ */ + if ((fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) + || fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS)) + || (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED))) { + return false; + } + + final long size = file.getSize(); + //if not to index content, or a dir, or 0 content, index meta data only + + if (file.isDir() || size == 0) { + return false; + } + + // ELTODO do we need to skip text files here? probably not. + if (file.getNameExtension().equalsIgnoreCase("txt")) { + return false; + } + + // ELTODO do we need to skip known files here? probably not. + if (KeywordSearchSettings.getSkipKnown() && file.getKnown().equals(TskData.FileKnown.KNOWN)) { + return false; + } + + String mimeType = fileTypeDetector.getMIMEType(file).trim().toLowerCase(); + + if (KeywordSearchIngestModule.ARCHIVE_MIME_TYPES.contains(mimeType)) { + return false; + } + + if (MimeTypes.OCTET_STREAM.equals(mimeType)) { + return false; + } + + return true; + } + /** * Listener to select the next match found in the text */ diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java index e3f9582fdf..782f966616 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java @@ -96,7 +96,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule { * generally text extractors should ignore archives and let unpacking * modules take care of them */ - private static final List ARCHIVE_MIME_TYPES + static final List ARCHIVE_MIME_TYPES = ImmutableList.of( //ignore unstructured binary and compressed data, for which string extraction or unzipper works better "application/x-7z-compressed", //NON-NLS @@ -683,7 +683,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule { 
@NbBundle.Messages({ "KeywordSearchIngestModule.metadataTitle=METADATA" }) - private CharSource getMetaDataCharSource(Map metadata) { + static CharSource getMetaDataCharSource(Map metadata) { return CharSource.wrap(new StringBuilder( String.format("\n\n------------------------------%s------------------------------\n\n", Bundle.KeywordSearchIngestModule_metadataTitle())) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RawText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java similarity index 92% rename from KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RawText.java rename to KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java index 789de3fd50..6745e0c5d7 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/RawText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2018 Basis Technology Corp. + * Copyright 2011-2023 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -30,9 +30,9 @@ import org.sleuthkit.datamodel.TskData; /** * A "source" for the extracted content viewer that displays "raw" (not - * highlighted) indexed text for a file or an artifact. + * highlighted) Solr indexed text for a file or an artifact. */ -class RawText implements IndexedText { +class SolrIndexedText implements IndexedText { private int numPages = 0; private int currentPage = 0; @@ -43,12 +43,12 @@ class RawText implements IndexedText { //keep last content cached private String cachedString; private int cachedChunk; - private static final Logger logger = Logger.getLogger(RawText.class.getName()); + private static final Logger logger = Logger.getLogger(SolrIndexedText.class.getName()); /** - * Construct a new RawText object for the given content and object id. 
This + * Construct a new SolrIndexedText object for the given content and object id. This * constructor needs both a content object and an object id because the - * RawText implementation attempts to provide useful messages in the text + * SolrIndexedText implementation attempts to provide useful messages in the text * content viewer for (a) the case where a file has not been indexed because * known files are being skipped and (b) the case where the file content has * not yet been indexed. @@ -56,14 +56,14 @@ class RawText implements IndexedText { * @param content Used to get access to file names and "known" status. * @param objectId Either a file id or an artifact id. */ - RawText(Content content, long objectId) { + SolrIndexedText(Content content, long objectId) { this.content = content; this.blackboardArtifact = null; this.objectId = objectId; initialize(); } - RawText(BlackboardArtifact bba, long objectId) { + SolrIndexedText(BlackboardArtifact bba, long objectId) { this.content = null; this.blackboardArtifact = bba; this.objectId = objectId; @@ -159,14 +159,14 @@ class RawText implements IndexedText { } @NbBundle.Messages({ - "RawText.FileText=File Text", - "RawText.ResultText=Result Text"}) + "SolrIndexedText.FileText=File Text", + "SolrIndexedText.ResultText=Result Text"}) @Override public String toString() { if (null != content) { - return Bundle.RawText_FileText(); + return Bundle.SolrIndexedText_FileText(); } else { - return Bundle.RawText_ResultText(); + return Bundle.SolrIndexedText_ResultText(); } } From 8494453a0965a22b7f15852d746bb6d4b13f1fe9 Mon Sep 17 00:00:00 2001 From: "eugene.livis" Date: Tue, 6 Jun 2023 12:43:45 -0400 Subject: [PATCH 2/9] More work --- .../textcontentviewer/TextContentViewer.java | 1 - .../autopsy/keywordsearch/Bundle.properties | 2 +- .../keywordsearch/ExtractedContentPanel.java | 17 ++++++--- .../autopsy/keywordsearch/ExtractedText.java | 37 ++++++------------- .../keywordsearch/ExtractedTextViewer.java | 24 ++++++------ 
.../keywordsearch/SolrIndexedText.java | 22 ++--------- 6 files changed, 40 insertions(+), 63 deletions(-) diff --git a/Core/src/org/sleuthkit/autopsy/contentviewers/textcontentviewer/TextContentViewer.java b/Core/src/org/sleuthkit/autopsy/contentviewers/textcontentviewer/TextContentViewer.java index 426163ef80..9221f24c72 100644 --- a/Core/src/org/sleuthkit/autopsy/contentviewers/textcontentviewer/TextContentViewer.java +++ b/Core/src/org/sleuthkit/autopsy/contentviewers/textcontentviewer/TextContentViewer.java @@ -24,7 +24,6 @@ import org.openide.util.NbBundle.Messages; import org.openide.util.lookup.ServiceProvider; import org.sleuthkit.autopsy.corecomponentinterfaces.DataContentViewer; import org.sleuthkit.datamodel.AbstractFile; -import org.sleuthkit.datamodel.BlackboardArtifact; /** * A DataContentViewer that displays text with the TextViewers available. diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties index e34155fcd5..565804dea4 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties @@ -42,7 +42,7 @@ AbstractKeywordSearchPerformer.search.emptyKeywordErrorBody=Keyword list is empt AbstractKeywordSearchPerformer.search.noFilesInIdxMsg=No files are in index yet.
If Solr keyword search indexing was enabled, wait for ingest to complete AbstractKeywordSearchPerformer.search.noFilesIdxdMsg=No files were indexed.
Re-ingest the image with the Keyword Search Module and Solr indexing enabled. ExtractedContentViewer.toolTip=Displays extracted text from files and keyword-search results. Requires Keyword Search ingest to be run on a file to activate this viewer. -ExtractedContentViewer.getTitle=Indexed Text +ExtractedContentViewer.getTitle=Extracted Text HighlightedMatchesSource.toString=Search Results Installer.reportPortError=Indexing server port {0} is not available. Check if your security software does not block {1} and consider changing {2} in {3} property file in the application user folder. Then try rebooting your system if another process was causing the conflict. Installer.reportStopPortError=Indexing server stop port {0} is not available. Consider changing {1} in {2} property file in the application user folder. diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.java index c64cee2565..d4e3975b97 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.java @@ -556,7 +556,11 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP * @param total total number of pages to update the display with */ void updateTotalPagesDisplay(int total) { - pageTotalLabel.setText(Integer.toString(total)); + if (total >= 0) { + pageTotalLabel.setText(Integer.toString(total)); + } else { + pageTotalLabel.setText("-"); + } } /** @@ -655,13 +659,14 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP int totalPages = source.getNumberPages(); updateTotalPagesDisplay(totalPages); - if (totalPages < 2) { - enableNextPageControl(false); - enablePrevPageControl(false); - } else { + // ELTODO + //if (totalPages < 2) { + // enableNextPageControl(false); + // enablePrevPageControl(false); + //} else { 
enableNextPageControl(source.hasNextPage()); enablePrevPageControl(source.hasPreviousPage()); - } + //} } /** diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java index c12b34e93b..e42f878b70 100755 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java @@ -32,19 +32,16 @@ import org.sleuthkit.autopsy.textextractors.TextExtractor; import org.sleuthkit.autopsy.textextractors.TextExtractorFactory; import org.sleuthkit.datamodel.AbstractFile; -/** +/** ELTODO * A "source" for the extracted abstractFile viewer that displays "raw" (not * highlighted) indexed text for a file or an artifact. */ -class ExtractedText implements IndexedText { // ELTODO +class ExtractedText implements IndexedText { private int numPages = 0; private int currentPage = 0; private final AbstractFile abstractFile; private final long objectId; - //keep last abstractFile cached - private String cachedString; - private int cachedChunk; private Chunker chunker = null; private static final Logger logger = Logger.getLogger(ExtractedText.class.getName()); @@ -62,8 +59,7 @@ class ExtractedText implements IndexedText { // ELTODO ExtractedText(AbstractFile file, long objectId) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException { this.abstractFile = file; this.objectId = objectId; - this.currentPage = 0; // ELTODO - this.numPages = 1; + this.numPages = -1; // We don't know how many pages there are until we reach end of the document initialize(); } @@ -83,7 +79,10 @@ class ExtractedText implements IndexedText { // ELTODO @Override public boolean hasNextPage() { - return true; + if (chunker.hasNext()) { + return true; + } + return false; } @Override @@ -144,7 +143,7 @@ class ExtractedText implements IndexedText { // ELTODO @Override public String getText() { try { - 
return getContentText(currentPage + 1); // ELTODO + return getContentText(currentPage); } catch (Exception ex) { logger.log(Level.SEVERE, "Couldn't get extracted text", ex); //NON-NLS } @@ -201,15 +200,6 @@ class ExtractedText implements IndexedText { // ELTODO * @return the extracted text */ private String getContentText(int currentPage) throws TextExtractor.InitReaderException, IOException, Exception { - - // ELTODO - //check if cached - if (cachedString != null) { - if (cachedChunk == currentPage) { - return cachedString; - } - } - String indexedText; if (chunker.hasNext()) { Chunker.Chunk chunk = chunker.next(); @@ -225,13 +215,10 @@ class ExtractedText implements IndexedText { // ELTODO return Bundle.IndexedText_errorMessage_errorGettingText(); } - cachedString = EscapeUtil.escapeHtml(indexedText).trim(); - StringBuilder sb = new StringBuilder(cachedString.length() + 20); - sb.append("
").append(cachedString).append("
"); //NON-NLS - cachedString = sb.toString(); - cachedChunk = currentPage; - - return cachedString; + indexedText = EscapeUtil.escapeHtml(indexedText).trim(); + StringBuilder sb = new StringBuilder(indexedText.length() + 20); + sb.append("
").append(indexedText).append("
"); //NON-NLS + return sb.toString(); } private Reader getTikaOrTextExtractor(TextExtractor extractor, AbstractFile aFile, diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java index a33105e4fc..e715f73e0d 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java @@ -173,17 +173,17 @@ public class ExtractedTextViewer implements TextViewer { if (solrHasFullyIndexedContent(file.getId())) { rawContentText = new SolrIndexedText(file, file.getId()); sources.add(rawContentText); - } - - // Solr does not have fully indexed content. - // see if it's a file type for which we can extract text - if (ableToExtractTextFromFile(file)) { - try { - rawContentText = new ExtractedText(file, file.getId()); - sources.add(rawContentText); - } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) { - // do nothing - } + } else { + // Solr does not have fully indexed content. 
+ // see if it's a file type for which we can extract text + if (ableToExtractTextFromFile(file)) { + try { + rawContentText = new ExtractedText(file, file.getId()); + sources.add(rawContentText); + } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) { + // do nothing + } + } } } @@ -501,7 +501,7 @@ public class ExtractedTextViewer implements TextViewer { } if (MimeTypes.OCTET_STREAM.equals(mimeType)) { - return false; + // ELTODO return false; } return true; diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java index 6745e0c5d7..a500c5ef48 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java @@ -40,9 +40,6 @@ class SolrIndexedText implements IndexedText { private final Content content; private final BlackboardArtifact blackboardArtifact; private final long objectId; - //keep last content cached - private String cachedString; - private int cachedChunk; private static final Logger logger = Logger.getLogger(SolrIndexedText.class.getName()); /** @@ -249,14 +246,6 @@ class SolrIndexedText implements IndexedText { } int chunkId = currentPage; - - //check if cached - if (cachedString != null) { - if (cachedChunk == chunkId) { - return cachedString; - } - } - //not cached String indexedText = solrServer.getSolrContent(this.objectId, chunkId); if (indexedText == null) { @@ -269,13 +258,10 @@ class SolrIndexedText implements IndexedText { return Bundle.IndexedText_warningMessage_noTextAvailable(); } - cachedString = EscapeUtil.escapeHtml(indexedText).trim(); - StringBuilder sb = new StringBuilder(cachedString.length() + 20); - sb.append("
").append(cachedString).append("
"); //NON-NLS - cachedString = sb.toString(); - cachedChunk = chunkId; - - return cachedString; + indexedText = EscapeUtil.escapeHtml(indexedText).trim(); + StringBuilder sb = new StringBuilder(indexedText.length() + 20); + sb.append("
").append(indexedText).append("
"); //NON-NLS + return sb.toString(); } /** From e3da0cae14f070ef820f1595346e88451efdc9c4 Mon Sep 17 00:00:00 2001 From: "eugene.livis" Date: Tue, 6 Jun 2023 14:14:37 -0400 Subject: [PATCH 3/9] More work --- .../keywordsearch/ExtractedContentPanel.java | 12 ++--- .../autopsy/keywordsearch/ExtractedText.java | 46 ++++++------------- .../keywordsearch/ExtractedTextViewer.java | 30 +++--------- .../KeywordSearchIngestModule.java | 1 - 4 files changed, 22 insertions(+), 67 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.java index d4e3975b97..41d384d559 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2021 Basis Technology Corp. + * Copyright 2011-2023 Basis Technology Corp. 
* Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -659,14 +659,8 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP int totalPages = source.getNumberPages(); updateTotalPagesDisplay(totalPages); - // ELTODO - //if (totalPages < 2) { - // enableNextPageControl(false); - // enablePrevPageControl(false); - //} else { - enableNextPageControl(source.hasNextPage()); - enablePrevPageControl(source.hasPreviousPage()); - //} + enableNextPageControl(source.hasNextPage()); + enablePrevPageControl(source.hasPreviousPage()); } /** diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java index e42f878b70..edb8641b29 100755 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java @@ -32,46 +32,33 @@ import org.sleuthkit.autopsy.textextractors.TextExtractor; import org.sleuthkit.autopsy.textextractors.TextExtractorFactory; import org.sleuthkit.datamodel.AbstractFile; -/** ELTODO - * A "source" for the extracted abstractFile viewer that displays "raw" (not - * highlighted) indexed text for a file or an artifact. +/** + * A "source" for abstractFile viewer that displays "raw" extracted text for a + * file. Only supports file types for which there are text extractors. Uses + * chunking algorithm used by KeywordSearchIngestModule. The readers used in + * chunking don't have ability to go backwards or to fast forward to a specific + * offset. Therefore there is no way to scroll pages back, or to determine how + * many total pages there are. 
*/ class ExtractedText implements IndexedText { private int numPages = 0; private int currentPage = 0; private final AbstractFile abstractFile; - private final long objectId; private Chunker chunker = null; private static final Logger logger = Logger.getLogger(ExtractedText.class.getName()); /** - * Construct a new ExtractedText object for the given content and object id. - * This constructor needs both a content object and an object id because the - * ExtractedText implementation attempts to provide useful messages in the - * text content viewer for (a) the case where a file has not been indexed - * because known files are being skipped and (b) the case where the file - * content has not yet been indexed. + * Construct a new ExtractedText object for the given abstract file. * - * @param file Abstract file. - * @param objectId Either a file id or an artifact id. + * @param file Abstract file. */ - ExtractedText(AbstractFile file, long objectId) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException { + ExtractedText(AbstractFile file) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException { this.abstractFile = file; - this.objectId = objectId; this.numPages = -1; // We don't know how many pages there are until we reach end of the document initialize(); } - /** - * Return the ID that this object is associated with -- to help with caching - * - * @return - */ - public long getObjectId() { - return this.objectId; - } - @Override public int getCurrentPage() { return this.currentPage; @@ -177,9 +164,6 @@ class ExtractedText implements IndexedText { return numPages; } - /** - * Set the internal values, such as pages - */ private void initialize() throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException { TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null); @@ -194,7 +178,6 @@ class ExtractedText implements IndexedText { /** * Extract text from abstractFile * - 
* @param node a node that has extracted abstractFile * @param currentPage currently used page * * @return the extracted text @@ -209,7 +192,7 @@ class ExtractedText implements IndexedText { logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException()); throw chunker.getException(); } - + indexedText = chunk.toString(); } else { return Bundle.IndexedText_errorMessage_errorGettingText(); @@ -229,9 +212,7 @@ class ExtractedText implements IndexedText { try { Map metadata = extractor.getMetadata(); if (!metadata.isEmpty()) { - // Creating the metadata artifact here causes occasional problems - // when indexing the text, so we save the metadata map to - // use after this method is complete. + // save the metadata map to use after this method is complete. extractedMetadata.putAll(metadata); } CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata); @@ -249,9 +230,8 @@ class ExtractedText implements IndexedText { //Just send file text. finalReader = fileText; } - //divide into chunks and index + //divide into chunks return finalReader; - } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java index e715f73e0d..6047c2db60 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2019 Basis Technology Corp. + * Copyright 2011-2023 Basis Technology Corp. 
* Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,7 +28,6 @@ import java.util.List; import java.util.logging.Level; import org.apache.tika.mime.MimeTypes; import org.openide.nodes.Node; -import org.openide.util.Exceptions; import org.openide.util.Lookup; import org.openide.util.NbBundle; import org.openide.util.lookup.ServiceProvider; @@ -36,7 +35,6 @@ import org.sleuthkit.autopsy.casemodule.Case; import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException; import org.sleuthkit.autopsy.corecomponentinterfaces.TextViewer; import org.sleuthkit.autopsy.coreutils.Logger; -import org.sleuthkit.autopsy.ingest.IngestModule; import org.sleuthkit.autopsy.keywordsearch.AdHocSearchChildFactory.AdHocQueryResult; import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector; import org.sleuthkit.autopsy.textextractors.TextExtractor; @@ -178,7 +176,7 @@ public class ExtractedTextViewer implements TextViewer { // see if it's a file type for which we can extract text if (ableToExtractTextFromFile(file)) { try { - rawContentText = new ExtractedText(file, file.getId()); + rawContentText = new ExtractedText(file); sources.add(rawContentText); } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) { // do nothing @@ -451,7 +449,9 @@ public class ExtractedTextViewer implements TextViewer { } /** - * Check if we can extract text for this file type. + * Check if we can extract text for this file type using one of our text extractors. + * NOTE: the logic in this method should be similar and based on the + * logic of how KeywordSearchIngestModule decides which files to index. * * @param file Abstract File * @@ -465,12 +465,6 @@ public class ExtractedTextViewer implements TextViewer { return false; } - /** - * Extract unicode strings from unallocated and unused blocks and carved - * text files. 
The reason for performing string extraction on these is - * because they all may contain multiple encodings which can cause text - * to be missed by the more specialized text extractors. - */ if ((fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS)) || (fileType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED))) { @@ -478,22 +472,10 @@ public class ExtractedTextViewer implements TextViewer { } final long size = file.getSize(); - //if not to index content, or a dir, or 0 content, index meta data only - if (file.isDir() || size == 0) { return false; } - // ELTODO do we need to skip text files here? probably not. - if (file.getNameExtension().equalsIgnoreCase("txt")) { - return false; - } - - // ELTODO do we need to skip known files here? probably not. - if (KeywordSearchSettings.getSkipKnown() && file.getKnown().equals(TskData.FileKnown.KNOWN)) { - return false; - } - String mimeType = fileTypeDetector.getMIMEType(file).trim().toLowerCase(); if (KeywordSearchIngestModule.ARCHIVE_MIME_TYPES.contains(mimeType)) { @@ -501,7 +483,7 @@ public class ExtractedTextViewer implements TextViewer { } if (MimeTypes.OCTET_STREAM.equals(mimeType)) { - // ELTODO return false; + return false; } return true; diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java index 782f966616..cd6f255def 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java @@ -38,7 +38,6 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; import java.util.stream.Collectors; import org.apache.tika.mime.MimeTypes; -import org.openide.util.Exceptions; import org.openide.util.Lookup; import org.openide.util.NbBundle; import 
org.openide.util.NbBundle.Messages; From cd83205382ecf62ca0087964b33f24aafff77ebc Mon Sep 17 00:00:00 2001 From: "eugene.livis" Date: Tue, 6 Jun 2023 16:02:27 -0400 Subject: [PATCH 4/9] Improvements and fixes and caching --- .../TextContentViewerPanel.java | 2 +- .../autopsy/keywordsearch/ExtractedText.java | 21 ++++---- .../keywordsearch/ExtractedTextViewer.java | 29 ++++++++--- .../autopsy/keywordsearch/Server.java | 49 ++++++++++++++----- 4 files changed, 71 insertions(+), 30 deletions(-) diff --git a/Core/src/org/sleuthkit/autopsy/contentviewers/textcontentviewer/TextContentViewerPanel.java b/Core/src/org/sleuthkit/autopsy/contentviewers/textcontentviewer/TextContentViewerPanel.java index 1bc3339867..781dabc867 100644 --- a/Core/src/org/sleuthkit/autopsy/contentviewers/textcontentviewer/TextContentViewerPanel.java +++ b/Core/src/org/sleuthkit/autopsy/contentviewers/textcontentviewer/TextContentViewerPanel.java @@ -99,7 +99,7 @@ public class TextContentViewerPanel extends javax.swing.JPanel implements DataCo /** * Determine the isPreffered score for the content viewer which is - * displaying this panel. Score is depenedent on the score of the supported + * displaying this panel. Score is dependent on the score of the supported * TextViewers which exist. 
* * @param node diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java index edb8641b29..c0e4e5f6c4 100755 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java @@ -56,7 +56,15 @@ class ExtractedText implements IndexedText { ExtractedText(AbstractFile file) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException { this.abstractFile = file; this.numPages = -1; // We don't know how many pages there are until we reach end of the document - initialize(); + + TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null); + + Map extractedMetadata = new HashMap<>(); + Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata); + + //Get a reader for the content of the given source + BufferedReader reader = new BufferedReader(sourceReader); + this.chunker = new Chunker(reader); } @Override @@ -164,17 +172,6 @@ class ExtractedText implements IndexedText { return numPages; } - private void initialize() throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException { - TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null); - - Map extractedMetadata = new HashMap<>(); - Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata); - - //Get a reader for the content of the given source - BufferedReader reader = new BufferedReader(sourceReader); - chunker = new Chunker(reader); - } - /** * Extract text from abstractFile * diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java index 6047c2db60..3f28c97d25 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java +++ 
b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java @@ -67,6 +67,9 @@ public class ExtractedTextViewer implements TextViewer { private volatile Node currentNode = null; private IndexedText currentSource = null; private FileTypeDetector fileTypeDetector = null; + + private long cachedObjId = -1; + private boolean chachedIsFullyIndexed = false; /** * Constructs a text viewer that displays the indexed text associated with a @@ -426,25 +429,39 @@ public class ExtractedTextViewer implements TextViewer { } /** - * Check if Solr has extracted content for a given node + * Check if Solr has indexed ALL of the content for a given node. Note that + * in some situations Solr only indexes parts of a file. This happens when + * an in-line KWS finds a KW hit in the file - only the chunks with the KW + * hit (+/- 1 chunk) get indexed by Solr. That is not enough for the + * purposes of this text viewer as we need to display all of the text in the + * file. * * @param objectId * * @return true if Solr has content, else false */ private boolean solrHasFullyIndexedContent(Long objectId) { + + // check if we have cached this decision + if (objectId == cachedObjId) { + return chachedIsFullyIndexed; + } + + cachedObjId = objectId; final Server solrServer = KeywordSearch.getServer(); if (solrServer.coreIsOpen() == false) { - return false; + chachedIsFullyIndexed = false; + return chachedIsFullyIndexed; } - // ELTODO get total number of chunks in the file, and verify that - // all of the chunks have been indexed. + // verify that all of the chunks in the file have been indexed. 
try { - return solrServer.queryIsIndexed(objectId); + chachedIsFullyIndexed = solrServer.queryIsFullyIndexed(objectId); + return chachedIsFullyIndexed; } catch (NoOpenCoreException | KeywordSearchModuleException ex) { logger.log(Level.SEVERE, "Error querying Solr server", ex); //NON-NLS - return false; + chachedIsFullyIndexed = false; + return chachedIsFullyIndexed; } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java index d54d6964dd..2a580c4a6a 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java @@ -1635,23 +1635,29 @@ public class Server { } /** - * Return true if the file is indexed (either as a whole as a chunk) + * Return true if the file is fully indexed (no chunks are missing) * * @param contentID * - * @return true if it is indexed + * @return true if it is fully indexed * * @throws KeywordSearchModuleException * @throws NoOpenCoreException */ - public boolean queryIsIndexed(long contentID) throws KeywordSearchModuleException, NoOpenCoreException { + public boolean queryIsFullyIndexed(long contentID) throws KeywordSearchModuleException, NoOpenCoreException { currentCoreLock.readLock().lock(); try { if (null == currentCollection) { throw new NoOpenCoreException(); } try { - return currentCollection.queryIsIndexed(contentID); + int totalNumChunks = currentCollection.queryTotalNumFileChunks(contentID); + if (totalNumChunks == 0) { + return false; + } + + int numIndexedChunks = currentCollection.queryNumIndexedChunks(contentID); + return numIndexedChunks == totalNumChunks; } catch (Exception ex) { // intentional "catch all" as Solr is known to throw all kinds of Runtime exceptions throw new KeywordSearchModuleException(NbBundle.getMessage(this.getClass(), "Server.queryIsIdxd.exception.msg"), ex); @@ -1680,7 +1686,7 @@ public class Server { throw new 
NoOpenCoreException(); } try { - return currentCollection.queryNumFileChunks(fileID); + return currentCollection.queryTotalNumFileChunks(fileID); } catch (Exception ex) { // intentional "catch all" as Solr is known to throw all kinds of Runtime exceptions throw new KeywordSearchModuleException(NbBundle.getMessage(this.getClass(), "Server.queryNumFileChunks.exception.msg"), ex); @@ -2484,7 +2490,7 @@ public class Server { } /** - * Return true if the file is indexed (either as a whole as a chunk) + * Return true if the file is indexed (either as a whole or as a chunk) * * @param contentID * @@ -2502,17 +2508,20 @@ public class Server { } /** - * Execute query that gets number of indexed file chunks for a file + * Execute query that gets total number of file chunks for a file. NOTE: + * this does not imply that all of the chunks have been indexed. This + * parameter simply stores the total number of chunks that the file had + * (as determined during chunking). * * @param contentID file id of the original file broken into chunks and - * indexed + * indexed * - * @return int representing number of indexed file chunks, 0 if there is - * no chunks + * @return int representing number of file chunks, 0 if there is no + * chunks * * @throws SolrServerException */ - private int queryNumFileChunks(long contentID) throws SolrServerException, IOException { + private int queryTotalNumFileChunks(long contentID) throws SolrServerException, IOException { final SolrQuery q = new SolrQuery(); q.setQuery("*:*"); String filterQuery = Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(Long.toString(contentID)); @@ -2537,6 +2546,24 @@ public class Server { logger.log(Level.SEVERE, "Error getting content from Solr. Solr document id " + contentID + ", query: " + filterQuery); //NON-NLS return 0; } + + /** + * Execute query that gets number of indexed chunks for a specific Solr + * document, without actually returning the content. 
+ * + * @param contentID file id of the original file broken into chunks and + * indexed + * + * @return int representing number of indexed chunks + * + * @throws SolrServerException + */ + int queryNumIndexedChunks(long contentID) throws SolrServerException, IOException { + SolrQuery q = new SolrQuery(Server.Schema.ID + ":" + contentID + Server.CHUNK_ID_SEPARATOR + "*"); + q.setRows(0); + int numChunks = (int) query(q).getResults().getNumFound(); + return numChunks; + } } class ServerAction extends AbstractAction { From 9ebb3cfa002d0dcc2c9e54b7c7e38d44a40f0999 Mon Sep 17 00:00:00 2001 From: "eugene.livis" Date: Wed, 7 Jun 2023 15:18:17 -0400 Subject: [PATCH 5/9] Multiple bug fixes and improvements --- .../keywordsearch/ExtractedTextViewer.java | 79 +++++++++++++++---- .../autopsy/keywordsearch/Server.java | 8 +- 2 files changed, 66 insertions(+), 21 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java index 3f28c97d25..3471e4d055 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java @@ -24,7 +24,9 @@ import java.awt.event.ActionEvent; import java.awt.event.ActionListener; import java.util.ArrayList; import java.util.Collection; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.logging.Level; import org.apache.tika.mime.MimeTypes; import org.openide.nodes.Node; @@ -68,8 +70,9 @@ public class ExtractedTextViewer implements TextViewer { private IndexedText currentSource = null; private FileTypeDetector fileTypeDetector = null; - private long cachedObjId = -1; - private boolean chachedIsFullyIndexed = false; + // cache of last 10 solrHasFullyIndexedContent() requests sent to Solr. 
+ private SolrIsFullyIndexedCache solrCache = null; + // ELTODO clear the cache when case closes /** * Constructs a text viewer that displays the indexed text associated with a @@ -83,6 +86,8 @@ public class ExtractedTextViewer implements TextViewer { } catch (FileTypeDetector.FileTypeDetectorInitException ex) { logger.log(Level.SEVERE, "Failed to initialize FileTypeDetector", ex); //NON-NLS } + + solrCache = new SolrIsFullyIndexedCache(); } /** @@ -193,8 +198,11 @@ public class ExtractedTextViewer implements TextViewer { * associated with the node. */ if (report != null) { - rawContentText = new SolrIndexedText(report, report.getId()); - sources.add(rawContentText); + // see if Solr has fully indexed this file + if (solrHasFullyIndexedContent(report.getId())) { + rawContentText = new SolrIndexedText(report, report.getId()); + sources.add(rawContentText); + } } /* @@ -237,7 +245,7 @@ public class ExtractedTextViewer implements TextViewer { } - static private IndexedText getRawArtifactText(BlackboardArtifact artifact) throws TskCoreException, NoCurrentCaseException { + private IndexedText getRawArtifactText(BlackboardArtifact artifact) throws TskCoreException, NoCurrentCaseException { IndexedText rawArtifactText = null; if (null != artifact) { /* @@ -251,11 +259,15 @@ public class ExtractedTextViewer implements TextViewer { if (attribute != null) { long artifactId = attribute.getValueLong(); BlackboardArtifact associatedArtifact = Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboardArtifact(artifactId); - rawArtifactText = new SolrIndexedText(associatedArtifact, associatedArtifact.getArtifactID()); + if (solrHasFullyIndexedContent(associatedArtifact.getArtifactID())) { + rawArtifactText = new SolrIndexedText(associatedArtifact, associatedArtifact.getArtifactID()); + } } } else { - rawArtifactText = new SolrIndexedText(artifact, artifact.getArtifactID()); + if (solrHasFullyIndexedContent(artifact.getArtifactID())) { + rawArtifactText = new 
SolrIndexedText(artifact, artifact.getArtifactID()); + } } } return rawArtifactText; @@ -443,25 +455,25 @@ public class ExtractedTextViewer implements TextViewer { private boolean solrHasFullyIndexedContent(Long objectId) { // check if we have cached this decision - if (objectId == cachedObjId) { - return chachedIsFullyIndexed; + if (solrCache.containsKey(objectId)) { + return solrCache.getCombination(objectId); } - cachedObjId = objectId; final Server solrServer = KeywordSearch.getServer(); if (solrServer.coreIsOpen() == false) { - chachedIsFullyIndexed = false; - return chachedIsFullyIndexed; + solrCache.putCombination(objectId, false); + return false; } // verify that all of the chunks in the file have been indexed. try { - chachedIsFullyIndexed = solrServer.queryIsFullyIndexed(objectId); - return chachedIsFullyIndexed; + boolean isFullyIndexed = solrServer.queryIsFullyIndexed(objectId); + solrCache.putCombination(objectId, isFullyIndexed); + return isFullyIndexed; } catch (NoOpenCoreException | KeywordSearchModuleException ex) { logger.log(Level.SEVERE, "Error querying Solr server", ex); //NON-NLS - chachedIsFullyIndexed = false; - return chachedIsFullyIndexed; + solrCache.putCombination(objectId, false); + return false; } } @@ -693,4 +705,39 @@ public class ExtractedTextViewer implements TextViewer { previousPage(); } } + + /** + * This class maintains a cache of last 10 solrHasFullyIndexedContent() + * requests sent to Solr. 
+ */ + private class SolrIsFullyIndexedCache { + + private static final int CACHE_SIZE = 10; + private final LinkedHashMap cache; + + private SolrIsFullyIndexedCache() { + this.cache = new LinkedHashMap(CACHE_SIZE, 0.75f, true) { + @Override + protected boolean removeEldestEntry(Map.Entry eldest) { + return size() > CACHE_SIZE; + } + }; + } + + public void putCombination(long key, boolean value) { + cache.put(key, value); + } + + public Boolean getCombination(long key) { + return cache.get(key); + } + + public void clearCache() { + cache.clear(); + } + + public boolean containsKey(long key) { + return cache.containsKey(key); + } + } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java index 2a580c4a6a..9d04595f09 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Server.java @@ -2531,7 +2531,7 @@ public class Server { SolrDocumentList solrDocuments = query(q).getResults(); if (!solrDocuments.isEmpty()) { SolrDocument solrDocument = solrDocuments.get(0); - if (solrDocument != null) { + if (solrDocument != null && !solrDocument.isEmpty()) { Object fieldValue = solrDocument.getFieldValue(Schema.NUM_CHUNKS.toString()); return (Integer)fieldValue; } @@ -2541,9 +2541,7 @@ public class Server { logger.log(Level.SEVERE, "Error getting content from Solr. Solr document id " + contentID + ", query: " + filterQuery, ex); //NON-NLS return 0; } - - // ERROR: we should never get here - logger.log(Level.SEVERE, "Error getting content from Solr. 
Solr document id " + contentID + ", query: " + filterQuery); //NON-NLS + // File not indexed return 0; } @@ -2559,7 +2557,7 @@ public class Server { * @throws SolrServerException */ int queryNumIndexedChunks(long contentID) throws SolrServerException, IOException { - SolrQuery q = new SolrQuery(Server.Schema.ID + ":" + contentID + Server.CHUNK_ID_SEPARATOR + "*"); + SolrQuery q = new SolrQuery(Server.Schema.ID + ":" + KeywordSearchUtil.escapeLuceneQuery(Long.toString(contentID)) + Server.CHUNK_ID_SEPARATOR + "*"); q.setRows(0); int numChunks = (int) query(q).getResults().getNumFound(); return numChunks; From 3757ca606da1b786c475760eff5e155f44989140 Mon Sep 17 00:00:00 2001 From: "eugene.livis" Date: Wed, 7 Jun 2023 15:22:35 -0400 Subject: [PATCH 6/9] Minor --- .../org/sleuthkit/autopsy/keywordsearch/IndexedText.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexedText.java index 17366483e3..880d4c8a0c 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexedText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexedText.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2018 Basis Technology Corp. + * Copyright 2011-2023 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -24,12 +24,13 @@ import org.openide.util.NbBundle; * Interface to provide HTML text to display in ExtractedContentViewer. There is * a SOLR implementation of this that interfaces with SOLR to highlight the * keyword hits and a version that does not do markup so that you can simply - * view the stored text. + * view the stored text. There is also an implementation that extracts text from + * a file using one os TextExtractors. 
*/ @NbBundle.Messages({ - "IndexedText.errorMessage.errorGettingText=Error retrieving indexed text.", + "IndexedText.errorMessage.errorGettingText=Error retrieving text.", "IndexedText.warningMessage.knownFile=This file is a known file (based on MD5 hash) and does not have indexed text.", - "IndexedText.warningMessage.noTextAvailable=No indexed text for this file." + "IndexedText.warningMessage.noTextAvailable=No text available for this file." }) interface IndexedText { From 318917bcc65b386dd7d49a249a8e38ad892d958f Mon Sep 17 00:00:00 2001 From: "eugene.livis" Date: Wed, 7 Jun 2023 15:37:30 -0400 Subject: [PATCH 7/9] Renamed interface and classes to more appropriate name --- .../autopsy/keywordsearch/AccountsText.java | 4 +- .../keywordsearch/Bundle.properties-MERGED | 18 +- .../keywordsearch/ExtractedContentPanel.form | 6 +- .../keywordsearch/ExtractedContentPanel.java | 30 +- .../autopsy/keywordsearch/ExtractedText.java | 306 +++++++----------- .../keywordsearch/ExtractedTextViewer.java | 32 +- .../FileReaderExtractedText.java | 234 ++++++++++++++ .../keywordsearch/HighlightedText.java | 6 +- .../autopsy/keywordsearch/IndexedText.java | 148 --------- .../keywordsearch/SolrIndexedText.java | 16 +- 10 files changed, 402 insertions(+), 398 deletions(-) mode change 100755 => 100644 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java create mode 100755 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileReaderExtractedText.java delete mode 100644 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexedText.java diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AccountsText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AccountsText.java index 00d5670151..73a824a942 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AccountsText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AccountsText.java @@ -50,7 +50,7 @@ import org.sleuthkit.datamodel.TskCoreException; * and 
HighlightedText are very similar and could probably use some refactoring * to reduce code duplication. */ -class AccountsText implements IndexedText { +class AccountsText implements ExtractedText { private static final Logger logger = Logger.getLogger(AccountsText.class.getName()); private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT); @@ -312,7 +312,7 @@ class AccountsText implements IndexedText { return "
" + highlightedText + "
"; //NON-NLS } catch (Exception ex) { logger.log(Level.SEVERE, "Error getting highlighted text for Solr doc id " + this.solrObjectId + ", chunkID " + this.currentPage, ex); //NON-NLS - return Bundle.IndexedText_errorMessage_errorGettingText(); + return Bundle.ExtractedText_errorMessage_errorGettingText(); } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties-MERGED b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties-MERGED index c05fd15c02..2ff50a4048 100755 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties-MERGED +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties-MERGED @@ -15,6 +15,7 @@ ExtractAllTermsReport.error.noOpenCase=No currently open case. ExtractAllTermsReport.export.error=Error During Unique Word Extraction ExtractAllTermsReport.exportComplete=Unique Word Extraction Complete ExtractAllTermsReport.getName.text=Extract Unique Words +# {0} - Number of extracted terms ExtractAllTermsReport.numberExtractedTerms=Extracted {0} terms... ExtractAllTermsReport.search.ingestInProgressBody=Keyword Search Ingest is currently running.
<br />Not all files have been indexed and unique word extraction might yield incomplete results.<br />
Do you want to proceed with unique word extraction anyway? ExtractAllTermsReport.search.noFilesInIdxMsg=No files are in index yet. If Solr keyword search indexing and Solr indexing were enabled, wait for ingest to complete. @@ -22,13 +23,15 @@ ExtractAllTermsReport.search.noFilesInIdxMsg2=No files are in index yet. Re-inge ExtractAllTermsReport.search.searchIngestInProgressTitle=Keyword Search Ingest in Progress ExtractAllTermsReport.startExport=Starting Unique Word Extraction ExtractedContentPanel.setMarkup.panelTxt=Loading text... Please wait +# {0} - Content name ExtractedContentPanel.SetMarkup.progress.loading=Loading text for {0} +ExtractedText.errorMessage.errorGettingText=Error retrieving text. +ExtractedText.FileText=File Text +ExtractedText.warningMessage.knownFile=This file is a known file (based on MD5 hash) and does not have indexed text. +ExtractedText.warningMessage.noTextAvailable=No text available for this file. GlobalEditListPanel.editKeyword.title=Edit Keyword GlobalEditListPanel.warning.text=Boundary characters ^ and $ do not match word boundaries. Consider\nreplacing with an explicit list of boundary characters, such as [ \\.,] GlobalEditListPanel.warning.title=Warning -IndexedText.errorMessage.errorGettingText=Error retrieving indexed text. -IndexedText.warningMessage.knownFile=This file is a known file (based on MD5 hash) and does not have indexed text. -IndexedText.warningMessage.noTextAvailable=No indexed text for this file. KeywordSearchGlobalSearchSettingsPanel.customizeComponents.windowsLimitedOCR=Only process images which are over 100KB in size or extracted from a document. 
(Beta) (Requires Windows 64-bit) KeywordSearchGlobalSearchSettingsPanel.customizeComponents.windowsOCR=Enable Optical Character Recognition (OCR) (Requires Windows 64-bit) KeywordSearchGlobalSettingsPanel.Title=Global Keyword Search Settings @@ -49,7 +52,7 @@ KeywordSearchResultFactory.createNodeForKey.noResultsFound.text=No results found KeywordSearchResultFactory.query.exception.msg=Could not perform the query OpenIDE-Module-Display-Category=Ingest Module -OpenIDE-Module-Long-Description=Keyword Search ingest module.\n\nThe module indexes files found in the disk image at ingest time.\nIt then periodically runs the search on the indexed files using one or more keyword lists (containing pure words and/or regular expressions) and posts results.\n\nThe module also contains additional tools integrated in the main GUI, such as keyword list configuration, keyword search bar in the top-right corner, extracted text viewer and search results viewer showing highlighted keywords found. +OpenIDE-Module-Long-Description=Keyword Search ingest module.\n\nThe module indexes files found in the disk image at ingest time.\nIt then periodically runs the search on the indexed files using one or more keyword lists (containing pure words and/or regular expressions) and posts results.\n\n\The module also contains additional tools integrated in the main GUI, such as keyword list configuration, keyword search bar in the top-right corner, extracted text viewer and search results viewer showing highlighted keywords found. OpenIDE-Module-Name=KeywordSearch OptionsCategory_Name_KeywordSearchOptions=Keyword Search OptionsCategory_Keywords_KeywordSearchOptions=Keyword Search @@ -91,7 +94,7 @@ AbstractKeywordSearchPerformer.search.emptyKeywordErrorBody=Keyword list is empt AbstractKeywordSearchPerformer.search.noFilesInIdxMsg=No files are in index yet.
If Solr keyword search indexing was enabled, wait for ingest to complete AbstractKeywordSearchPerformer.search.noFilesIdxdMsg=No files were indexed.
Re-ingest the image with the Keyword Search Module and Solr indexing enabled. ExtractedContentViewer.toolTip=Displays extracted text from files and keyword-search results. Requires Keyword Search ingest to be run on a file to activate this viewer. -ExtractedContentViewer.getTitle=Indexed Text +ExtractedContentViewer.getTitle=Extracted Text HighlightedMatchesSource.toString=Search Results Installer.reportPortError=Indexing server port {0} is not available. Check if your security software does not block {1} and consider changing {2} in {3} property file in the application user folder. Then try rebooting your system if another process was causing the conflict. Installer.reportStopPortError=Indexing server stop port {0} is not available. Consider changing {1} in {2} property file in the application user folder. @@ -137,8 +140,6 @@ KeywordSearchIngestModule.init.onlyIdxKwSkipMsg=Only indexing will be done and k KeywordSearchIngestModule.doInBackGround.displayName=Periodic Keyword Search KeywordSearchIngestModule.doInBackGround.finalizeMsg=Finalizing KeywordSearchIngestModule.doInBackGround.pendingMsg=(Pending) -RawText.FileText=File Text -RawText.ResultText=Result Text SearchRunner.doInBackGround.cancelMsg=(Cancelling...) KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl=Files with known types KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead=Files with general strings extracted @@ -224,6 +225,7 @@ KeywordSearchSettings.properties_options.text={0}_Options KeywordSearchSettings.propertiesNSRL.text={0}_NSRL KeywordSearchSettings.propertiesScripts.text={0}_Scripts NoOpenCoreException.err.noOpenSorlCore.msg=No currently open Solr core. 
+# {0} - colelction name Server.deleteCore.exception.msg=Failed to delete Solr colelction {0} Server.exceptionMessage.unableToBackupCollection=Unable to backup Solr collection Server.exceptionMessage.unableToCreateCollection=Unable to create Solr collection @@ -336,6 +338,8 @@ GlobalListsManagementPanel.copyListButton.text=Copy List GlobalListsManagementPanel.renameListButton.text=Edit List Name GlobalEditListPanel.editWordButton.text=Edit Keyword SolrConnectionCheck.Port=Invalid port number. +SolrIndexedText.FileText=File Text +SolrIndexedText.ResultText=Result Text SolrSearch.checkingForLatestIndex.msg=Looking for text index with latest Solr and schema version SolrSearch.complete.msg=Text index successfully opened SolrSearch.creatingNewIndex.msg=Creating new text index diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.form b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.form index 13e535f4ed..c2079a204f 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.form +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.form @@ -565,7 +565,7 @@ - + @@ -579,7 +579,7 @@ - + @@ -622,4 +622,4 @@ - + \ No newline at end of file diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.java index 41d384d559..0a441ab3b1 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedContentPanel.java @@ -396,7 +396,7 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP textSourcePanel.add(jLabel1); textSourcePanel.add(fillerSmall12); - sourceComboBox.setModel(new javax.swing.DefaultComboBoxModel()); + sourceComboBox.setModel(new javax.swing.DefaultComboBoxModel()); sourceComboBox.setMaximumSize(new 
java.awt.Dimension(150, 32767)); sourceComboBox.setMinimumSize(new java.awt.Dimension(150, 25)); sourceComboBox.setPreferredSize(new java.awt.Dimension(150, 25)); @@ -443,7 +443,7 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP private javax.swing.JLabel pagesLabel; private javax.swing.JPopupMenu rightClickMenu; private javax.swing.JMenuItem selectAllMenuItem; - private javax.swing.JComboBox sourceComboBox; + private javax.swing.JComboBox sourceComboBox; private javax.swing.JPanel textSourcePanel; private javax.swing.JPanel zoomPanel; // End of variables declaration//GEN-END:variables @@ -457,10 +457,10 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP * default) * * @param contentName The name of the content to be displayed - * @param sources A list of IndexedText that have different 'views' of - * the content. + * @param sources A list of ExtractedText that have different 'views' of + * the content. */ - final void setSources(String contentName, List sources) { + final void setSources(String contentName, List sources) { this.lastKnownAnchor = null; this.contentName = contentName; setPanelText(null, false); @@ -480,8 +480,8 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP * * @return currently selected Source */ - public IndexedText getSelectedSource() { - return (IndexedText) sourceComboBox.getSelectedItem(); + public ExtractedText getSelectedSource() { + return (ExtractedText) sourceComboBox.getSelectedItem(); } private void setPanelText(String text, boolean detectDirection) { @@ -636,7 +636,7 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP * * @param source the selected source */ - void updateControls(IndexedText source) { + void updateControls(ExtractedText source) { updatePageControls(source); updateSearchControls(source); } @@ -646,7 +646,7 @@ class ExtractedContentPanel extends javax.swing.JPanel implements 
ResizableTextP * * @param source selected source */ - void updatePageControls(IndexedText source) { + void updatePageControls(ExtractedText source) { if (source == null) { enableNextPageControl(false); enablePrevPageControl(false); @@ -668,7 +668,7 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP * * @param source selected source */ - void updateSearchControls(IndexedText source) { + void updateSearchControls(ExtractedText source) { //setup search controls if (source != null && source.isSearchable()) { updateCurrentMatchDisplay(source.currentItem()); @@ -688,7 +688,7 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP * * @param source */ - private void scrollToCurrentHit(final IndexedText source) { + private void scrollToCurrentHit(final ExtractedText source) { if (source == null || !source.isSearchable()) { return; } @@ -704,7 +704,7 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP * be invoked from GUI thread only. */ @NbBundle.Messages("ExtractedContentPanel.setMarkup.panelTxt=Loading text... 
Please wait") - private void setMarkup(IndexedText source) { + private void setMarkup(ExtractedText source) { setPanelText(Bundle.ExtractedContentPanel_setMarkup_panelTxt(), false); new SetMarkupWorker(contentName, source).execute(); } @@ -718,11 +718,11 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP private final String contentName; - private final IndexedText source; + private final ExtractedText source; private ProgressHandle progress; - SetMarkupWorker(String contentName, IndexedText source) { + SetMarkupWorker(String contentName, ExtractedText source) { this.contentName = contentName; this.source = source; } @@ -753,7 +753,7 @@ class ExtractedContentPanel extends javax.swing.JPanel implements ResizableTextP } } catch (InterruptedException | CancellationException | ExecutionException ex) { logger.log(Level.SEVERE, "Error getting marked up text", ex); //NON-NLS - setPanelText(Bundle.IndexedText_errorMessage_errorGettingText(), true); + setPanelText(Bundle.ExtractedText_errorMessage_errorGettingText(), true); } updateControls(source); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java old mode 100755 new mode 100644 index c0e4e5f6c4..cf1a12e19c --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedText.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2023 Basis Technology Corp. + * Copyright 2011-2023 Basis Technology Corp. 
* Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,217 +18,131 @@ */ package org.sleuthkit.autopsy.keywordsearch; -import com.google.common.io.CharSource; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.Reader; -import java.util.HashMap; -import java.util.Map; -import java.util.logging.Level; import org.openide.util.NbBundle; -import org.sleuthkit.autopsy.coreutils.EscapeUtil; -import org.sleuthkit.autopsy.coreutils.Logger; -import org.sleuthkit.autopsy.textextractors.TextExtractor; -import org.sleuthkit.autopsy.textextractors.TextExtractorFactory; -import org.sleuthkit.datamodel.AbstractFile; /** - * A "source" for abstractFile viewer that displays "raw" extracted text for a - * file. Only supports file types for which there are text extractors. Uses - * chunking algorithm used by KeywordSearchIngestModule. The readers used in - * chunking don't have ability to go backwards or to fast forward to a specific - * offset. Therefore there is no way to scroll pages back, or to determine how - * many total pages there are. + * Interface to provide HTML text to display in ExtractedContentViewer. There is + * a SOLR implementation of this that interfaces with SOLR to highlight the + * keyword hits and a version that does not do markup so that you can simply + * view the stored text. There is also an implementation that extracts text from + * a file using one of the TextExtractors. 
*/ -class ExtractedText implements IndexedText { - - private int numPages = 0; - private int currentPage = 0; - private final AbstractFile abstractFile; - private Chunker chunker = null; - private static final Logger logger = Logger.getLogger(ExtractedText.class.getName()); +@NbBundle.Messages({ + "ExtractedText.errorMessage.errorGettingText=Error retrieving text.", + "ExtractedText.warningMessage.knownFile=This file is a known file (based on MD5 hash) and does not have indexed text.", + "ExtractedText.warningMessage.noTextAvailable=No text available for this file." +}) +interface ExtractedText { /** - * Construct a new ExtractedText object for the given abstract file. + * @return text optionally marked up with the subset of HTML that Swing + * components can handle in their setText() method. * - * @param file Abstract file. */ - ExtractedText(AbstractFile file) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException { - this.abstractFile = file; - this.numPages = -1; // We don't know how many pages there are until we reach end of the document - - TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null); - - Map extractedMetadata = new HashMap<>(); - Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata); - - //Get a reader for the content of the given source - BufferedReader reader = new BufferedReader(sourceReader); - this.chunker = new Chunker(reader); - } - - @Override - public int getCurrentPage() { - return this.currentPage; - } - - @Override - public boolean hasNextPage() { - if (chunker.hasNext()) { - return true; - } - return false; - } - - @Override - public boolean hasPreviousPage() { - return false; - } - - @Override - public int nextPage() { - if (!hasNextPage()) { - throw new IllegalStateException( - NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextPage.exception.msg")); - } - ++currentPage; - return currentPage; - } - - @Override - public int 
previousPage() { - if (!hasPreviousPage()) { - throw new IllegalStateException( - NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousPage.exception.msg")); - } - --currentPage; - return currentPage; - } - - @Override - public boolean hasNextItem() { - throw new UnsupportedOperationException( - NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasNextItem.exception.msg")); - } - - @Override - public boolean hasPreviousItem() { - throw new UnsupportedOperationException( - NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasPreviousItem.exception.msg")); - } - - @Override - public int nextItem() { - throw new UnsupportedOperationException( - NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextItem.exception.msg")); - } - - @Override - public int previousItem() { - throw new UnsupportedOperationException( - NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousItem.exception.msg")); - } - - @Override - public int currentItem() { - throw new UnsupportedOperationException( - NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.currentItem.exception.msg")); - } - - @Override - public String getText() { - try { - return getContentText(currentPage); - } catch (Exception ex) { - logger.log(Level.SEVERE, "Couldn't get extracted text", ex); //NON-NLS - } - return Bundle.IndexedText_errorMessage_errorGettingText(); - } - - @NbBundle.Messages({ - "ExtractedText.FileText=File Text"}) - @Override - public String toString() { - return Bundle.ExtractedText_FileText(); - } - - @Override - public boolean isSearchable() { - return false; - } - - @Override - public String getAnchorPrefix() { - return ""; - } - - @Override - public int getNumberHits() { - return 0; - } - - @Override - public int getNumberPages() { - return numPages; - } + String getText(); /** - * Extract text from abstractFile * - * @param currentPage currently used page - * - * @return the extracted text + * @return true if text is marked to 
be searchable */ - private String getContentText(int currentPage) throws TextExtractor.InitReaderException, IOException, Exception { - String indexedText; - if (chunker.hasNext()) { - Chunker.Chunk chunk = chunker.next(); - chunk.setChunkId(currentPage); + boolean isSearchable(); - if (chunker.hasException()) { - logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException()); - throw chunker.getException(); - } + /** + * If searchable text, returns prefix of anchor, otherwise return empty + * string + * + * @return + */ + String getAnchorPrefix(); - indexedText = chunk.toString(); - } else { - return Bundle.IndexedText_errorMessage_errorGettingText(); - } + /** + * if searchable text, returns number of hits found and encoded in the text + * + * @return + */ + int getNumberHits(); - indexedText = EscapeUtil.escapeHtml(indexedText).trim(); - StringBuilder sb = new StringBuilder(indexedText.length() + 20); - sb.append("
").append(indexedText).append("
"); //NON-NLS - return sb.toString(); - } + /** + * @return title of text source + */ + @Override + String toString(); - private Reader getTikaOrTextExtractor(TextExtractor extractor, AbstractFile aFile, - Map extractedMetadata) throws TextExtractor.InitReaderException { + /** + * get number pages/chunks + * + * @return number pages + */ + int getNumberPages(); - Reader fileText = extractor.getReader(); - Reader finalReader; - try { - Map metadata = extractor.getMetadata(); - if (!metadata.isEmpty()) { - // save the metadata map to use after this method is complete. - extractedMetadata.putAll(metadata); - } - CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata); - //Append the metadata to end of the file text - finalReader = CharSource.concat(new CharSource() { - //Wrap fileText reader for concatenation - @Override - public Reader openStream() throws IOException { - return fileText; - } - }, formattedMetadata).openStream(); - } catch (IOException ex) { - logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]", - aFile.getName(), aFile.getId()), ex); - //Just send file text. 
- finalReader = fileText; - } - //divide into chunks - return finalReader; - } + /** + * get the current page number + * + * @return current page number + */ + int getCurrentPage(); + + /** + * Check if has next page + * + * @return true, if next page exists in the source + */ + boolean hasNextPage(); + + /** + * Move to next page + * + * @return the new page number + */ + int nextPage(); + + /** + * Check if has previous page + * + * @return true, if previous page exists in the source + */ + boolean hasPreviousPage(); + + /** + * Move to previous page + * + * @return the new page number + */ + int previousPage(); + + /** + * Check if has next searchable item + * + * @return true, if next item exists in the source + */ + boolean hasNextItem(); + + /** + * Move to next item + * + * @return the new item number + */ + int nextItem(); + + /** + * Check if has previous item + * + * @return true, if previous item exists in the source + */ + boolean hasPreviousItem(); + + /** + * Move to previous item + * + * @return the new item number + */ + int previousItem(); + + /** + * Get the current item number, do not change anything + * + * @return the current item number + */ + int currentItem(); } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java index 3471e4d055..67f260485d 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java @@ -67,7 +67,7 @@ public class ExtractedTextViewer implements TextViewer { private ExtractedContentPanel panel; private volatile Node currentNode = null; - private IndexedText currentSource = null; + private ExtractedText currentSource = null; private FileTypeDetector fileTypeDetector = null; // cache of last 10 solrHasFullyIndexedContent() requests sent to Solr. 
@@ -118,7 +118,7 @@ public class ExtractedTextViewer implements TextViewer { * Assemble a collection of all of the indexed text "sources" for the * node. */ - List sources = new ArrayList<>(); + List sources = new ArrayList<>(); Lookup nodeLookup = node.getLookup(); /** @@ -134,7 +134,7 @@ public class ExtractedTextViewer implements TextViewer { * First, get text with highlighted hits if this node is for a search * result. */ - IndexedText highlightedHitText = null; + ExtractedText highlightedHitText = null; if (adHocQueryResult != null) { /* * The node is an ad hoc search result node. @@ -172,7 +172,7 @@ public class ExtractedTextViewer implements TextViewer { * Next, add the "raw" (not highlighted) text, if any, for any file * associated with the node. */ - IndexedText rawContentText = null; + ExtractedText rawContentText = null; if (file != null) { // see if Solr has fully indexed this file @@ -184,7 +184,7 @@ public class ExtractedTextViewer implements TextViewer { // see if it's a file type for which we can extract text if (ableToExtractTextFromFile(file)) { try { - rawContentText = new ExtractedText(file); + rawContentText = new FileReaderExtractedText(file); sources.add(rawContentText); } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) { // do nothing @@ -209,7 +209,7 @@ public class ExtractedTextViewer implements TextViewer { * Finally, add the "raw" (not highlighted) text, if any, for any * artifact associated with the node. */ - IndexedText rawArtifactText = null; + ExtractedText rawArtifactText = null; try { rawArtifactText = getRawArtifactText(artifact); if (rawArtifactText != null) { @@ -229,7 +229,7 @@ public class ExtractedTextViewer implements TextViewer { } // Push the text sources into the panel. 
- for (IndexedText source : sources) { + for (ExtractedText source : sources) { int currentPage = source.getCurrentPage(); if (currentPage == 0 && source.hasNextPage()) { source.nextPage(); @@ -245,8 +245,8 @@ public class ExtractedTextViewer implements TextViewer { } - private IndexedText getRawArtifactText(BlackboardArtifact artifact) throws TskCoreException, NoCurrentCaseException { - IndexedText rawArtifactText = null; + private ExtractedText getRawArtifactText(BlackboardArtifact artifact) throws TskCoreException, NoCurrentCaseException { + ExtractedText rawArtifactText = null; if (null != artifact) { /* * For keyword hit artifacts, add the text of the artifact that hit, @@ -273,7 +273,7 @@ public class ExtractedTextViewer implements TextViewer { return rawArtifactText; } - static private IndexedText getAccountsText(Content content, Lookup nodeLookup) throws TskCoreException { + static private ExtractedText getAccountsText(Content content, Lookup nodeLookup) throws TskCoreException { /* * get all the credit card artifacts */ @@ -287,7 +287,7 @@ public class ExtractedTextViewer implements TextViewer { } private void scrollToCurrentHit() { - final IndexedText source = panel.getSelectedSource(); + final ExtractedText source = panel.getSelectedSource(); if (source == null || !source.isSearchable()) { return; } @@ -431,10 +431,10 @@ public class ExtractedTextViewer implements TextViewer { * panel hasn't been created yet) * * @param contentName The name of the content to be displayed - * @param sources A list of IndexedText that have different 'views' of - * the content. + * @param sources A list of ExtractedText that have different 'views' of + * the content. 
*/ - private void setPanel(String contentName, List sources) { + private void setPanel(String contentName, List sources) { if (panel != null) { panel.setSources(contentName, sources); } @@ -525,7 +525,7 @@ public class ExtractedTextViewer implements TextViewer { @Override public void actionPerformed(ActionEvent e) { - IndexedText source = panel.getSelectedSource(); + ExtractedText source = panel.getSelectedSource(); if (source == null) { // reset panel.updateControls(null); @@ -568,7 +568,7 @@ public class ExtractedTextViewer implements TextViewer { @Override public void actionPerformed(ActionEvent e) { - IndexedText source = panel.getSelectedSource(); + ExtractedText source = panel.getSelectedSource(); final boolean hasPreviousItem = source.hasPreviousItem(); final boolean hasPreviousPage = source.hasPreviousPage(); int indexVal; diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileReaderExtractedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileReaderExtractedText.java new file mode 100755 index 0000000000..0025be94d8 --- /dev/null +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileReaderExtractedText.java @@ -0,0 +1,234 @@ +/* + * Autopsy Forensic Browser + * + * Copyright 2023 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.sleuthkit.autopsy.keywordsearch; + +import com.google.common.io.CharSource; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Reader; +import java.util.HashMap; +import java.util.Map; +import java.util.logging.Level; +import org.openide.util.NbBundle; +import org.sleuthkit.autopsy.coreutils.EscapeUtil; +import org.sleuthkit.autopsy.coreutils.Logger; +import org.sleuthkit.autopsy.textextractors.TextExtractor; +import org.sleuthkit.autopsy.textextractors.TextExtractorFactory; +import org.sleuthkit.datamodel.AbstractFile; + +/** + * A "source" for abstractFile viewer that displays "raw" extracted text for a + * file. Only supports file types for which there are text extractors. Uses + * chunking algorithm used by KeywordSearchIngestModule. The readers used in + * chunking don't have ability to go backwards or to fast forward to a specific + * offset. Therefore there is no way to scroll pages back, or to determine how + * many total pages there are. + */ +class FileReaderExtractedText implements ExtractedText { + + private int numPages = 0; + private int currentPage = 0; + private final AbstractFile abstractFile; + private Chunker chunker = null; + private static final Logger logger = Logger.getLogger(FileReaderExtractedText.class.getName()); + + /** + * Construct a new FileReaderExtractedText object for the given abstract file. + * + * @param file Abstract file. 
+ */ + FileReaderExtractedText(AbstractFile file) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException { + this.abstractFile = file; + this.numPages = -1; // We don't know how many pages there are until we reach end of the document + + TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null); + + Map extractedMetadata = new HashMap<>(); + Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata); + + //Get a reader for the content of the given source + BufferedReader reader = new BufferedReader(sourceReader); + this.chunker = new Chunker(reader); + } + + @Override + public int getCurrentPage() { + return this.currentPage; + } + + @Override + public boolean hasNextPage() { + if (chunker.hasNext()) { + return true; + } + return false; + } + + @Override + public boolean hasPreviousPage() { + return false; + } + + @Override + public int nextPage() { + if (!hasNextPage()) { + throw new IllegalStateException( + NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextPage.exception.msg")); + } + ++currentPage; + return currentPage; + } + + @Override + public int previousPage() { + if (!hasPreviousPage()) { + throw new IllegalStateException( + NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousPage.exception.msg")); + } + --currentPage; + return currentPage; + } + + @Override + public boolean hasNextItem() { + throw new UnsupportedOperationException( + NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasNextItem.exception.msg")); + } + + @Override + public boolean hasPreviousItem() { + throw new UnsupportedOperationException( + NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasPreviousItem.exception.msg")); + } + + @Override + public int nextItem() { + throw new UnsupportedOperationException( + NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextItem.exception.msg")); + } + + @Override + public int previousItem() { + throw 
new UnsupportedOperationException( + NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousItem.exception.msg")); + } + + @Override + public int currentItem() { + throw new UnsupportedOperationException( + NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.currentItem.exception.msg")); + } + + @Override + public String getText() { + try { + return getContentText(currentPage); + } catch (Exception ex) { + logger.log(Level.SEVERE, "Couldn't get extracted text", ex); //NON-NLS + } + return Bundle.ExtractedText_errorMessage_errorGettingText(); + } + + @NbBundle.Messages({ + "ExtractedText.FileText=File Text"}) + @Override + public String toString() { + return Bundle.ExtractedText_FileText(); + } + + @Override + public boolean isSearchable() { + return false; + } + + @Override + public String getAnchorPrefix() { + return ""; + } + + @Override + public int getNumberHits() { + return 0; + } + + @Override + public int getNumberPages() { + return numPages; + } + + /** + * Extract text from abstractFile + * + * @param currentPage currently used page + * + * @return the extracted text + */ + private String getContentText(int currentPage) throws TextExtractor.InitReaderException, IOException, Exception { + String indexedText; + if (chunker.hasNext()) { + Chunker.Chunk chunk = chunker.next(); + chunk.setChunkId(currentPage); + + if (chunker.hasException()) { + logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException()); + throw chunker.getException(); + } + + indexedText = chunk.toString(); + } else { + return Bundle.ExtractedText_errorMessage_errorGettingText(); + } + + indexedText = EscapeUtil.escapeHtml(indexedText).trim(); + StringBuilder sb = new StringBuilder(indexedText.length() + 20); + sb.append("
").append(indexedText).append("
"); //NON-NLS + return sb.toString(); + } + + private Reader getTikaOrTextExtractor(TextExtractor extractor, AbstractFile aFile, + Map extractedMetadata) throws TextExtractor.InitReaderException { + + Reader fileText = extractor.getReader(); + Reader finalReader; + try { + Map metadata = extractor.getMetadata(); + if (!metadata.isEmpty()) { + // save the metadata map to use after this method is complete. + extractedMetadata.putAll(metadata); + } + CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata); + //Append the metadata to end of the file text + finalReader = CharSource.concat(new CharSource() { + //Wrap fileText reader for concatenation + @Override + public Reader openStream() throws IOException { + return fileText; + } + }, formattedMetadata).openStream(); + } catch (IOException ex) { + logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]", + aFile.getName(), aFile.getId()), ex); + //Just send file text. + finalReader = fileText; + } + //divide into chunks + return finalReader; + } + +} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java index dedc81e35e..43d6b5417c 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HighlightedText.java @@ -52,7 +52,7 @@ import org.sleuthkit.datamodel.TskCoreException; * Highlights hits for a given document. Knows about pages and such for the * content viewer. */ -class HighlightedText implements IndexedText { +class HighlightedText implements ExtractedText { private static final Logger logger = Logger.getLogger(HighlightedText.class.getName()); @@ -475,7 +475,7 @@ class HighlightedText implements IndexedText { return "
" + highlightedContent + "
"; //NON-NLS } catch (TskCoreException | KeywordSearchModuleException | NoOpenCoreException ex) { logger.log(Level.SEVERE, "Error getting highlighted text for Solr doc id " + solrObjectId + ", chunkID " + chunkID + ", highlight query: " + highlightField, ex); //NON-NLS - return Bundle.IndexedText_errorMessage_errorGettingText(); + return Bundle.ExtractedText_errorMessage_errorGettingText(); } } @@ -519,7 +519,7 @@ class HighlightedText implements IndexedText { */ static String attemptManualHighlighting(SolrDocumentList solrDocumentList, String highlightField, Collection keywords) { if (solrDocumentList.isEmpty()) { - return Bundle.IndexedText_errorMessage_errorGettingText(); + return Bundle.ExtractedText_errorMessage_errorGettingText(); } // It doesn't make sense for there to be more than a single document in diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexedText.java deleted file mode 100644 index 880d4c8a0c..0000000000 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/IndexedText.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Autopsy Forensic Browser - * - * Copyright 2011-2023 Basis Technology Corp. - * Contact: carrier sleuthkit org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.sleuthkit.autopsy.keywordsearch; - -import org.openide.util.NbBundle; - -/** - * Interface to provide HTML text to display in ExtractedContentViewer. 
There is - * a SOLR implementation of this that interfaces with SOLR to highlight the - * keyword hits and a version that does not do markup so that you can simply - * view the stored text. There is also an implementation that extracts text from - * a file using one os TextExtractors. - */ -@NbBundle.Messages({ - "IndexedText.errorMessage.errorGettingText=Error retrieving text.", - "IndexedText.warningMessage.knownFile=This file is a known file (based on MD5 hash) and does not have indexed text.", - "IndexedText.warningMessage.noTextAvailable=No text available for this file." -}) -interface IndexedText { - - /** - * @return text optionally marked up with the subset of HTML that Swing - * components can handle in their setText() method. - * - */ - String getText(); - - /** - * - * @return true if text is marked to be searchable - */ - boolean isSearchable(); - - /** - * If searchable text, returns prefix of anchor, otherwise return empty - * string - * - * @return - */ - String getAnchorPrefix(); - - /** - * if searchable text, returns number of hits found and encoded in the text - * - * @return - */ - int getNumberHits(); - - /** - * @return title of text source - */ - @Override - String toString(); - - /** - * get number pages/chunks - * - * @return number pages - */ - int getNumberPages(); - - /** - * get the current page number - * - * @return current page number - */ - int getCurrentPage(); - - /** - * Check if has next page - * - * @return true, if next page exists in the source - */ - boolean hasNextPage(); - - /** - * Move to next page - * - * @return the new page number - */ - int nextPage(); - - /** - * Check if has previous page - * - * @return true, if previous page exists in the source - */ - boolean hasPreviousPage(); - - /** - * Move to previous page - * - * @return the new page number - */ - int previousPage(); - - /** - * Check if has next searchable item - * - * @return true, if next item exists in the source - */ - boolean hasNextItem(); - - /** - 
* Move to next item - * - * @return the new item number - */ - int nextItem(); - - /** - * Check if has previous item - * - * @return true, if previous item exists in the source - */ - boolean hasPreviousItem(); - - /** - * Move to previous item - * - * @return the new item number - */ - int previousItem(); - - /** - * Get the current item number, do not change anything - * - * @return the current item number - */ - int currentItem(); - -} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java index a500c5ef48..32d6fe8fc9 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/SolrIndexedText.java @@ -32,7 +32,7 @@ import org.sleuthkit.datamodel.TskData; * A "source" for the extracted content viewer that displays "raw" (not * highlighted) Solr indexed text for a file or an artifact. */ -class SolrIndexedText implements IndexedText { +class SolrIndexedText implements ExtractedText { private int numPages = 0; private int currentPage = 0; @@ -152,7 +152,7 @@ class SolrIndexedText implements IndexedText { } catch (SolrServerException | NoOpenCoreException ex) { logger.log(Level.SEVERE, "Couldn't get extracted text", ex); //NON-NLS } - return Bundle.IndexedText_errorMessage_errorGettingText(); + return Bundle.ExtractedText_errorMessage_errorGettingText(); } @NbBundle.Messages({ @@ -236,11 +236,11 @@ class SolrIndexedText implements IndexedText { //we know it's AbstractFile, but do quick check to make sure if we index other objects in future boolean isKnown = TskData.FileKnown.KNOWN.equals(((AbstractFile) content).getKnown()); if (isKnown && KeywordSearchSettings.getSkipKnown()) { - msg = Bundle.IndexedText_warningMessage_knownFile(); + msg = Bundle.ExtractedText_warningMessage_knownFile(); } } if (msg == null) { - msg = Bundle.IndexedText_warningMessage_noTextAvailable(); 
+ msg = Bundle.ExtractedText_warningMessage_noTextAvailable(); } return msg; } @@ -250,12 +250,12 @@ class SolrIndexedText implements IndexedText { String indexedText = solrServer.getSolrContent(this.objectId, chunkId); if (indexedText == null) { if (content instanceof AbstractFile) { - return Bundle.IndexedText_errorMessage_errorGettingText(); + return Bundle.ExtractedText_errorMessage_errorGettingText(); } else { - return Bundle.IndexedText_warningMessage_noTextAvailable(); + return Bundle.ExtractedText_warningMessage_noTextAvailable(); } } else if (indexedText.isEmpty()) { - return Bundle.IndexedText_warningMessage_noTextAvailable(); + return Bundle.ExtractedText_warningMessage_noTextAvailable(); } indexedText = EscapeUtil.escapeHtml(indexedText).trim(); @@ -276,7 +276,7 @@ class SolrIndexedText implements IndexedText { private String getArtifactText() throws NoOpenCoreException, SolrServerException { String indexedText = KeywordSearch.getServer().getSolrContent(this.objectId, 1); if (indexedText == null || indexedText.isEmpty()) { - return Bundle.IndexedText_errorMessage_errorGettingText(); + return Bundle.ExtractedText_errorMessage_errorGettingText(); } indexedText = EscapeUtil.escapeHtml(indexedText).trim(); From 3fe6482b42845483e4fddd48a97ba51f7306c3b7 Mon Sep 17 00:00:00 2001 From: "eugene.livis" Date: Wed, 7 Jun 2023 15:50:41 -0400 Subject: [PATCH 8/9] Clearing solr cache when case closes --- .../autopsy/keywordsearch/ExtractedTextViewer.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java index 67f260485d..fdcf44a7a2 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java @@ -22,8 +22,10 @@ import java.awt.Component; import java.awt.Cursor; import 
java.awt.event.ActionEvent; import java.awt.event.ActionListener; +import java.beans.PropertyChangeEvent; import java.util.ArrayList; import java.util.Collection; +import java.util.EnumSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -72,7 +74,6 @@ public class ExtractedTextViewer implements TextViewer { // cache of last 10 solrHasFullyIndexedContent() requests sent to Solr. private SolrIsFullyIndexedCache solrCache = null; - // ELTODO clear the cache when case closes /** * Constructs a text viewer that displays the indexed text associated with a @@ -88,6 +89,10 @@ public class ExtractedTextViewer implements TextViewer { } solrCache = new SolrIsFullyIndexedCache(); + // clear the cache when case opens or closes + Case.addEventTypeSubscriber(EnumSet.of(Case.Events.CURRENT_CASE), (PropertyChangeEvent evt) -> { + solrCache.clearCache(); + }); } /** From 816f166af1d0a597025a950c7886f32bc255a417 Mon Sep 17 00:00:00 2001 From: "eugene.livis" Date: Wed, 7 Jun 2023 16:28:00 -0400 Subject: [PATCH 9/9] Improved file reader detection --- .../autopsy/keywordsearch/ExtractedTextViewer.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java index fdcf44a7a2..6c5499aff5 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ExtractedTextViewer.java @@ -520,6 +520,15 @@ public class ExtractedTextViewer implements TextViewer { return false; } + // Often times there is an exception when trying to initialize a reader, + // thus making that specific file "unsupported". The only way to identify + // this situation is to initialize the reader. 
+ try { + FileReaderExtractedText tmp = new FileReaderExtractedText(file); + } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) { + return false; + } + return true; }