From c9240668130381012cb7454f4f0b52b97c99219a Mon Sep 17 00:00:00 2001 From: esaunders Date: Tue, 9 Jan 2018 17:49:24 -0500 Subject: [PATCH 1/5] Update file extractor and Tika text extractor to use new Tika SAX parsers in order to reduce the amount of memory required to process DOCX and PPTX files. --- .../MSOfficeEmbeddedContentExtractor.java | 13 ++++++++----- .../keywordsearch/TikaTextExtractor.java | 18 +++++++++++++++++- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/MSOfficeEmbeddedContentExtractor.java b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/MSOfficeEmbeddedContentExtractor.java index 4a3a20da07..7fca6a896c 100644 --- a/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/MSOfficeEmbeddedContentExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/modules/embeddedfileextractor/MSOfficeEmbeddedContentExtractor.java @@ -51,6 +51,7 @@ import org.apache.tika.mime.MimeTypeException; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.sax.BodyContentHandler; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.casemodule.Case; @@ -231,11 +232,13 @@ class MSOfficeEmbeddedContentExtractor { // write limit (which defaults to 100,000 characters. ContentHandler contentHandler = new BodyContentHandler(-1); - // TODO: this will be needed once we upgrade to Tika 1.16 or later. - // OfficeParserConfig officeParserConfig = new OfficeParserConfig(); - // officeParserConfig.setUseSAXPptxExtractor(true); - // officeParserConfig.setUseSAXDocxExtractor(true); - // parseContext.set(OfficeParserConfig.class, officeParserConfig); + // Use the more memory efficient Tika SAX parsers for DOCX and + // PPTX files (it already uses SAX for XLSX). 
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setUseSAXPptxExtractor(true); + officeParserConfig.setUseSAXDocxExtractor(true); + parseContext.set(OfficeParserConfig.class, officeParserConfig); + EmbeddedDocumentExtractor extractor = new EmbeddedContentExtractor(parseContext); parseContext.set(EmbeddedDocumentExtractor.class, extractor); ReadContentInputStream stream = new ReadContentInputStream(abstractFile); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java index fcbace158d..4b9c3b17cb 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java @@ -33,7 +33,11 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.tika.Tika; import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParsingReader; +import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.datamodel.AbstractFile; @@ -48,6 +52,8 @@ class TikaTextExtractor extends FileTextExtractor { static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName()); private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor(); + private final AutoDetectParser parser = new AutoDetectParser(); + private static final List TIKA_SUPPORTED_TYPES = new Tika().getParser().getSupportedTypes(new ParseContext()) .stream() @@ -64,8 +70,18 @@ class TikaTextExtractor extends FileTextExtractor { ReadContentInputStream stream = new ReadContentInputStream(sourceFile); Metadata metadata = new Metadata(); + ParseContext parseContext = new 
ParseContext(); + parseContext.set(Parser.class, parser); + + // Use the more memory efficient Tika SAX parsers for DOCX and + // PPTX files (it already uses SAX for XLSX). + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setUseSAXPptxExtractor(true); + officeParserConfig.setUseSAXDocxExtractor(true); + parseContext.set(OfficeParserConfig.class, officeParserConfig); + //Parse the file in a task, a convenient way to have a timeout... - final Future future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata)); + final Future future = tikaParseExecutor.submit(() -> new ParsingReader(parser, stream, metadata, parseContext)); try { final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS); From 44496e611aca92e01f38e37ccb5b837ffbe3439b Mon Sep 17 00:00:00 2001 From: "U-BASIS\\dgrove" Date: Thu, 11 Jan 2018 02:49:16 -0500 Subject: [PATCH 2/5] Added node for displaying no results message. --- .../KeywordSearchResultFactory.java | 84 +++++++++++-------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java index a48b40e756..4c36ce9b78 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2017 Basis Technology Corp. + * Copyright 2011-2018 Basis Technology Corp. 
* Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -39,11 +39,13 @@ import org.openide.nodes.Node; import org.openide.util.NbBundle; import org.openide.util.lookup.Lookups; import org.sleuthkit.autopsy.casemodule.Case; +import org.sleuthkit.autopsy.corecomponents.TableFilterNode; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil; import org.sleuthkit.autopsy.datamodel.AbstractAbstractFileNode; import static org.sleuthkit.autopsy.datamodel.AbstractAbstractFileNode.AbstractFilePropertyType.LOCATION; import org.sleuthkit.autopsy.datamodel.AbstractFsContentNode; +import org.sleuthkit.autopsy.datamodel.EmptyNode; import org.sleuthkit.autopsy.datamodel.KeyValue; import org.sleuthkit.autopsy.datamodel.KeyValueNode; import org.sleuthkit.autopsy.keywordsearch.KeywordSearchResultFactory.KeyValueQueryContent; @@ -63,21 +65,21 @@ import org.sleuthkit.datamodel.TskCoreException; * Responsible for assembling nodes and columns in the right way and performing * lazy queries as needed. 
*/ -class KeywordSearchResultFactory extends ChildFactory { +class KeywordSearchResultFactory extends ChildFactory { - private static final Logger logger = Logger.getLogger(KeywordSearchResultFactory.class.getName()); + private static final Logger LOGGER = Logger.getLogger(KeywordSearchResultFactory.class.getName()); //common properties (superset of all Node properties) to be displayed as columns - static final List COMMON_PROPERTIES = - Stream.concat( + static final List COMMON_PROPERTIES + = Stream.concat( Stream.of( TSK_KEYWORD, TSK_KEYWORD_REGEXP, TSK_KEYWORD_PREVIEW) - .map(BlackboardAttribute.ATTRIBUTE_TYPE::getDisplayName), + .map(BlackboardAttribute.ATTRIBUTE_TYPE::getDisplayName), Arrays.stream(AbstractAbstractFileNode.AbstractFilePropertyType.values()) - .map(Object::toString)) - .collect(Collectors.toList()); + .map(Object::toString)) + .collect(Collectors.toList()); private final Collection queryRequests; @@ -93,7 +95,7 @@ class KeywordSearchResultFactory extends ChildFactory { * @param toPopulate property set map for a Node */ @Override - protected boolean createKeys(List toPopulate) { + protected boolean createKeys(List toPopulate) { for (QueryRequest queryRequest : queryRequests) { /** @@ -130,7 +132,7 @@ class KeywordSearchResultFactory extends ChildFactory { * @return */ @NbBundle.Messages({"KeywordSearchResultFactory.query.exception.msg=Could not perform the query "}) - private boolean createFlatKeys(KeywordSearchQuery queryRequest, List toPopulate) { + private boolean createFlatKeys(KeywordSearchQuery queryRequest, List toPopulate) { /** * Execute the requested query. 
@@ -139,15 +141,15 @@ class KeywordSearchResultFactory extends ChildFactory { try { queryResults = queryRequest.performQuery(); } catch (KeywordSearchModuleException | NoOpenCoreException ex) { - logger.log(Level.SEVERE, "Could not perform the query " + queryRequest.getQueryString(), ex); //NON-NLS + LOGGER.log(Level.SEVERE, "Could not perform the query " + queryRequest.getQueryString(), ex); //NON-NLS MessageNotifyUtil.Notify.error(Bundle.KeywordSearchResultFactory_query_exception_msg() + queryRequest.getQueryString(), ex.getCause().getMessage()); return false; } - SleuthkitCase tskCase = null; + SleuthkitCase tskCase; try { tskCase = Case.getCurrentCase().getSleuthkitCase(); } catch (IllegalStateException ex) { - logger.log(Level.SEVERE, "There was no case open.", ex); //NON-NLS + LOGGER.log(Level.SEVERE, "There was no case open.", ex); //NON-NLS return false; } @@ -159,16 +161,16 @@ class KeywordSearchResultFactory extends ChildFactory { * Get file properties. */ Map properties = new LinkedHashMap<>(); - Content content = null; - String contentName = ""; + Content content; + String contentName; try { content = tskCase.getContentById(hit.getContentID()); if (content == null) { - logger.log(Level.SEVERE, "There was a error getting content by id."); //NON-NLS + LOGGER.log(Level.SEVERE, "There was a error getting content by id."); //NON-NLS return false; } } catch (TskCoreException ex) { - logger.log(Level.SEVERE, "There was a error getting content by id.", ex); //NON-NLS + LOGGER.log(Level.SEVERE, "There was a error getting content by id.", ex); //NON-NLS return false; } @@ -191,7 +193,7 @@ class KeywordSearchResultFactory extends ChildFactory { try { hitName = tskCase.getBlackboardArtifact(hit.getArtifactID().get()).getDisplayName() + " Artifact"; //NON-NLS } catch (TskCoreException ex) { - logger.log(Level.SEVERE, "Error getting blckboard artifact by id", ex); + LOGGER.log(Level.SEVERE, "Error getting blckboard artifact by id", ex); return false; } } else { @@ 
-202,9 +204,13 @@ class KeywordSearchResultFactory extends ChildFactory { } - // Add all the nodes to toPopulate at once. Minimizes node creation - // EDT threads, which can slow and/or hang the UI on large queries. - toPopulate.addAll(tempList); + if (hitNumber == 0) { + toPopulate.add(new KeyValue("This KeyValue Is Empty", 0)); + } else { + // Add all the nodes to toPopulate at once. Minimizes node creation + // EDT threads, which can slow and/or hang the UI on large queries. + toPopulate.addAll(tempList); + } //write to bb //cannot reuse snippet in BlackboardResultWriter @@ -239,15 +245,25 @@ class KeywordSearchResultFactory extends ChildFactory { return hits.values(); } + @NbBundle.Messages({"KeywordSearchResultFactory.createNodeForKey.noMatchessFound.text=No matches found."}) @Override - protected Node createNodeForKey(KeyValueQueryContent key) { - final Content content = key.getContent(); - QueryResults hits = key.getHits(); + protected Node createNodeForKey(KeyValue key) { + Node resultNode; - Node kvNode = new KeyValueNode(key, Children.LEAF, Lookups.singleton(content)); + if (key instanceof KeyValueQueryContent) { + final Content content = ((KeyValueQueryContent) key).getContent(); + QueryResults hits = ((KeyValueQueryContent) key).getHits(); - //wrap in KeywordSearchFilterNode for the markup content, might need to override FilterNode for more customization - return new KeywordSearchFilterNode(hits, kvNode); + Node kvNode = new KeyValueNode(key, Children.LEAF, Lookups.singleton(content)); + + //wrap in KeywordSearchFilterNode for the markup content, might need to override FilterNode for more customization + resultNode = new KeywordSearchFilterNode(hits, kvNode); + } else { + resultNode = new EmptyNode("This Node Is Empty"); + resultNode.setDisplayName(NbBundle.getMessage(this.getClass(), "KeywordSearchResultFactory.createNodeForKey.noMatchessFound.text")); + } + + return resultNode; } @@ -308,7 +324,7 @@ class KeywordSearchResultFactory extends 
ChildFactory { */ static class BlackboardResultWriter extends SwingWorker { - private static final List writers = new ArrayList<>(); + private static final List WRITERS = new ArrayList<>(); private ProgressHandle progress; private final KeywordSearchQuery query; private final QueryResults hits; @@ -343,24 +359,24 @@ class KeywordSearchResultFactory extends ChildFactory { try { get(); } catch (InterruptedException | CancellationException ex) { - logger.log(Level.WARNING, "User cancelled writing of ad hoc search query results for '{0}' to the blackboard", query.getQueryString()); //NON-NLS + LOGGER.log(Level.WARNING, "User cancelled writing of ad hoc search query results for '{0}' to the blackboard", query.getQueryString()); //NON-NLS } catch (ExecutionException ex) { - logger.log(Level.SEVERE, "Error writing of ad hoc search query results for " + query.getQueryString() + " to the blackboard", ex); //NON-NLS + LOGGER.log(Level.SEVERE, "Error writing of ad hoc search query results for " + query.getQueryString() + " to the blackboard", ex); //NON-NLS } } private static synchronized void registerWriter(BlackboardResultWriter writer) { - writers.add(writer); + WRITERS.add(writer); } private static synchronized void deregisterWriter(BlackboardResultWriter writer) { - writers.remove(writer); + WRITERS.remove(writer); } static synchronized void stopAllWriters() { - for (BlackboardResultWriter w : writers) { + for (BlackboardResultWriter w : WRITERS) { w.cancel(true); - writers.remove(w); + WRITERS.remove(w); } } } From 3cce43e95d9820a4adde842bde2b0c3490972d9a Mon Sep 17 00:00:00 2001 From: "U-BASIS\\dgrove" Date: Thu, 11 Jan 2018 03:20:14 -0500 Subject: [PATCH 3/5] Fixed message. 
--- .../autopsy/keywordsearch/KeywordSearchResultFactory.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java index 4c36ce9b78..d0ea6b8fff 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java @@ -245,7 +245,7 @@ class KeywordSearchResultFactory extends ChildFactory { return hits.values(); } - @NbBundle.Messages({"KeywordSearchResultFactory.createNodeForKey.noMatchessFound.text=No matches found."}) + @NbBundle.Messages({"KeywordSearchResultFactory.createNodeForKey.noResultsFound.text=No results found."}) @Override protected Node createNodeForKey(KeyValue key) { Node resultNode; @@ -260,7 +260,7 @@ class KeywordSearchResultFactory extends ChildFactory { resultNode = new KeywordSearchFilterNode(hits, kvNode); } else { resultNode = new EmptyNode("This Node Is Empty"); - resultNode.setDisplayName(NbBundle.getMessage(this.getClass(), "KeywordSearchResultFactory.createNodeForKey.noMatchessFound.text")); + resultNode.setDisplayName(NbBundle.getMessage(this.getClass(), "KeywordSearchResultFactory.createNodeForKey.noResultsFound.text")); } return resultNode; From 3ec32ec4cfeab757f8c11f33cd1ad27ec2071d08 Mon Sep 17 00:00:00 2001 From: "U-BASIS\\dgrove" Date: Thu, 11 Jan 2018 10:25:53 -0500 Subject: [PATCH 4/5] Cleanup. 
--- .../autopsy/keywordsearch/KeywordSearchResultFactory.java | 1 - 1 file changed, 1 deletion(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java index d0ea6b8fff..a8bc995eae 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchResultFactory.java @@ -39,7 +39,6 @@ import org.openide.nodes.Node; import org.openide.util.NbBundle; import org.openide.util.lookup.Lookups; import org.sleuthkit.autopsy.casemodule.Case; -import org.sleuthkit.autopsy.corecomponents.TableFilterNode; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil; import org.sleuthkit.autopsy.datamodel.AbstractAbstractFileNode; From 6a976324ee9a8b22d9536e93289d5c26fb57575f Mon Sep 17 00:00:00 2001 From: William Schaefer Date: Fri, 12 Jan 2018 14:15:43 -0500 Subject: [PATCH 5/5] Fix releasing of correct lock for processing raw data image --- .../sleuthkit/autopsy/datasourceprocessors/AddRawImageTask.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Core/src/org/sleuthkit/autopsy/datasourceprocessors/AddRawImageTask.java b/Core/src/org/sleuthkit/autopsy/datasourceprocessors/AddRawImageTask.java index 14a3dc9e2f..3d576eebf0 100644 --- a/Core/src/org/sleuthkit/autopsy/datasourceprocessors/AddRawImageTask.java +++ b/Core/src/org/sleuthkit/autopsy/datasourceprocessors/AddRawImageTask.java @@ -178,7 +178,7 @@ final class AddRawImageTask implements Runnable { logger.log(Level.SEVERE, errorMessage, ex); criticalErrorOccurred = true; } finally { - caseDatabase.releaseSingleUserCaseReadLock(); + caseDatabase.releaseSingleUserCaseWriteLock(); } }