diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java index 19a42d3a3b..ae7071997b 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java @@ -25,7 +25,6 @@ import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.TreeMap; import java.util.logging.Level; import net.htmlparser.jericho.Attributes; import net.htmlparser.jericho.Config; @@ -86,6 +85,12 @@ final class HtmlTextExtractor implements TextExtractor { && file.getSize() <= MAX_SIZE; } + /** + * Get the metadata as a key -> value map. HTML metadata will include + * scripts, links, images, comments, and misc attributes. + * + * @return Map containing metadata key -> value pairs. + */ @Override public Map getMetadata() { Map metadataMap = new HashMap<>(); diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java index f86bd2c1d8..1260c1bf28 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java @@ -33,7 +33,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; -import java.util.TreeMap; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -409,7 +408,7 @@ final class TikaTextExtractor implements TextExtractor { /** * Get the content metdata * - * @return Metadata name -> value + * @return Metadata as a name -> value map */ @Override public Map getMetadata() { diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java index ece7c1d25c..31ab471d24 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java @@ -28,9 +28,6 @@ import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.tika.metadata.Metadata; -import org.openide.util.Exceptions; import org.openide.util.Lookup; import org.openide.util.NbBundle; import org.openide.util.NbBundle.Messages; @@ -477,13 +474,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule { * * @param aFile file to extract strings from, divide into chunks and * index - * @param detectedFormat mime-type detected, or null if none detected * * @return true if the file was text_ingested, false otherwise * * @throws IngesterException exception thrown if indexing failed */ - private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException { + private boolean extractTextAndIndex(AbstractFile aFile) throws IngesterException { ImageConfig imageConfig = new ImageConfig(); imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption()); ProcessTerminator terminator = () -> context.fileIngestIsCancelled(); @@ -497,8 +493,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule { try { Map metadata = extractor.getMetadata(); CharSource formattedMetadata = getMetaDataCharSource(metadata); + //Append the metadata to end of the file text finalReader = CharSource.concat(new CharSource() { - //Wrap the TikaReader into a CharSource for concatenation + //Wrap fileText reader for concatenation @Override public Reader openStream() throws IOException { return fileText; @@ -518,11 +515,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule { } /** - * Format the + * Pretty print the text extractor metadata. * - * @param metadata The Metadata to wrap as a CharSource + * @param metadata The Metadata map to wrap as a CharSource * - * @return A CharSource for the given MetaData + * @return A CharSource for the given Metadata */ private CharSource getMetaDataCharSource(Map metadata) { return CharSource.wrap(new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n") @@ -633,7 +630,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule { extractStringsAndIndex(aFile); return; } - if (!extractTextAndIndex(aFile, fileType)) { + if (!extractTextAndIndex(aFile)) { // Text extractor not found for file. Extract string only. putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT); } else {