Cleaned up code and added comments

2025-07-12 16:06:15 +00:00 · 2019-05-07 09:06:53 -04:00 · 2019-05-07 09:06:53 -04:00 · 7669590455
commit 7669590455
parent 38049bc73c
3 changed files with 14 additions and 13 deletions
--- a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java
@ -25,7 +25,6 @@ import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.TreeMap;
 import java.util.logging.Level;
 import net.htmlparser.jericho.Attributes;
 import net.htmlparser.jericho.Config;
@ -86,6 +85,12 @@ final class HtmlTextExtractor implements TextExtractor {
                && file.getSize() <= MAX_SIZE;
    }

+    /**
+     * Get the metadata as a key -> value map. HTML metadata will include
+     * scripts, links, images, comments, and misc attributes.
+     * 
+     * @return Map containing metadata key -> value pairs.
+     */
    @Override
    public Map<String, String> getMetadata() {
        Map<String, String> metadataMap = new HashMap<>();
--- a/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
+++ b/Core/src/org/sleuthkit/autopsy/textextractors/TikaTextExtractor.java
@ -33,7 +33,6 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
-import java.util.TreeMap;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
@ -409,7 +408,7 @@ final class TikaTextExtractor implements TextExtractor {
    /**
     * Get the content metadata
     * 
-     * @return Metadata name -> value
+     * @return Metadata as a name -> value map
     */
    @Override
    public Map<String, String> getMetadata() {
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchIngestModule.java
@ -28,9 +28,6 @@ import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.logging.Level;
 import java.util.stream.Collectors;
-import java.util.stream.Stream;
-import org.apache.tika.metadata.Metadata;
-import org.openide.util.Exceptions;
 import org.openide.util.Lookup;
 import org.openide.util.NbBundle;
 import org.openide.util.NbBundle.Messages;
@ -477,13 +474,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
         *
         * @param aFile file to extract strings from, divide into chunks and
         * index
-         * @param detectedFormat mime-type detected, or null if none detected
         *
         * @return true if the file was text_ingested, false otherwise
         *
         * @throws IngesterException exception thrown if indexing failed
         */
-        private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
+        private boolean extractTextAndIndex(AbstractFile aFile) throws IngesterException {
            ImageConfig imageConfig = new ImageConfig();
            imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
            ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
@ -497,8 +493,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                try {
                    Map<String, String> metadata = extractor.getMetadata();
                    CharSource formattedMetadata = getMetaDataCharSource(metadata);
+                    //Append the metadata to the end of the file text
                    finalReader = CharSource.concat(new CharSource() {
-                        //Wrap the TikaReader into a CharSource for concatenation
+                        //Wrap fileText reader for concatenation
                        @Override
                        public Reader openStream() throws IOException {
                            return fileText;
@ -518,11 +515,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
        }

        /**
-         * Format the
+         * Pretty print the text extractor metadata.
         *
-         * @param metadata The Metadata to wrap as a CharSource
+         * @param metadata The Metadata map to wrap as a CharSource
         *
-         * @return A CharSource for the given MetaData
+         * @return A CharSource for the given Metadata
         */
        private CharSource getMetaDataCharSource(Map<String, String> metadata) {
            return CharSource.wrap(new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
@ -633,7 +630,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
                    extractStringsAndIndex(aFile);
                    return;
                }
-                if (!extractTextAndIndex(aFile, fileType)) {
+                if (!extractTextAndIndex(aFile)) {
                    // Text extractor not found for file. Extract string only.
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                } else {