Cleaned up code and added comments

This commit is contained in:
U-BASIS\dsmyda 2019-05-07 09:06:53 -04:00
parent 38049bc73c
commit 7669590455
3 changed files with 14 additions and 13 deletions

View File

@ -25,7 +25,6 @@ import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.TreeMap;
import java.util.logging.Level; import java.util.logging.Level;
import net.htmlparser.jericho.Attributes; import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Config; import net.htmlparser.jericho.Config;
@ -86,6 +85,12 @@ final class HtmlTextExtractor implements TextExtractor {
&& file.getSize() <= MAX_SIZE; && file.getSize() <= MAX_SIZE;
} }
/**
* Get the metadata as a key -> value map. HTML metadata will include
* scripts, links, images, comments, and misc attributes.
*
* @return Map containing metadata key -> value pairs.
*/
@Override @Override
public Map<String, String> getMetadata() { public Map<String, String> getMetadata() {
Map<String, String> metadataMap = new HashMap<>(); Map<String, String> metadataMap = new HashMap<>();

View File

@ -33,7 +33,6 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.TreeMap;
import java.util.concurrent.Callable; import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
@ -409,7 +408,7 @@ final class TikaTextExtractor implements TextExtractor {
/** /**
* Get the content metadata * Get the content metadata
* *
* @return Metadata name -> value * @return Metadata as a name -> value map
*/ */
@Override @Override
public Map<String, String> getMetadata() { public Map<String, String> getMetadata() {

View File

@ -28,9 +28,6 @@ import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.tika.metadata.Metadata;
import org.openide.util.Exceptions;
import org.openide.util.Lookup; import org.openide.util.Lookup;
import org.openide.util.NbBundle; import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages; import org.openide.util.NbBundle.Messages;
@ -477,13 +474,12 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
* *
* @param aFile file to extract strings from, divide into chunks and * @param aFile file to extract strings from, divide into chunks and
* index * index
* @param detectedFormat mime-type detected, or null if none detected
* *
* @return true if the file was text_ingested, false otherwise * @return true if the file was text_ingested, false otherwise
* *
* @throws IngesterException exception thrown if indexing failed * @throws IngesterException exception thrown if indexing failed
*/ */
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException { private boolean extractTextAndIndex(AbstractFile aFile) throws IngesterException {
ImageConfig imageConfig = new ImageConfig(); ImageConfig imageConfig = new ImageConfig();
imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption()); imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
ProcessTerminator terminator = () -> context.fileIngestIsCancelled(); ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
@ -497,8 +493,9 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
try { try {
Map<String, String> metadata = extractor.getMetadata(); Map<String, String> metadata = extractor.getMetadata();
CharSource formattedMetadata = getMetaDataCharSource(metadata); CharSource formattedMetadata = getMetaDataCharSource(metadata);
//Append the metadata to end of the file text
finalReader = CharSource.concat(new CharSource() { finalReader = CharSource.concat(new CharSource() {
//Wrap the TikaReader into a CharSource for concatenation //Wrap fileText reader for concatenation
@Override @Override
public Reader openStream() throws IOException { public Reader openStream() throws IOException {
return fileText; return fileText;
@ -518,11 +515,11 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
} }
/** /**
* Format the * Pretty print the text extractor metadata.
* *
* @param metadata The Metadata to wrap as a CharSource * @param metadata The Metadata map to wrap as a CharSource
* *
* @return A CharSource for the given MetaData * @return A CharSource for the given Metadata
*/ */
private CharSource getMetaDataCharSource(Map<String, String> metadata) { private CharSource getMetaDataCharSource(Map<String, String> metadata) {
return CharSource.wrap(new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n") return CharSource.wrap(new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
@ -633,7 +630,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
extractStringsAndIndex(aFile); extractStringsAndIndex(aFile);
return; return;
} }
if (!extractTextAndIndex(aFile, fileType)) { if (!extractTextAndIndex(aFile)) {
// Text extractor not found for file. Extract string only. // Text extractor not found for file. Extract string only.
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT); putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
} else { } else {