From 4c5ef90e480ee9d4dd8a185be7f4305d37f3814b Mon Sep 17 00:00:00 2001 From: Brian Carrier Date: Thu, 16 May 2019 16:04:28 -0400 Subject: [PATCH] don't force newlines in HTML extraction. This was causing problems with NLP use cases --- .../org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java | 1 + 1 file changed, 1 insertion(+) diff --git a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java index d117e80537..80218d039f 100644 --- a/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java +++ b/Core/src/org/sleuthkit/autopsy/textextractors/HtmlTextExtractor.java @@ -198,6 +198,7 @@ final class HtmlTextExtractor implements TextExtractor { renderer.setIncludeHyperlinkURLs(false); renderer.setDecorateFontStyles(false); renderer.setIncludeAlternateText(false); + renderer.setMaxLineLength(0); // don't force wrapping return new StringReader(renderer.toString()); } catch (IOException ex) { logger.log(Level.WARNING, "Error extracting HTML from content.", ex);