diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java index 1fa93609ea..3ccf669b58 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java @@ -259,6 +259,15 @@ class TikaTextExtractor implements TextExtractor { * This method determines if a passed-in Java char (16 bits) is a valid * UTF-8 printable character, returning true if so, false if not. * + * Note that this method can have ramifications for characters outside the + * Unicode Basic Multilingual Plane (BMP), which require more than 16 bits. + * We are using Java characters (16 bits) to look at the data and this will + * not accurately identify any non-BMP character (larger than 16 bits) + * ending with 0xFFFF or 0xFFFE. In the interest of a fast solution, we + * have chosen to ignore the extended planes above Unicode BMP for the time + * being. The net result of this is that some non-BMP characters may be + * interspersed with '^' characters in Autopsy. + * * @param ch the character to test * * @return Returns true if the character is valid UTF-8, false if not.