THRIFT-2411 - C++: Fixed support for UTF-16 encoding in JSON protocol

Client: C++
Patch: Phongphan Phuttha <phongphan@acm.org>

Support unicode-encoded character including UTF-16 surrogate pair which
allow extended character that is not in Basic Multilingual Plane.

This closes #648
diff --git a/lib/cpp/src/thrift/protocol/TJSONProtocol.cpp b/lib/cpp/src/thrift/protocol/TJSONProtocol.cpp
index 6865fdc..f1370bb 100644
--- a/lib/cpp/src/thrift/protocol/TJSONProtocol.cpp
+++ b/lib/cpp/src/thrift/protocol/TJSONProtocol.cpp
@@ -26,6 +26,7 @@
 
 #include <boost/math/special_functions/fpclassify.hpp>
 #include <boost/lexical_cast.hpp>
+#include <boost/locale.hpp>
 
 #include <thrift/protocol/TBase64Utils.h>
 #include <thrift/transport/TTransportException.h>
@@ -285,6 +286,16 @@
   return false;
 }
 
+// Return true if the code unit is high surrogate
+static bool isHighSurrogate(uint16_t val) {
+  return val >= 0xD800 && val <= 0xDBFF;
+}
+
+// Return true if the code unit is low surrogate
+static bool isLowSurrogate(uint16_t val) {
+  return val >= 0xDC00 && val <= 0xDFFF;
+}
+
 /**
  * Class to serve as base JSON context and as base class for other context
  * implementations
@@ -709,14 +720,17 @@
 }
 
 // Decodes the four hex parts of a JSON escaped string character and returns
-// the character via out. The first two characters must be "00".
-uint32_t TJSONProtocol::readJSONEscapeChar(uint8_t* out) {
-  uint8_t b[2];
-  readJSONSyntaxChar(kJSONZeroChar);
-  readJSONSyntaxChar(kJSONZeroChar);
+// the UTF-16 code unit via out.
+uint32_t TJSONProtocol::readJSONEscapeChar(uint16_t* out) {
+  uint8_t b[4];
   b[0] = reader_.read();
   b[1] = reader_.read();
-  *out = (hexVal(b[0]) << 4) + hexVal(b[1]);
+  b[2] = reader_.read();
+  b[3] = reader_.read();
+
+  *out = (hexVal(b[0]) << 12)
+    + (hexVal(b[1]) << 8) + (hexVal(b[2]) << 4) + hexVal(b[3]);
+
   return 4;
 }
 
@@ -724,6 +738,7 @@
 uint32_t TJSONProtocol::readJSONString(std::string& str, bool skipContext) {
   uint32_t result = (skipContext ? 0 : context_->read(reader_));
   result += readJSONSyntaxChar(kJSONStringDelimiter);
+  std::vector<uint16_t> codeunits;
   uint8_t ch;
   str.clear();
   while (true) {
@@ -736,7 +751,22 @@
       ch = reader_.read();
       ++result;
       if (ch == kJSONEscapeChar) {
-        result += readJSONEscapeChar(&ch);
+        uint16_t cp;
+        result += readJSONEscapeChar(&cp);
+        if (isHighSurrogate(cp)) {
+          codeunits.push_back(cp);
+        } else {
+          if (isLowSurrogate(cp)
+               && codeunits.empty()) {
+            throw TProtocolException(TProtocolException::INVALID_DATA,
+                                     "Missing UTF-16 high surrogate pair.");
+          }
+          codeunits.push_back(cp);
+          codeunits.push_back(0);
+          str += boost::locale::conv::utf_to_utf<char>(codeunits.data());
+          codeunits.clear();
+        }
+        continue;
       } else {
         size_t pos = kEscapeChars.find(ch);
         if (pos == std::string::npos) {
@@ -747,8 +777,17 @@
         ch = kEscapeCharVals[pos];
       }
     }
+    if (!codeunits.empty()) {
+      throw TProtocolException(TProtocolException::INVALID_DATA,
+                               "Missing UTF-16 low surrogate pair.");
+    }
     str += ch;
   }
+
+  if (!codeunits.empty()) {
+    throw TProtocolException(TProtocolException::INVALID_DATA,
+                             "Missing UTF-16 low surrogate pair.");
+  }
   return result;
 }
 
diff --git a/lib/cpp/src/thrift/protocol/TJSONProtocol.h b/lib/cpp/src/thrift/protocol/TJSONProtocol.h
index 80d68a4..5834eff 100644
--- a/lib/cpp/src/thrift/protocol/TJSONProtocol.h
+++ b/lib/cpp/src/thrift/protocol/TJSONProtocol.h
@@ -127,7 +127,7 @@
 
   uint32_t readJSONSyntaxChar(uint8_t ch);
 
-  uint32_t readJSONEscapeChar(uint8_t* out);
+  uint32_t readJSONEscapeChar(uint16_t* out);
 
   uint32_t readJSONString(std::string& str, bool skipContext = false);
 
diff --git a/lib/cpp/test/JSONProtoTest.cpp b/lib/cpp/test/JSONProtoTest.cpp
index f03b2ca..e9fe8b5 100644
--- a/lib/cpp/test/JSONProtoTest.cpp
+++ b/lib/cpp/test/JSONProtoTest.cpp
@@ -19,6 +19,8 @@
 
 #define _USE_MATH_DEFINES
 #include <cmath>
+#include <iomanip>
+#include <sstream>
 #include <thrift/transport/TBufferTransports.h>
 #include <thrift/protocol/TJSONProtocol.h>
 #include "gen-cpp/DebugProtoTest_types.h"
@@ -269,3 +271,70 @@
   BOOST_CHECK_THROW(ooe2.read(proto.get()),
     apache::thrift::protocol::TProtocolException);
 }
+
+static std::string toHexSequence(const std::string& str) {
+  std::stringstream ss;
+  ss << std::hex << std::setfill('0');
+  for (std::size_t i = 0; i < str.size(); i++) {
+    ss << "\\x" << int(uint8_t(str[i]));
+  }
+  return ss.str();
+}
+
+BOOST_AUTO_TEST_CASE(test_json_unicode_escaped) {
+  const char json_string[] =
+  "{\"1\":{\"tf\":1},\"2\":{\"tf\":0},\"3\":{\"i8\":127},\"4\":{\"i16\":27000},"
+  "\"5\":{\"i32\":16},\"6\":{\"i64\":6000000000},\"7\":{\"dbl\":3.1415926"
+  "53589793},\"8\":{\"str\":\"JSON THIS!\"},\"9\":{\"str\":\"\\u0e01 \\ud835\\udd3e\"},"
+  "\"10\":{\"tf\":0},\"11\":{\"str\":\"000000\"},\"12\":{\"lst\""
+  ":[\"i8\",3,1,2,3]},\"13\":{\"lst\":[\"i16\",3,1,2,3]},\"14\":{\"lst\":[\"i64"
+  "\",3,1,2,3]}}";
+  const char* expected_zomg_unicode = "\xe0\xb8\x81 \xf0\x9d\x94\xbe";
+
+  boost::shared_ptr<TMemoryBuffer> buffer(new TMemoryBuffer(
+    (uint8_t*)(json_string), sizeof(json_string)));
+  boost::shared_ptr<TJSONProtocol> proto(new TJSONProtocol(buffer));
+
+  OneOfEach ooe2;
+  ooe2.read(proto.get());
+  BOOST_CHECK_MESSAGE(!ooe2.zomg_unicode.compare(expected_zomg_unicode),
+    "Expected:\n" << toHexSequence(expected_zomg_unicode) << "\nGotten:\n"
+                  << toHexSequence(ooe2.zomg_unicode));
+
+}
+
+BOOST_AUTO_TEST_CASE(test_json_unicode_escaped_missing_low_surrogate) {
+  const char json_string[] =
+  "{\"1\":{\"tf\":1},\"2\":{\"tf\":0},\"3\":{\"i8\":127},\"4\":{\"i16\":27000},"
+  "\"5\":{\"i32\":16},\"6\":{\"i64\":6000000000},\"7\":{\"dbl\":3.1415926"
+  "53589793},\"8\":{\"str\":\"JSON THIS!\"},\"9\":{\"str\":\"\\ud835\"},"
+  "\"10\":{\"tf\":0},\"11\":{\"str\":\"000000\"},\"12\":{\"lst\""
+  ":[\"i8\",3,1,2,3]},\"13\":{\"lst\":[\"i16\",3,1,2,3]},\"14\":{\"lst\":[\"i64"
+  "\",3,1,2,3]}}";
+
+  boost::shared_ptr<TMemoryBuffer> buffer(new TMemoryBuffer(
+    (uint8_t*)(json_string), sizeof(json_string)));
+  boost::shared_ptr<TJSONProtocol> proto(new TJSONProtocol(buffer));
+
+  OneOfEach ooe2;
+  BOOST_CHECK_THROW(ooe2.read(proto.get()),
+    apache::thrift::protocol::TProtocolException);
+}
+
+BOOST_AUTO_TEST_CASE(test_json_unicode_escaped_missing_hi_surrogate) {
+  const char json_string[] =
+  "{\"1\":{\"tf\":1},\"2\":{\"tf\":0},\"3\":{\"i8\":127},\"4\":{\"i16\":27000},"
+  "\"5\":{\"i32\":16},\"6\":{\"i64\":6000000000},\"7\":{\"dbl\":3.1415926"
+  "53589793},\"8\":{\"str\":\"JSON THIS!\"},\"9\":{\"str\":\"\\udd3e\"},"
+  "\"10\":{\"tf\":0},\"11\":{\"str\":\"000000\"},\"12\":{\"lst\""
+  ":[\"i8\",3,1,2,3]},\"13\":{\"lst\":[\"i16\",3,1,2,3]},\"14\":{\"lst\":[\"i64"
+  "\",3,1,2,3]}}";
+
+  boost::shared_ptr<TMemoryBuffer> buffer(new TMemoryBuffer(
+    (uint8_t*)(json_string), sizeof(json_string)));
+  boost::shared_ptr<TJSONProtocol> proto(new TJSONProtocol(buffer));
+
+  OneOfEach ooe2;
+  BOOST_CHECK_THROW(ooe2.read(proto.get()),
+    apache::thrift::protocol::TProtocolException);
+}