THRIFT-2413 Add JSON escaped unicode support for python3.
Client: Python
Patch: Phongphan Phuttha

This closes #686
diff --git a/lib/py/src/protocol/TJSONProtocol.py b/lib/py/src/protocol/TJSONProtocol.py
index e98f4cf..d210bff 100644
--- a/lib/py/src/protocol/TJSONProtocol.py
+++ b/lib/py/src/protocol/TJSONProtocol.py
@@ -249,6 +249,21 @@
   def _isLowSurrogate(self, codeunit):
     return codeunit >= 0xdc00 and codeunit <= 0xdfff
 
+  def _toChar(self, high, low=None):  # build one character from a single code unit, or from a high/low surrogate pair
+    if not low:  # `low` omitted (or falsy): `high` alone is used as the code point
+      if sys.version_info[0] == 2:
+        return ("\\u%04x" % high).decode('unicode-escape').encode('utf-8')  # Python 2: result is UTF-8 encoded
+      else:
+        return chr(high)  # Python 3: result is a one-character str
+    else:
+      codepoint = (1 << 16) + ((high & 0x3ff) << 10)  # 0x10000 plus the low 10 bits of `high`, shifted up by 10
+      codepoint += low & 0x3ff  # the low 10 bits of `low` fill the bottom 10 bits
+      if sys.version_info[0] == 2:
+        s = "\\U%08x" % codepoint  # 8-digit \U escape: the combined value is always >= 0x10000
+        return s.decode('unicode-escape').encode('utf-8')  # Python 2: result is UTF-8 encoded
+      else:
+        return chr(codepoint)  # Python 3: result is a one-character str
+
   def readJSONString(self, skipContext):
     highSurrogate = None
     string = []
@@ -262,26 +277,22 @@
       if ord(character) == ESCSEQ0:
         character = self.reader.read()
         if ord(character) == ESCSEQ1:
-          if sys.version_info[0] == 2:
-            import json
-            character = self.trans.read(4)
-            codeunit = int(character, 16)
-            if self._isHighSurrogate(codeunit):
-              if highSurrogate:
-                raise TProtocolException(TProtocolException.INVALID_DATA,
-                                         "Expected low surrogate char")
-              highSurrogate = character
-              continue
-            elif self._isLowSurrogate(codeunit):
-              if not highSurrogate:
-                raise TProtocolException(TProtocolException.INVALID_DATA,
-                                         "Expected high surrogate char")
-              character = json.JSONDecoder().decode('"\\u%s\\u%s"' % (highSurrogate, character)).encode('utf-8')
-              highSurrogate = None
-            else:
-              character = json.JSONDecoder().decode('"\\u%s"' % character).encode('utf-8')
+          character = self.trans.read(4).decode('ascii')
+          codeunit = int(character, 16)
+          if self._isHighSurrogate(codeunit):
+            if highSurrogate:
+              raise TProtocolException(TProtocolException.INVALID_DATA,
+                                       "Expected low surrogate char")
+            highSurrogate = codeunit
+            continue
+          elif self._isLowSurrogate(codeunit):
+            if not highSurrogate:
+              raise TProtocolException(TProtocolException.INVALID_DATA,
+                                       "Expected high surrogate char")
+            character = self._toChar(highSurrogate, codeunit)
+            highSurrogate = None
           else:
-              character = chr(int(self.trans.read(4)))
+            character = self._toChar(codeunit)
         else:
           if character not in ESCAPE_CHARS:
             raise TProtocolException(TProtocolException.INVALID_DATA,
diff --git a/lib/py/test/thrift_json.py b/lib/py/test/thrift_json.py
index cef8870..6d6c8fa 100644
--- a/lib/py/test/thrift_json.py
+++ b/lib/py/test/thrift_json.py
@@ -15,8 +15,8 @@
 class TestJSONString(unittest.TestCase):
 
   def test_escaped_unicode_string(self):
-    unicode_json = '"hello \\u0e01\\u0e02\\u0e03\\ud835\\udcab unicode"'
-    unicode_text = u'hello \u0e01\u0e02\u0e03\U0001D4AB unicode'
+    unicode_json = b'"hello \\u0e01\\u0e02\\u0e03\\ud835\\udcab\\udb40\\udc70 unicode"'
+    unicode_text = u'hello \u0e01\u0e02\u0e03\U0001D4AB\U000E0070 unicode'
 
     buf = TTransport.TMemoryBuffer(unicode_json)
     transport = TTransport.TBufferedTransportFactory().getTransport(buf)