THRIFT-2413 Add JSON escaped unicode support for python3.
Client: Python
Patch: Phongphan Phuttha
This closes #686
diff --git a/lib/py/src/protocol/TJSONProtocol.py b/lib/py/src/protocol/TJSONProtocol.py
index e98f4cf..d210bff 100644
--- a/lib/py/src/protocol/TJSONProtocol.py
+++ b/lib/py/src/protocol/TJSONProtocol.py
@@ -249,6 +249,21 @@
def _isLowSurrogate(self, codeunit):
return codeunit >= 0xdc00 and codeunit <= 0xdfff
+ def _toChar(self, high, low=None):
+   """Turn the code unit(s) of a JSON unicode escape into a character.
+
+   high -- integer code unit; used as the code point itself when `low`
+           is omitted, otherwise taken as the high (lead) surrogate.
+   low  -- optional integer low (trail) surrogate paired with `high`.
+
+   Returns a UTF-8 encoded byte string on Python 2 and a one-character
+   str on Python 3.
+   """
+   if low is None:
+     codepoint = high
+   else:
+     # UTF-16 pair decoding: each surrogate contributes its low 10 bits,
+     # and the combined 20-bit value is offset by 0x10000.
+     codepoint = (1 << 16) + ((high & 0x3ff) << 10) + (low & 0x3ff)
+   if sys.version_info[0] == 2:
+     # Python 2 chr() only covers 0-255, so build the character through
+     # an 8-digit unicode escape and hand back UTF-8 bytes.
+     return ("\\U%08x" % codepoint).decode('unicode-escape').encode('utf-8')
+   return chr(codepoint)
+
def readJSONString(self, skipContext):
highSurrogate = None
string = []
@@ -262,26 +277,22 @@
if ord(character) == ESCSEQ0:
character = self.reader.read()
if ord(character) == ESCSEQ1:
- if sys.version_info[0] == 2:
- import json
- character = self.trans.read(4)
- codeunit = int(character, 16)
- if self._isHighSurrogate(codeunit):
- if highSurrogate:
- raise TProtocolException(TProtocolException.INVALID_DATA,
- "Expected low surrogate char")
- highSurrogate = character
- continue
- elif self._isLowSurrogate(codeunit):
- if not highSurrogate:
- raise TProtocolException(TProtocolException.INVALID_DATA,
- "Expected high surrogate char")
- character = json.JSONDecoder().decode('"\\u%s\\u%s"' % (highSurrogate, character)).encode('utf-8')
- highSurrogate = None
- else:
- character = json.JSONDecoder().decode('"\\u%s"' % character).encode('utf-8')
+ character = self.trans.read(4).decode('ascii')
+ codeunit = int(character, 16)
+ if self._isHighSurrogate(codeunit):
+ if highSurrogate:
+ raise TProtocolException(TProtocolException.INVALID_DATA,
+ "Expected low surrogate char")
+ highSurrogate = codeunit
+ continue
+ elif self._isLowSurrogate(codeunit):
+ if not highSurrogate:
+ raise TProtocolException(TProtocolException.INVALID_DATA,
+ "Expected high surrogate char")
+ character = self._toChar(highSurrogate, codeunit)
+ highSurrogate = None
else:
- character = chr(int(self.trans.read(4)))
+ character = self._toChar(codeunit)
else:
if character not in ESCAPE_CHARS:
raise TProtocolException(TProtocolException.INVALID_DATA,
diff --git a/lib/py/test/thrift_json.py b/lib/py/test/thrift_json.py
index cef8870..6d6c8fa 100644
--- a/lib/py/test/thrift_json.py
+++ b/lib/py/test/thrift_json.py
@@ -15,8 +15,8 @@
class TestJSONString(unittest.TestCase):
def test_escaped_unicode_string(self):
- unicode_json = '"hello \\u0e01\\u0e02\\u0e03\\ud835\\udcab unicode"'
- unicode_text = u'hello \u0e01\u0e02\u0e03\U0001D4AB unicode'
+ unicode_json = b'"hello \\u0e01\\u0e02\\u0e03\\ud835\\udcab\\udb40\\udc70 unicode"'
+ unicode_text = u'hello \u0e01\u0e02\u0e03\U0001D4AB\U000E0070 unicode'
buf = TTransport.TMemoryBuffer(unicode_json)
transport = TTransport.TBufferedTransportFactory().getTransport(buf)