THRIFT-2413: UTF-8 sent by PHP as JSON is not understood by TJsonProtocol
Client: Python
Patch: Phongphan Phuttha
This patch allows readJSONString to decode escaped Unicode strings, including encoded surrogate pairs.
This closes #673
diff --git a/lib/py/src/protocol/TJSONProtocol.py b/lib/py/src/protocol/TJSONProtocol.py
index 3ed8bcb..e98f4cf 100644
--- a/lib/py/src/protocol/TJSONProtocol.py
+++ b/lib/py/src/protocol/TJSONProtocol.py
@@ -243,7 +243,14 @@
raise TProtocolException(TProtocolException.INVALID_DATA,
"Unexpected character: %s" % current)
+ def _isHighSurrogate(self, codeunit):
+ return codeunit >= 0xd800 and codeunit <= 0xdbff
+
+ def _isLowSurrogate(self, codeunit):
+ return codeunit >= 0xdc00 and codeunit <= 0xdfff
+
def readJSONString(self, skipContext):
+ highSurrogate = None
string = []
if skipContext is False:
self.context.read()
@@ -255,7 +262,26 @@
if ord(character) == ESCSEQ0:
character = self.reader.read()
if ord(character) == ESCSEQ1:
- character = chr(int(self.trans.read(4)))
+ if sys.version_info[0] == 2:
+ import json
+ character = self.trans.read(4)
+ codeunit = int(character, 16)
+ if self._isHighSurrogate(codeunit):
+ if highSurrogate:
+ raise TProtocolException(TProtocolException.INVALID_DATA,
+ "Expected low surrogate char")
+ highSurrogate = character
+ continue
+ elif self._isLowSurrogate(codeunit):
+ if not highSurrogate:
+ raise TProtocolException(TProtocolException.INVALID_DATA,
+ "Expected high surrogate char")
+ character = json.JSONDecoder().decode('"\\u%s\\u%s"' % (highSurrogate, character)).encode('utf-8')
+ highSurrogate = None
+ else:
+ character = json.JSONDecoder().decode('"\\u%s"' % character).encode('utf-8')
+ else:
+ character = chr(int(self.trans.read(4)))
else:
if character not in ESCAPE_CHARS:
raise TProtocolException(TProtocolException.INVALID_DATA,
@@ -270,6 +296,10 @@
utf8_bytes.append(ord(self.reader.read()))
character = utf8_bytes.decode('utf8')
string.append(character)
+
+ if highSurrogate:
+ raise TProtocolException(TProtocolException.INVALID_DATA,
+ "Expected low surrogate char")
return ''.join(string)
def isJSONNumeric(self, character):
diff --git a/lib/py/test/thrift_json.py b/lib/py/test/thrift_json.py
new file mode 100644
index 0000000..cef8870
--- /dev/null
+++ b/lib/py/test/thrift_json.py
@@ -0,0 +1,31 @@
+from thrift import Thrift
+from thrift.protocol.TJSONProtocol import TJSONProtocol
+from thrift.transport import TTransport
+
+import sys
+import unittest
+
+#
+# To run this test under Windows, we need to create a symbolic link
+# named 'thrift' that points to the '../src' folder by using:
+#
+# mklink /D thrift ..\src
+#
+
+class TestJSONString(unittest.TestCase):
+
+ def test_escaped_unicode_string(self):
+ unicode_json = '"hello \\u0e01\\u0e02\\u0e03\\ud835\\udcab unicode"'
+ unicode_text = u'hello \u0e01\u0e02\u0e03\U0001D4AB unicode'
+
+ buf = TTransport.TMemoryBuffer(unicode_json)
+ transport = TTransport.TBufferedTransportFactory().getTransport(buf)
+ protocol = TJSONProtocol(transport)
+
+ if sys.version_info[0] == 2:
+ unicode_text = unicode_text.encode('utf8')
+ self.assertEqual(protocol.readString(), unicode_text)
+
+if __name__ == '__main__':
+ unittest.main()
+