THRIFT-2409 UTF-8 sent by PHP as JSON is not understood by TJSONProtocol
Client: Java
Patch: Phongphan Phuttha <phongphan@acm.org>
This closes #667
diff --git a/lib/java/src/org/apache/thrift/protocol/TJSONProtocol.java b/lib/java/src/org/apache/thrift/protocol/TJSONProtocol.java
index 9876e13..12341ab 100644
--- a/lib/java/src/org/apache/thrift/protocol/TJSONProtocol.java
+++ b/lib/java/src/org/apache/thrift/protocol/TJSONProtocol.java
@@ -19,8 +19,10 @@
package org.apache.thrift.protocol;
+import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
+import java.util.ArrayList;
import java.util.Stack;
import org.apache.thrift.TByteArrayOutputStream;
@@ -640,6 +642,7 @@
private TByteArrayOutputStream readJSONString(boolean skipContext)
throws TException {
TByteArrayOutputStream arr = new TByteArrayOutputStream(DEF_STRING_SIZE);
+ ArrayList<Character> codeunits = new ArrayList<Character>();
if (!skipContext) {
context_.read();
}
@@ -652,10 +655,43 @@
if (ch == ESCSEQ[0]) {
ch = reader_.read();
if (ch == ESCSEQ[1]) {
- readJSONSyntaxChar(ZERO);
- readJSONSyntaxChar(ZERO);
- trans_.readAll(tmpbuf_, 0, 2);
- ch = (byte)((hexVal((byte)tmpbuf_[0]) << 4) + hexVal(tmpbuf_[1]));
+ trans_.readAll(tmpbuf_, 0, 4);
+ // Build one UTF-16 code unit from the four hex digits. It must be a char (unsigned):
+ // a short turns negative for values >= 0x8000 and is then an invalid code point.
+ char cu = (char)(
+     (hexVal(tmpbuf_[0]) << 12) +
+     (hexVal(tmpbuf_[1]) << 8) +
+     (hexVal(tmpbuf_[2]) << 4) +
+     hexVal(tmpbuf_[3]));
+ try {
+   if (Character.isHighSurrogate(cu)) {
+     if (codeunits.size() > 0) {
+       throw new TProtocolException(TProtocolException.INVALID_DATA, "Expected low surrogate char");
+     }
+     codeunits.add(cu); // hold until the matching low surrogate arrives
+   }
+   else if (Character.isLowSurrogate(cu)) {
+     if (codeunits.size() == 0) {
+       throw new TProtocolException(TProtocolException.INVALID_DATA, "Expected high surrogate char");
+     }
+     // Pair complete: emit the supplementary character as UTF-8.
+     arr.write(new String(new char[] { codeunits.get(0), cu }).getBytes("UTF-8"));
+     codeunits.clear();
+   }
+   else {
+     if (codeunits.size() > 0) { // a pending high surrogate must be followed by a low one
+       throw new TProtocolException(TProtocolException.INVALID_DATA, "Expected low surrogate char");
+     }
+     arr.write(String.valueOf(cu).getBytes("UTF-8"));
+   }
+   continue;
+ }
+ catch (UnsupportedEncodingException ex) {
+   throw new TProtocolException(TProtocolException.NOT_IMPLEMENTED, "JVM does not support UTF-8");
+ }
+ catch (IOException ex) {
+   throw new TProtocolException(TProtocolException.INVALID_DATA, "Invalid unicode sequence");
+ }
}
else {
int off = ESCAPE_CHARS.indexOf(ch);
diff --git a/lib/java/test/org/apache/thrift/protocol/TestTJSONProtocol.java b/lib/java/test/org/apache/thrift/protocol/TestTJSONProtocol.java
index d7376ac..1320749 100644
--- a/lib/java/test/org/apache/thrift/protocol/TestTJSONProtocol.java
+++ b/lib/java/test/org/apache/thrift/protocol/TestTJSONProtocol.java
@@ -18,6 +18,12 @@
*/
package org.apache.thrift.protocol;
+import java.io.IOException;
+
+import org.apache.thrift.TException;
+import org.apache.thrift.protocol.TJSONProtocol;
+import org.apache.thrift.transport.TMemoryBuffer;
+
public class TestTJSONProtocol extends ProtocolTestBase {
@Override
protected TProtocolFactory getFactory() {
@@ -28,4 +34,15 @@
protected boolean canBeUsedNaked() {
return false;
}
+
+ public void testEscapedUnicode() throws TException, IOException {
+   // Escaped input holds one BMP character and one surrogate pair (U+1D11E).
+   String wire = "\"hello unicode \\u0e01\\ud834\\udd1e world\"";
+   String decoded = "hello unicode \u0e01\ud834\udd1e world";
+   TMemoryBuffer transport = new TMemoryBuffer(1000);
+   TJSONProtocol reader = new TJSONProtocol(transport);
+   transport.write(wire.getBytes("UTF-8"));
+
+   assertEquals(decoded, reader.readString());
+ }
}