THRIFT-765. java: Improved string encoding and decoding performance

This change makes Java's string/utf8 encoding and decoding about 2x faster.

git-svn-id: https://svn.apache.org/repos/asf/incubator/thrift/trunk@937812 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/lib/java/src/org/apache/thrift/Utf8Helper.java b/lib/java/src/org/apache/thrift/Utf8Helper.java
new file mode 100644
index 0000000..e754517
--- /dev/null
+++ b/lib/java/src/org/apache/thrift/Utf8Helper.java
@@ -0,0 +1,194 @@
+package org.apache.thrift;
+
+/**
+ * Hand-written UTF-8 encoder and decoder for {@link String}s.
+ *
+ * <p>Encoding: a valid surrogate pair becomes one four-byte sequence, and a
+ * surrogate without a partner is written as a single {@code '?'} byte.
+ * Decoding: every byte that cannot begin a well-formed, shortest-form sequence
+ * inside the given range is replaced by U+FFFD, and decoding resumes at the
+ * byte that follows it. Neither direction throws on malformed text.
+ */
+public final class Utf8Helper {
+  /** Produced by {@link #decode(byte[], int, int)} for each undecodable byte. */
+  private static final char REPLACEMENT_CHAR = (char) 0xFFFD;
+
+  /** Written by the encoder for a surrogate that is not part of a pair. */
+  private static final byte REPLACEMENT_BYTE = (byte) '?';
+
+  /** Smallest code point a sequence may carry, indexed by its continuation-byte count. */
+  private static final int[] MIN_CODE_POINT = {0, 0x80, 0x800, 0x10000};
+
+  private Utf8Helper() {}
+
+  /**
+   * Computes how many bytes {@link #encode(String, byte[], int)} writes for a string.
+   *
+   * @param s the string to measure; must not be null
+   * @return the encoded length in bytes
+   */
+  public static final int getByteLength(final String s) {
+    final int length = s.length();
+    int byteLength = 0;
+    for (int i = 0; i < length; i++) {
+      final char c = s.charAt(i);
+      if (c <= 0x007F) {
+        byteLength++;
+      } else if (c <= 0x07FF) {
+        byteLength += 2;
+      } else if (!isSurrogate(c)) {
+        byteLength += 3;
+      } else if (startsSurrogatePair(s, i)) {
+        byteLength += 4;
+        i++; // the low surrogate belongs to the same four-byte sequence
+      } else {
+        byteLength++; // unpaired surrogate, encoded as REPLACEMENT_BYTE
+      }
+    }
+    return byteLength;
+  }
+
+  /**
+   * Encodes a string into a new, exactly sized byte array.
+   *
+   * @param s the string to encode; must not be null
+   * @return the UTF-8 bytes of {@code s}
+   */
+  public static byte[] encode(String s) {
+    byte[] buf = new byte[getByteLength(s)];
+    encode(s, buf, 0);
+    return buf;
+  }
+
+  /**
+   * Encodes a string into an existing array.
+   *
+   * @param s the string to encode; must not be null
+   * @param buf destination; needs {@code getByteLength(s)} bytes of room from {@code offset}
+   * @param offset index in {@code buf} of the first byte written
+   * @throws ArrayIndexOutOfBoundsException if the output does not fit; earlier bytes may
+   *     already have been written
+   */
+  public static void encode(String s, byte[] buf, int offset) {
+    final int length = s.length();
+    int pos = offset;
+    for (int i = 0; i < length; i++) {
+      final char c = s.charAt(i);
+      if (c <= 0x007F) {
+        buf[pos++] = (byte) c;
+      } else if (c <= 0x07FF) {
+        buf[pos++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
+        buf[pos++] = (byte) (0x80 | (c & 0x3F));
+      } else if (!isSurrogate(c)) {
+        buf[pos++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
+        buf[pos++] = (byte) (0x80 | ((c >> 6) & 0x3F));
+        buf[pos++] = (byte) (0x80 | (c & 0x3F));
+      } else if (startsSurrogatePair(s, i)) {
+        // Combine the pair so the supplementary code point gets one four-byte sequence.
+        final int codePoint = Character.toCodePoint(c, s.charAt(++i));
+        buf[pos++] = (byte) (0xF0 | ((codePoint >> 18) & 0x07));
+        buf[pos++] = (byte) (0x80 | ((codePoint >> 12) & 0x3F));
+        buf[pos++] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
+        buf[pos++] = (byte) (0x80 | (codePoint & 0x3F));
+      } else {
+        buf[pos++] = REPLACEMENT_BYTE;
+      }
+    }
+  }
+
+  /**
+   * Decodes an entire array.
+   *
+   * @param buf UTF-8 bytes; must not be null
+   * @return the decoded string
+   */
+  public static String decode(byte[] buf) {
+    return decode(buf, 0, buf.length);
+  }
+
+  /**
+   * Decodes {@code byteLength} bytes of {@code buf} starting at {@code offset}.
+   *
+   * @param buf array holding the UTF-8 bytes; must not be null
+   * @param offset index of the first byte to decode
+   * @param byteLength number of bytes to decode
+   * @return the decoded string, with U+FFFD in place of each undecodable byte
+   */
+  public static String decode(byte[] buf, int offset, int byteLength) {
+    final int end = offset + byteLength;
+    // A sequence never yields more UTF-16 units than it has bytes, so this always fits.
+    final char[] chars = new char[byteLength];
+    int charCount = 0;
+    int byteIndex = offset;
+    while (byteIndex < end) {
+      final int lead = buf[byteIndex++] & 0xFF;
+      if (lead < 0x80) {
+        chars[charCount++] = (char) lead;
+        continue;
+      }
+      final int codePoint = decodeMultiByte(buf, byteIndex, end, lead);
+      if (codePoint < 0) {
+        // Only the lead byte is consumed; whatever follows is examined on its own.
+        chars[charCount++] = REPLACEMENT_CHAR;
+      } else {
+        byteIndex += continuationCount(lead);
+        charCount += Character.toChars(codePoint, chars, charCount);
+      }
+    }
+    return new String(chars, 0, charCount);
+  }
+
+  /** Returns whether {@code c} lies in the UTF-16 surrogate range. */
+  private static boolean isSurrogate(char c) {
+    return c >= Character.MIN_SURROGATE && c <= Character.MAX_SURROGATE;
+  }
+
+  /** Returns whether a high surrogate at {@code index} is directly followed by a low one. */
+  private static boolean startsSurrogatePair(String s, int index) {
+    return Character.isHighSurrogate(s.charAt(index))
+        && index + 1 < s.length()
+        && Character.isLowSurrogate(s.charAt(index + 1));
+  }
+
+  /** Returns how many continuation bytes {@code lead} announces, or -1 if it is not a lead byte. */
+  private static int continuationCount(int lead) {
+    if ((lead & 0xE0) == 0xC0) {
+      return 1;
+    }
+    if ((lead & 0xF0) == 0xE0) {
+      return 2;
+    }
+    if ((lead & 0xF8) == 0xF0) {
+      return 3;
+    }
+    return -1;
+  }
+
+  /**
+   * Decodes the multi-byte sequence whose lead byte has already been read.
+   *
+   * @param from index of the first expected continuation byte
+   * @param end index one past the last byte that may be read
+   * @return the code point, or -1 if the sequence is truncated, malformed, overlong,
+   *     a surrogate, or above U+10FFFF
+   */
+  private static int decodeMultiByte(byte[] buf, int from, int end, int lead) {
+    final int extra = continuationCount(lead);
+    if (extra < 0 || end - from < extra) {
+      return -1;
+    }
+    int codePoint = lead & (0x3F >> extra); // payload bits of the lead byte
+    for (int i = from; i < from + extra; i++) {
+      if ((buf[i] & 0xC0) != 0x80) {
+        return -1;
+      }
+      codePoint = (codePoint << 6) | (buf[i] & 0x3F);
+    }
+    final boolean surrogate =
+        codePoint >= Character.MIN_SURROGATE && codePoint <= Character.MAX_SURROGATE;
+    if (codePoint < MIN_CODE_POINT[extra] || surrogate || codePoint > Character.MAX_CODE_POINT) {
+      return -1;
+    }
+    return codePoint;
+  }
+}
diff --git a/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java b/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
index 16c7567..3b4453d 100644
--- a/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
+++ b/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
@@ -19,9 +19,8 @@
 
 package org.apache.thrift.protocol;
 
-import java.io.UnsupportedEncodingException;
-
 import org.apache.thrift.TException;
+import org.apache.thrift.Utf8Helper;
 import org.apache.thrift.transport.TTransport;
 
 /**
@@ -170,13 +169,9 @@
   }
 
   public void writeString(String str) throws TException {
-    try {
-      byte[] dat = str.getBytes("UTF-8");
-      writeI32(dat.length);
-      trans_.write(dat, 0, dat.length);
-    } catch (UnsupportedEncodingException uex) {
-      throw new TException("JVM DOES NOT SUPPORT UTF-8");
-    }
+    byte[] dat = Utf8Helper.encode(str); // UTF-8 bytes of str
+    writeI32(dat.length); // the length prefix is the encoded byte count, not str.length()
+    trans_.write(dat, 0, dat.length);
   }
 
   public void writeBinary(byte[] bin) throws TException {
@@ -323,27 +318,19 @@
     int size = readI32();
 
     if (trans_.getBytesRemainingInBuffer() >= size) {
-      try {
-        String s = new String(trans_.getBuffer(), trans_.getBufferPosition(), size, "UTF-8");
-        trans_.consumeBuffer(size);
-        return s;
-      } catch (UnsupportedEncodingException e) {
-        throw new TException("JVM DOES NOT SUPPORT UTF-8");
-      }
+      String s = Utf8Helper.decode(trans_.getBuffer(), trans_.getBufferPosition(), size);
+      trans_.consumeBuffer(size);
+      return s;
     }
 
     return readStringBody(size);
   }
 
   public String readStringBody(int size) throws TException {
-    try {
-      checkReadLength(size);
-      byte[] buf = new byte[size];
-      trans_.readAll(buf, 0, size);
-      return new String(buf, "UTF-8");
-    } catch (UnsupportedEncodingException uex) {
-      throw new TException("JVM DOES NOT SUPPORT UTF-8");
-    }
+    checkReadLength(size);
+    byte[] buf = new byte[size];
+    trans_.readAll(buf, 0, size);
+    return Utf8Helper.decode(buf); // decodes the whole array, i.e. all size bytes allocated above
   }
 
   public byte[] readBinary() throws TException {
diff --git a/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java b/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
index f497942..f50ef1b 100755
--- a/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
+++ b/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
@@ -20,10 +20,9 @@
 
 package org.apache.thrift.protocol;
 
-import java.io.UnsupportedEncodingException;
-
 import org.apache.thrift.ShortStack;
 import org.apache.thrift.TException;
+import org.apache.thrift.Utf8Helper;
 import org.apache.thrift.transport.TTransport;
 
 /**
@@ -293,11 +292,7 @@
    * Write a string to the wire with a varint size preceeding.
    */
   public void writeString(String str) throws TException {
-    try {
-      writeBinary(str.getBytes("UTF-8"));
-    } catch (UnsupportedEncodingException e) {
-      throw new TException("UTF-8 not supported!");
-    }
+    writeBinary(Utf8Helper.encode(str)); // the string is handed to writeBinary as its UTF-8 bytes
   }
 
   /**
@@ -610,16 +605,12 @@
       return "";
     }
 
-    try {
-      if (trans_.getBytesRemainingInBuffer() >= length) {
-        String str = new String(trans_.getBuffer(), trans_.getBufferPosition(), length, "UTF-8");
-        trans_.consumeBuffer(length);
-        return str;
-      } else {
-        return new String(readBinary(length), "UTF-8");
-      }
-    } catch (UnsupportedEncodingException e) {
-      throw new TException("UTF-8 not supported!");
+    if (trans_.getBytesRemainingInBuffer() >= length) {
+      String str = Utf8Helper.decode(trans_.getBuffer(), trans_.getBufferPosition(), length);
+      trans_.consumeBuffer(length);
+      return str;
+    } else {
+      return Utf8Helper.decode(readBinary(length));
     }
   }
 
diff --git a/lib/java/test/org/apache/thrift/TestUtf8Helper.java b/lib/java/test/org/apache/thrift/TestUtf8Helper.java
new file mode 100644
index 0000000..9d04d5a
--- /dev/null
+++ b/lib/java/test/org/apache/thrift/TestUtf8Helper.java
@@ -0,0 +1,71 @@
+package org.apache.thrift;
+
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+/**
+ * Compares {@link Utf8Helper} with the JDK's own UTF-8 conversion on an
+ * ASCII-only string and on two strings that contain multi-byte characters.
+ */
+public class TestUtf8Helper extends TestCase {
+  private static final String ASCII_TEXT = "here's some text";
+
+  // Raw material for MIXED_TEXT. It holds a stray continuation byte (0x83), and
+  // the tests compare against the JDK's re-encoding of the decoded text, not
+  // against this array.
+  private static final byte[] RAW_MIXED_BYTES = {
+    (byte)0xd3, (byte)0x80, (byte)0xe2, (byte)0x85, (byte)0xae, (byte)0xce,
+    (byte)0x9d, (byte)0x20, (byte)0xd0, (byte)0x9d, (byte)0xce, (byte)0xbf,
+    (byte)0xe2, (byte)0x85, (byte)0xbf, (byte)0xd0, (byte)0xbe, (byte)0xc9,
+    (byte)0xa1, (byte)0xd0, (byte)0xb3, (byte)0xd0, (byte)0xb0, (byte)0xcf,
+    (byte)0x81, (byte)0xe2, (byte)0x84, (byte)0x8e, (byte)0x20, (byte)0xce,
+    (byte)0x91, (byte)0x74, (byte)0x74, (byte)0xce, (byte)0xb1, (byte)0xe2,
+    (byte)0x85, (byte)0xbd, (byte)0xce, (byte)0xba, (byte)0x83, (byte)0xe2,
+    (byte)0x80, (byte)0xbc
+  };
+
+  private static final String CJK_TEXT = "abc\u5639\u563b";
+  private static final byte[] CJK_BYTES = jdkEncode(CJK_TEXT);
+
+  private static final String MIXED_TEXT = jdkDecode(RAW_MIXED_BYTES);
+  private static final byte[] MIXED_BYTES = jdkEncode(MIXED_TEXT);
+
+  /** Encodes with the JDK, turning the checked exception into an unchecked one. */
+  private static byte[] jdkEncode(String text) {
+    try {
+      return text.getBytes("UTF-8");
+    } catch (UnsupportedEncodingException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /** Decodes with the JDK, turning the checked exception into an unchecked one. */
+  private static String jdkDecode(byte[] bytes) {
+    try {
+      return new String(bytes, "UTF-8");
+    } catch (UnsupportedEncodingException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /** Asserts that {@link Utf8Helper#encode(String)} yields exactly {@code expected}. */
+  private static void assertEncodesTo(byte[] expected, String text) {
+    assertTrue(Arrays.equals(expected, Utf8Helper.encode(text)));
+  }
+
+  /** The helper must produce the same bytes as the JDK for every sample. */
+  public void testEncode() throws Exception {
+    assertEncodesTo(ASCII_TEXT.getBytes("UTF-8"), ASCII_TEXT);
+    assertEncodesTo(CJK_BYTES, CJK_TEXT);
+    assertEncodesTo(MIXED_BYTES, MIXED_TEXT);
+  }
+
+  /** The helper must turn the JDK's bytes back into the original text. */
+  public void testDecode() throws Exception {
+    assertEquals(ASCII_TEXT, Utf8Helper.decode(ASCII_TEXT.getBytes("UTF-8")));
+    assertEquals(CJK_TEXT, Utf8Helper.decode(CJK_BYTES));
+    assertEquals(MIXED_TEXT, Utf8Helper.decode(MIXED_BYTES));
+  }
+}